In [18]:
import pandas as pd
# Load the dataset and keep only the first 10,000 rows.
# NOTE(review): presumably truncated to keep SVC training time manageable — confirm.
df = pd.read_csv('smoking.csv').iloc[:10000]

In [19]:
# Remove the columns that are not used as model inputs.
# Note: this cell is not re-runnable on its own output — a second run raises
# KeyError because the dropped columns no longer exist.
df = df.drop(columns=['ID', 'height(cm)', 'oral'])

# Binary-encode the two categorical columns: 'Y' / 'M' become 1, every other
# value (including missing values) becomes 0.
df['tartar'] = (df['tartar'] == 'Y').astype('int64')
df['gender'] = (df['gender'] == 'M').astype('int64')

In [20]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
# NOTE(review): assumes the label is the final column of smoking.csv — confirm.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [21]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature to zero mean and unit variance before training.
# SVMs are not scale invariant: with raw features the columns with the largest
# numeric range dominate the margin, and the solver converges much more slowly.
# The scaler is fitted on the training split only, so no statistics from the
# test split leak into the model; the test split is transformed with the
# training statistics.
# Note: both results are NumPy arrays, so column names are no longer attached.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# A larger C weakens regularization (training errors are penalized more
# heavily), which increases the risk of overfitting.
clf = svm.SVC(
    kernel='linear',
    C=100,
    probability=True,  # enables predict_proba, which the ROC cell relies on
    random_state=42,
).fit(X_train, y_train)

# Predict on the held-out test set, i.e. data the model has not seen (inference).
y_pred = clf.predict(X_test)

In [None]:
# Evaluate the test-set predictions: overall accuracy, per-class report and
# confusion matrix. All three are computed from the same y_test / y_pred pair.
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuratezza: {accuracy}\n\n")
print(classification_report(y_test, y_pred))
print("\n\nConfusion matrix:\n", cm)

In [None]:
# support_vectors_ holds one row per support vector, so the row count is the total.
n_support_vectors = clf.support_vectors_.shape[0]
print(f"Number of support vectors: {n_support_vectors}")

In [None]:
# Compare training and test accuracy to spot over- or under-fitting.
# The training-set predictions get their own names so that the test-set
# `y_pred` and `accuracy` from the evaluation cell are not overwritten;
# overwriting them made that cell fail (or report nonsense) when re-run.
y_pred_train = clf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)

# For a classifier, clf.score(X, y) is the mean accuracy, so the value computed
# above is reused instead of predicting on the training set a second time.
print('Training set score: {:.4f}'.format(train_accuracy))

print('Test set score: {:.4f}'.format(clf.score(X_test, y_test)))

In [None]:
import seaborn as sns

# Label the confusion matrix for plotting.
# scikit-learn's confusion_matrix puts the TRUE labels on the rows and the
# PREDICTED labels on the columns, both in sorted label order, so row/column 0
# is the negative class and row/column 1 is the positive class.
# NOTE(review): assumes the target is encoded as 0 = negative, 1 = positive — confirm.
cm_matrix = pd.DataFrame(
    data=cm,
    index=['Actual Negative:0', 'Actual Positive:1'],
    columns=['Predicted Negative:0', 'Predicted Positive:1'],
)

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
# Unpack the 2x2 confusion matrix into its four counts.
# scikit-learn layout: rows are true labels, columns are predicted labels, in
# sorted label order, i.e. [[TN, FP], [FN, TP]] for labels 0 (negative) and
# 1 (positive). The previous assignment had TP and TN swapped.
TN = cm[0, 0]  # actual 0, predicted 0
FP = cm[0, 1]  # actual 0, predicted 1
FN = cm[1, 0]  # actual 1, predicted 0
TP = cm[1, 1]  # actual 1, predicted 1

In [None]:
# Classification accuracy: the share of all test predictions that were correct.
n_predictions = float(TP + TN + FP + FN)
classification_accuracy = (TP + TN) / n_predictions

print('Classification accuracy : {0:0.4f}'.format(classification_accuracy))

In [None]:
# Classification error: the share of all test predictions that were wrong.
n_wrong = FP + FN
classification_error = n_wrong / float(TP + TN + FP + FN)

print('Classification error : {0:0.4f}'.format(classification_error))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Column 1 of predict_proba is the probability of the second class in
# clf.classes_; it is used as the score for the ROC curve.
y_pred_prob = clf.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, linewidth=2)
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a reference line

# Set after the plot calls, matching the original ordering: the title and axis
# labels created below pick up the 12 pt default. This is a global setting and
# also affects figures created later in the session.
plt.rcParams['font.size'] = 12

ax.set_title('ROC curve')
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')

plt.show()

In [None]:
# compute ROC AUC

from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed from the same positive-class scores
# (y_pred_prob) that were used to draw the curve.
ROC_AUC = roc_auc_score(y_true=y_test, y_score=y_pred_prob)

print('ROC AUC : {:.4f}'.format(ROC_AUC))