# Demo: Binary Classification

In [1]:
import os

import pandas as pd

# Location of diabetes.csv. Set the DIABETES_CSV environment variable to point
# at your own copy of the file; when it is not set, the original hard-coded
# path below is used (edit it if the file lives elsewhere on your computer).
path = os.environ.get("DIABETES_CSV", "C:\\Users\\Duo\\Downloads\\diabetes.csv")

# Read the CSV into a DataFrame with pandas' read_csv and preview the first rows.
df = pd.read_csv(path)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Feature selection: columns can be chosen manually, recursively, or via PCA.
# Here all eight predictor columns are used; a smaller manual alternative
# would be ['BMI', 'Age'].
feature_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

In [3]:
from sklearn.model_selection import train_test_split

# Feature matrix: the selected predictor columns; target vector: the Outcome column.
X = df[feature_names]
y = df["Outcome"]

# Hold out a test set using train_test_split's default split size;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [4]:
from sklearn.linear_model import LogisticRegression

# Build a logistic-regression classifier with default settings and train it
# on the training split (fit returns the fitted estimator itself).
logreg = LogisticRegression().fit(X_train, y_train)

# Predict class labels for the held-out test split.
predictions = logreg.predict(X_test)

In [5]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions
# (rows: true classes, columns: predicted classes).
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[119  11]
 [ 26  36]]


In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Overall accuracy, plus per-class precision and recall
# (average=None yields one score per class instead of a single average).
logreg_accuracy_score = accuracy_score(y_true=y_test, y_pred=predictions)
logreg_precision_score = precision_score(y_true=y_test, y_pred=predictions, average=None)
logreg_recall_score = recall_score(y_true=y_test, y_pred=predictions, average=None)

# One metric per output line: accuracy, then precision, then recall.
print(logreg_accuracy_score, logreg_precision_score, logreg_recall_score, sep="\n")

0.8072916666666666
[0.82068966 0.76595745]
[0.91538462 0.58064516]


In [7]:
from sklearn.metrics import classification_report

# Text table with per-class precision, recall, F1 score and support.
print(classification_report(y_true=y_test, y_pred=predictions))

             precision    recall  f1-score   support

          0       0.82      0.92      0.87       130
          1       0.77      0.58      0.66        62

avg / total       0.80      0.81      0.80       192



# Demo: Multi-Class Classification

In [8]:
# Load the wine dataset (the same data as in lecture 2)
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

wine = load_wine()

# Create a DataFrame for the features, with human-readable column names
df = pd.DataFrame(wine.data,
                  columns=['Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
                           'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
                           'Proanthocyanins', 'Color intensity', 'Hue',
                           'OD280/OD315 of diluted wines', 'Proline'])

# Attach the class labels. Assigning the array directly avoids wrapping it in
# a throwaway DataFrame and relying on index alignment to place the values.
df['targets'] = wine.target

# Features are every column except the label column. Selecting by name rather
# than by a hard-coded position (:13) stays correct if the column list changes.
X = df.drop('targets', axis=1)
y = df['targets']

# Split into train and test sets (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [9]:
# Support vector machine with an RBF kernel
from sklearn.svm import SVC

# Build the classifier and train it on the training split (fit returns the
# fitted estimator). The C and gamma values used here are the ones the grid
# search further down reports as best.
rbf_SVC = SVC(kernel='rbf', C=10000, gamma=1e-6, random_state=0).fit(X_train, y_train)

# Predict labels for the held-out test split.
predictions = rbf_SVC.predict(X_test)

print("confusion matrix: \n", confusion_matrix(y_test, predictions))
print("\n")

# Overall accuracy, plus per-class precision and recall (average=None).
rbf_SVC_accuracy_score = accuracy_score(y_test, predictions)
rbf_SVC_precision_score = precision_score(y_test, predictions, average=None)
rbf_SVC_recall_score = recall_score(y_test, predictions, average=None)

print(
    "accuracy: {}\nprecision score: {}\nrecall score: {}".format(
        rbf_SVC_accuracy_score,
        rbf_SVC_precision_score,
        rbf_SVC_recall_score,
    )
)
print("\n")
print("classification report:\n", classification_report(y_test, predictions))

confusion matrix: 
 [[15  1  0]
 [ 0 21  0]
 [ 0  1  7]]


accuracy: 0.9555555555555556
precision score: [1.         0.91304348 1.        ]
recall score: [0.9375 1.     0.875 ]


classification report:
              precision    recall  f1-score   support

          0       1.00      0.94      0.97        16
          1       0.91      1.00      0.95        21
          2       1.00      0.88      0.93         8

avg / total       0.96      0.96      0.96        45



In [10]:
# Grid search with cross-validation over the SVC hyper-parameters
from sklearn.model_selection import GridSearchCV

# Candidate values; every (C, gamma) combination is evaluated.
parameters = {
    'C': [0.01, 0.1, 1, 10, 100, 1000, 10000, 100000],
    'gamma': [1e-7, 1e-6, 1e-5, 1e-4, 0.001, 0.01, 0.1],
}

# Base estimator: an SVC with default settings.
RBF_SVC = SVC()

# Score each candidate by accuracy using 5-fold cross-validation.
grid_search = GridSearchCV(estimator=RBF_SVC,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=5)
grid_search.fit(X_train, y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

# Best cross-validated accuracy, followed by the parameters that achieved it
print(best_accuracy, best_parameters, sep="\n")

0.9398496240601504
{'C': 10000, 'gamma': 1e-06}


In [11]:
# Gaussian naive Bayes
from sklearn.naive_bayes import GaussianNB

# Train on the training split (fit returns the fitted estimator).
clf = GaussianNB().fit(X_train, y_train)

# Evaluate on the held-out test split.
predictions = clf.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[16  0  0]
 [ 2 18  1]
 [ 0  0  8]]


In [12]:
# Decision tree classifier
from sklearn import tree

# Depth-limited tree (max_depth=4) with a fixed seed, trained on the
# training split (fit returns the fitted estimator).
clf = tree.DecisionTreeClassifier(
    max_depth=4,
    random_state=0,
).fit(X_train, y_train)

# Evaluate on the held-out test split.
predictions = clf.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[14  2  0]
 [ 0 20  1]
 [ 0  0  8]]


In [15]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

# 200 trees, each limited to depth 3, with a fixed seed; trained on the
# training split (fit returns the fitted estimator).
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=3,
    random_state=0,
).fit(X_train, y_train)

# Evaluate on the held-out test split.
predictions = clf.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[16  0  0]
 [ 1 19  1]
 [ 0  0  8]]


In [14]:
# Gradient-boosted decision trees
from sklearn.ensemble import GradientBoostingClassifier

# 200 boosting stages of depth-3 trees with a learning rate of 0.01 and a
# fixed seed; trained on the training split (fit returns the fitted estimator).
clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.01,
    max_depth=3,
    random_state=0,
).fit(X_train, y_train)

# Evaluate on the held-out test split.
predictions = clf.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[16  0  0]
 [ 1 19  1]
 [ 0  0  8]]
