In [18]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn import metrics as met

In [71]:
pd.set_option('display.max_columns', None)

# 1. Iris dataset

In [14]:
iris = pd.read_csv('data/iris.data', header=None)
iris.columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target']

In [15]:
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [17]:
X = iris.drop('target', axis = 1)
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [68]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn import metrics as met
import pandas as pd

def create_pipeline(estimator):
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('estimator', estimator)
    ])
    return pipeline

def predict(pipeline, X_train, y_train, X_test):
    pipeline.fit(X_train, y_train)
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)
    return y_pred_train, y_pred_test

def calc_confusion_matrix(y_true, y_pred):
    cm = met.confusion_matrix(y_true, y_pred)
    TP = cm.diagonal()
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (TP + FP + FN)
    return TP, FP, FN, TN

def cross_validate(pipeline, X_train, y_train):
    cv_accuracy = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=10)
    cv_f1 = cross_val_score(pipeline, X_train, y_train, scoring='f1_weighted', cv=10)
    return cv_accuracy, cv_f1

def generate_scores_df(estimators, X_train, y_train, X_test, y_test):
    all_est_df = pd.DataFrame()
    class_reports = {}
    for estimator in estimators:
        pipeln = create_pipeline(estimator)
        y_pred_train, y_pred_test = predict(pipeln, X_train, y_train, X_test)
        
        TP_train, FP_train, FN_train, TN_train = calc_confusion_matrix(y_train, y_pred_train)
        TP_test, FP_test, FN_test, TN_test = calc_confusion_matrix(y_test, y_pred_test)
        
        cv_accuracy, cv_f1 = cross_validate(pipeln, X_train, y_train)
        
        scores = {'estimator':estimator,
                  'TP train - setosa': TP_train[0],
                  'FP train - setosa': FP_train[0],
                  'FN train - setosa': FN_train[0],
                  'TN train - setosa': TN_train[0],
                  'TP train - versicolor': TP_train[1],
                  'FP train - versicolor': FP_train[1],
                  'FN train - versicolor': FN_train[1],
                  'TN train - versicolor': TN_train[1],
                  'TP train - virginica': TP_train[2],
                  'FP train - virginica': FP_train[2],
                  'FN train - virginica': FN_train[2],
                  'TN train - virginica': TN_train[2],
                  'TP test - setosa': TP_test[0],
                  'FP test - setosa': FP_test[0],
                  'FN test - setosa': FN_test[0],
                  'TN test - setosa': TN_test[0],
                  'TP test - versicolor': TP_test[1],
                  'FP test - versicolor': FP_test[1],
                  'FN test - versicolor': FN_test[1],
                  'TN test - versicolor': TN_test[1],
                  'TP test - virginica': TP_test[2],
                  'FP test - virginica': FP_test[2],
                  'FN test - virginica': FN_test[2],
                  'TN test - virginica': TN_test[2],
                  'CV accuracy': cv_accuracy.mean(),
                  'CV f1': cv_f1.mean()}
        
        scores_df = pd.DataFrame(scores, index=[0])
        all_est_df = pd.concat([all_est_df, scores_df], axis=0)
        
        class_reports[estimator] = met.classification_report(y_test, y_pred_test)
        
    return all_est_df, class_reports

In [69]:
estimators = [DecisionTreeClassifier(max_depth=4), SVC(), KNeighborsClassifier(n_neighbors=5),
             GaussianNB(), MLPClassifier(max_iter=500)]
results, class_reports = generate_scores_df(estimators, X_train, y_train, X_test, y_test)



In [74]:
results

Unnamed: 0,estimator,TP train - setosa,FP train - setosa,FN train - setosa,TN train - setosa,TP train - versicolor,FP train - versicolor,FN train - versicolor,TN train - versicolor,TP train - virginica,FP train - virginica,FN train - virginica,TN train - virginica,TP test - setosa,FP test - setosa,FN test - setosa,TN test - setosa,TP test - versicolor,FP test - versicolor,FN test - versicolor,TN test - versicolor,TP test - virginica,FP test - virginica,FN test - virginica,TN test - virginica,CV accuracy,CV f1
0,DecisionTreeClassifier(max_depth=4),38,0,0,74,36,0,1,75,37,1,0,74,12,0,0,26,12,0,1,25,13,1,0,24,0.946212,0.943131
0,SVC(),38,0,0,74,35,2,2,73,35,2,2,73,12,0,0,26,12,1,1,24,12,1,1,24,0.963636,0.963088
0,KNeighborsClassifier(),38,0,0,74,36,1,1,74,36,1,1,74,12,0,0,26,13,3,0,22,10,0,3,25,0.945455,0.942179
0,GaussianNB(),38,0,0,74,36,2,1,73,35,1,2,74,12,0,0,26,12,2,1,23,11,1,2,24,0.955303,0.954622
0,MLPClassifier(max_iter=500),38,0,0,74,36,1,1,74,36,1,1,74,11,0,1,26,12,1,1,24,13,1,0,24,0.954545,0.951674


In [82]:
for k,v in class_reports.items():
    print(k)
    print(v)
    print('-'*60)

DecisionTreeClassifier(max_depth=4)
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        12
Iris-versicolor       1.00      0.92      0.96        13
 Iris-virginica       0.93      1.00      0.96        13

       accuracy                           0.97        38
      macro avg       0.98      0.97      0.97        38
   weighted avg       0.98      0.97      0.97        38

------------------------------------------------------------
SVC()
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        12
Iris-versicolor       0.92      0.92      0.92        13
 Iris-virginica       0.92      0.92      0.92        13

       accuracy                           0.95        38
      macro avg       0.95      0.95      0.95        38
   weighted avg       0.95      0.95      0.95        38

------------------------------------------------------------
KNeighborsClassifier()
         

In [84]:
wdbc = pd.read_csv('data/wdbc.data', header=None)

In [85]:
wdbc.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [105]:
wdbc.shape

(569, 32)

In [89]:
X = wdbc.iloc[:,2:]
y = wdbc.iloc[:,1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [101]:
def generate_scores_df(estimators, X_train, y_train, X_test, y_test):
    all_est_df = pd.DataFrame()
    class_reports = {}
    for estimator in estimators:
        pipeln = create_pipeline(estimator)
        y_pred_train, y_pred_test = predict(pipeln, X_train, y_train, X_test)
        
        TP_train, FP_train, FN_train, TN_train = calc_confusion_matrix(y_train, y_pred_train)
        TP_test, FP_test, FN_test, TN_test = calc_confusion_matrix(y_test, y_pred_test)
        
        cv_accuracy, cv_f1 = cross_validate(pipeln, X_train, y_train)
        
        scores = {'estimator':estimator,
                  'TP train - Benign': TP_train[0],
                  'FP train - Benign': FP_train[0],
                  'FN train - Benign': FN_train[0],
                  'TN train - Benign': TN_train[0],
                  'TP train - Malignant': TP_train[1],
                  'FP train - Malignant': FP_train[1],
                  'FN train - Malignant': FN_train[1],
                  'TN train - Malignant': TN_train[1],
                  'TP test - Benign': TP_test[0],
                  'FP test - Benign': FP_test[0],
                  'FN test - Benign': FN_test[0],
                  'TN test - Benign': TN_test[0],
                  'TP test - Malignant': TP_test[1],
                  'FP test - Malignant': FP_test[1],
                  'FN test - Malignant': FN_test[1],
                  'TN test - Malignant': TN_test[1],
                  'CV accuracy': cv_accuracy.mean(),
                  'CV f1': cv_f1.mean()}
        
        scores_df = pd.DataFrame(scores, index=[0])
        all_est_df = pd.concat([all_est_df, scores_df], axis=0)
        
        class_reports[estimator] = met.classification_report(y_test, y_pred_test)
        
    return all_est_df, class_reports

In [102]:
estimators = [DecisionTreeClassifier(max_depth=4), SVC(), KNeighborsClassifier(n_neighbors=5),
             GaussianNB(), MLPClassifier(max_iter=500)]
results, class_reports = generate_scores_df(estimators, X_train, y_train, X_test, y_test)

In [103]:
results

Unnamed: 0,estimator,TP train - Benign,FP train - Benign,FN train - Benign,TN train - Benign,TP train - Malignant,FP train - Malignant,FN train - Malignant,TN train - Malignant,TP test - Benign,FP test - Benign,FN test - Benign,TN test - Benign,TP test - Malignant,FP test - Malignant,FN test - Malignant,TN test - Malignant,CV accuracy,CV f1
0,DecisionTreeClassifier(max_depth=4),266,8,1,151,151,1,8,266,88,13,2,40,40,2,13,88,0.934109,0.935641
0,SVC(),267,6,0,153,153,0,6,267,90,4,0,49,49,0,4,90,0.974086,0.974018
0,KNeighborsClassifier(),266,9,1,150,150,1,9,266,89,5,1,48,48,1,5,89,0.964673,0.964344
0,GaussianNB(),259,16,8,143,143,8,16,259,88,6,2,47,47,2,6,88,0.936434,0.936329
0,MLPClassifier(max_iter=500),267,2,0,157,157,0,2,267,90,5,0,48,48,0,5,90,0.983721,0.9836


In [104]:
for k,v in class_reports.items():
    print(k)
    print(v)
    print('-'*60)

DecisionTreeClassifier(max_depth=4)
              precision    recall  f1-score   support

           B       0.87      0.98      0.92        90
           M       0.95      0.75      0.84        53

    accuracy                           0.90       143
   macro avg       0.91      0.87      0.88       143
weighted avg       0.90      0.90      0.89       143

------------------------------------------------------------
SVC()
              precision    recall  f1-score   support

           B       0.96      1.00      0.98        90
           M       1.00      0.92      0.96        53

    accuracy                           0.97       143
   macro avg       0.98      0.96      0.97       143
weighted avg       0.97      0.97      0.97       143

------------------------------------------------------------
KNeighborsClassifier()
              precision    recall  f1-score   support

           B       0.95      0.99      0.97        90
           M       0.98      0.91      0.94       

Iris dataset has 150 rows and 4 independent features. Since it is clean data we do not need any data cleaning or imputation. All of the variables are contunious, no discrete or categorical varabiles. This means we just need to scale numerical variables before modeling as it is requirement for linear models. Before modeling I applied train-test split 75%-25% with stratify = y parameter which ensures to get same target distribution in train and test sets. Random state is set to 42 to ensure reproducibilty. I have created functions to make it easier to compare different methods mentioned in the task description. This also includes cross validation to get real performance of model.

There are no significant differences in TP between different methods which makes it harder to make a choice. But if we consider cross validation scores, we can see Support Vector Classifier got best cross validation accuracy and weighted f1-score


For 2nd - breast cancer dataset we have 569 rows and 30 features. Again all features are contunious so we only need scaling. Since both datasets share similar characterics I applied same functions to them. The only major difference is that iris has 3 classes, but breast cancer only 2, so it required making adjustment to generate_scores_df function. For this dataset, MLP - Artificial Neural Network method dispalyed better results in terms of all metrics combined.