In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report,confusion_matrix
get_ipython().run_line_magic('matplotlib', 'inline')


# In[ ]:





# In[2]:


def loaddata():
    col = ['Quality','Pre-screening','MA1','MA2','MA3','MA4','MA5','MA6','MA7',
      'exudates1','exudates2','exudates3','exudates4','exudates5','exudates6','exudates7',
      'macula_opticdisc','opticdisc_diamter','AM/FM','Class_label']
    data = pd.read_csv('messidor_features.arff',error_bad_lines=False)
    data.columns = col
    data = data[data['Quality'] != 0]
    return data


# In[3]:


def printaccuracy(y_test,predict,model):
    print(model," report")
    print("-------------------------------------")
    print(" ")
    print(confusion_matrix(y_test,predict))
    print(classification_report(y_test,predict))
    print(" ")
    print("-------------------------------------")
    print(" ")


# In[4]:


def normalizedata(x):
    SS = StandardScaler()
    norm = SS.fit_transform(x)
    print("Normalization done")
    return norm
    


# In[5]:


def PC(components,x):
    cols = []
    pca = PCA(n_components=components)
    pc = pca.fit_transform(x)
    for i in range(components):
        cols.append('pc'+str(i))
    pc_data = pd.DataFrame(data = pc, columns = cols)
    return pc_data


# In[ ]:





# In[6]:


def removeoutliers(data,inplace=False):
    prev_rows = len(data)
    data_copy = data.copy()
    z_score = np.abs(stats.zscore(data_copy))
    data_copy = data_copy[(z_score < 3).all(axis=1)]
    if inplace:
        data=data_copy
    print("Before removing outliers , rows - ", prev_rows)
    print("After removing outliers , rows -", len(data_copy))
    print("Number of records deleted - ", (prev_rows - len(data_copy)))


# In[7]:


def validatecols(data):
    if len(data.columns) == len(col):
        return True
    else:
        return False


# In[8]:


def validatedatatypes(trained, newdata):
    for i in range(trained.columns):
        if trained[trained.columns[i]] != newdata[newdata.columns[i]]:
            return False
    return True    


# In[9]:


def preprocess():
    #newdata = data[data['Quality'] != 0]
    #data = newdata
    #data.drop(['Quality','Pre-screening'],axis=1,inplace=True)
    data.drop(['Quality'],axis=1,inplace=True)
    #data['Class_label'] = pd.Categorical(data['Diagnosis']).codes
    removeoutliers(data,inplace=True)
    X = data.drop('Class_label',axis=1)
    X_copy = X.copy()
    y= data['Class_label']
    X = normalizedata(X)
    print(X[:5,:])
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
    return X_train,X_test,y_train,y_test,X_copy,y


# In[10]:


def k_vs_error_graph():
    knn_error = []
    for i in range(2,50):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train,y_train)
        knn_predict= knn.predict(X_test)
        knn_error.append(np.mean(y_test!=knn_predict))
    plt.plot(range(2,50),knn_error)
    plt.xlabel("K value")
    plt.ylabel("Error")


# In[11]:


def logisticregression():
    lr = LogisticRegression(solver='lbfgs',max_iter=10000,random_state=0)
    lr.fit(X_train,y_train)
    lr_predict = lr.predict(X_test)
    printaccuracy(y_test,lr_predict,"Logistic Regression")
    return f1_score(y_test,lr_predict)


# In[12]:


def KNN():
    neighbors={'n_neighbors':np.array(range(2,50))}
    knn_grid=GridSearchCV(KNeighborsClassifier(),neighbors,verbose=False,refit=True,cv=3)
    knn_grid.fit(X_train,y_train)
    #knn_grid.best_params_
    knn_predict = knn_grid.predict(X_test)
    printaccuracy(y_test,knn_predict,"KNN")
    return f1_score(y_test,knn_predict)


# In[13]:


def SVM():
    svm = SVC(kernel='rbf',random_state=0)
    params = { 'C' : np.logspace(0, 3, 4), 'gamma' : np.logspace(-2, 1, 4)}
    svm_grid = GridSearchCV(svm, params,cv=3,verbose=False,return_train_score=True)
    #PCA
#     svm_X = X.copy()
#     svm_X = PC(2,svm_X)
#     svm_X_train,svm_X_test,svm_y_train,svm_y_test=train_test_split(svm_X,y,test_size=0.3)
#     svm_grid.fit(svm_X_train,svm_y_train)
#     svm_predict = svm_grid.predict(svm_X_test)
#     printaccuracy(svm_y_test,svm_predict,"SVM")
    svm_grid.fit(X_train,y_train)
    svm_predict = svm_grid.predict(X_test)
    printaccuracy(y_test,svm_predict,"SVM")
    return f1_score(y_test,svm_predict)


# In[14]:


def DecisionTree():
    dt = DecisionTreeClassifier(random_state=0)
    dt.fit(X_train,y_train)
    dt_predict = dt.predict(X_test)
    printaccuracy(y_test,dt_predict,"Decision Tree")
    return f1_score(y_test,dt_predict)


# In[15]:


def RandomForest():
    rf = RandomForestClassifier(random_state=0)
    params = { 'n_estimators' : np.arange(10,100,10), 'max_depth' : np.arange(5,50,5)}
    rf_grid = GridSearchCV(rf, params, verbose=False, cv=3)
    rf_grid.fit(X_train,y_train)
    rf_predict = rf_grid.predict(X_test)
    printaccuracy(y_test,rf_predict,"Random Forest")
    rf
    return f1_score(y_test,rf_predict)


# In[16]:


def Adaboost():
    ab = AdaBoostClassifier(random_state=0)
    params = { 'n_estimators' : np.arange(10,100,10)}
    ab_grid = GridSearchCV(ab, params, verbose=False, cv=3)
    ab_grid.fit(X_train,y_train)
    ab_predict = ab_grid.predict(X_test)
    printaccuracy(y_test,ab_predict,"Adaboost")
    return f1_score(y_test,ab_predict)


# In[17]:


def GaussionNB():
    gnb = GaussianNB()
    gnb.fit(X_train,y_train)
    gnb_predict = gnb.predict(X_test)
    printaccuracy(y_test,gnb_predict,"GaussionNB")
    return f1_score(y_test,gnb_predict)


# In[18]:


def NueralNetwork():
    nn = MLPClassifier(solver='sgd',random_state=0)
    params = {
    'hidden_layer_sizes': np.arange(50,150,20),
    'learning_rate': ['constant','adaptive'],
    'max_iter': np.arange(200,300,50)
    }
    #'hidden_layer_sizes': [(100,50), (50,20), (20,10)],
    #'hidden_layer_sizes': np.arange(10,100,20)
    # 'activation': ['tanh', 'relu'],
    # 'alpha': 10.0 ** -np.arange(1, 5),
    #'solver': ['sgd', 'adam'],
    nn_grid = GridSearchCV(nn, params, cv=3,verbose=False)
    nn_grid.fit(X_train,y_train)
    nn_predict = nn_grid.predict(X_test)
    printaccuracy(y_test,nn_predict,"Nueral Network")
    return f1_score(y_test,nn_predict)




def EvaluateModels(f1scores):
    models=['Logistic Regression',
            'K Nearest Neighbours',
            'Support Vector Machine',
            'Decision Tree',
            'Random Forest',
            'AdaBoost',
            'Guassian Naive Bayes',
            'Nueral Network'
           ]

    print("        Model Results    ")
    print(" --------------------------- ")
    for i in range(len(f1scores)):
        print(models[i]," : f1 score - ",f1scores[i])
    print(" -----------------------------")
    print("Best model")
    idx = f1scores.index(np.max(f1scores))
    print(models[idx] ," : f1 score - ", np.max(f1scores))
    
    
    
    
    


# In[20]:


def trainmodels():
    f1scores=[]
    f1scores.append(logisticregression())
    f1scores.append(KNN())
    f1scores.append(SVM())
    f1scores.append(DecisionTree())
    f1scores.append(RandomForest())
    f1scores.append(Adaboost())
    f1scores.append(GaussionNB())
    f1scores.append(NueralNetwork())
    return f1scores


# In[ ]:





# In[21]:


data = loaddata()
X_train,X_test,y_train,y_test,X,y = preprocess()
f1scores = trainmodels()
EvaluateModels(f1scores)




Before removing outliers , rows -  1146
After removing outliers , rows - 968
Number of records deleted -  178
Normalization done
[[ 0.29892075 -0.56538008 -0.53781883 -0.57822015 -0.67938225 -0.65549369
  -0.54164818 -0.11320685  0.02902586 -0.46757334 -0.40974174 -0.2247698
  -0.19766898 -0.2056762  -0.18677961 -0.08203334  2.01997381 -0.70849554]
 [ 0.29892075  0.9166624   0.95453075  1.04289138  1.02448366  0.93244145
   0.78172633 -0.14533594  0.22321933  0.34118444  0.76581507  0.33386819
   0.15114302 -0.11074238 -0.165352    0.27455888  1.1308808  -0.70849554]
 [ 0.29892075  0.64365457  0.66435167  0.78000843  0.83516522  0.72754659
   0.64938888 -0.40811981 -0.21888419  0.03290619  0.31452732  0.11136405
   0.05593083 -0.19628648 -0.20001218 -1.42421279  0.35991511 -0.70849554]
 [ 0.29892075  0.21464228  0.29126427  0.385684    0.40919875  0.52265174
   0.38471398 -0.79194201 -0.67614122 -0.71939424 -0.46935456 -0.22633842
  -0.20135496 -0.21544902 -0.20855309 -1.68637728  0.85







Nueral Network  report
-------------------------------------
 
[[129  44]
 [ 63 108]]
              precision    recall  f1-score   support

           0       0.67      0.75      0.71       173
           1       0.71      0.63      0.67       171

    accuracy                           0.69       344
   macro avg       0.69      0.69      0.69       344
weighted avg       0.69      0.69      0.69       344

 
-------------------------------------
 
        Model Results    
 --------------------------- 
Logistic Regression  : f1 score -  0.7283582089552239
K Nearest Neighbours  : f1 score -  0.6211180124223603
Support Vector Machine  : f1 score -  0.7477744807121661
Decision Tree  : f1 score -  0.6558265582655827
Random Forest  : f1 score -  0.6741573033707865
AdaBoost  : f1 score -  0.6685393258426967
Guassian Naive Bayes  : f1 score -  0.41818181818181815
Nueral Network  : f1 score -  0.6687306501547988
 -----------------------------
Best model
Support Vector Machine  : f1 score - 