# DATA CLEANING:

In [55]:
def preprocessing(abcd_csv, clas, lp, corr_threshold=0.75, test_size=0.20,
                  random_state=42):
    """Load a ';'-separated CSV and return scaled features plus a train/test split.

    Processing steps, in order:

    1. Read the file with ``pandas.read_csv(..., sep=";")``.
    2. Split off the label column ``clas``.  If its first value is a string,
       the labels are binarised: 1 where the label equals that first value,
       0 elsewhere.  Otherwise the column is used unchanged.
    3. For every feature column whose value in row ``lp`` is a string,
       binarise it the same way (1 where equal to the value in row ``lp``).
       Then replace NaNs in every feature column with the column median.
    4. For every pair of feature columns whose absolute correlation exceeds
       ``corr_threshold``, drop the earlier column of the pair.
    5. Standardise the remaining features with ``StandardScaler`` and split
       them with ``train_test_split``.

    Args:
        abcd_csv: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        clas: Name of the label column.
        lp: Position of the reference row used to detect and binarise
            string-valued feature columns.
        corr_threshold: Absolute correlation above which two feature columns
            are considered redundant.
        test_size: Fraction of rows put in the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(x_train, x_test, y_train, y_test, X, Y)`` where ``X`` is the
        scaled feature array and ``Y`` the (possibly binarised) labels.

    Raises:
        KeyError: If ``clas`` is not a column of the parsed file.
    """
    # Imports stay local so the notebook cell remains self-contained.
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    data = pd.read_csv(abcd_csv, sep=";")
    if clas not in data.columns:
        # A wrong separator collapses the header into a single column, which
        # otherwise surfaces as an unhelpful bare KeyError further down.
        raise KeyError(
            "label column {!r} not found; columns read: {} (check the column "
            "name and that the file is ';'-separated)".format(
                clas, list(data.columns)))

    # X/Y separation
    labels = data[clas]
    first_label = labels.iloc[0]
    if isinstance(first_label, str):
        Y = (labels == first_label).to_numpy().astype(int)
    else:
        Y = labels
    data = data.drop(columns=clas)

    for col in list(data.columns):
        reference = data[col].iloc[lp]
        if isinstance(reference, str):
            encoded = (data[col] == reference).to_numpy().astype(int)
            # Drop and re-add so encoded columns move to the end, which is
            # the column order the original implementation produced.
            data = data.drop(columns=col)
            data[col] = encoded
        data[col] = np.nan_to_num(data[col], copy=True, nan=data[col].median())

    # Correlated columns: looking only at the strict upper triangle means
    # each pair is seen once and the earlier column of the pair is removed.
    # The last column can never be removed, so at least one member of every
    # correlated group survives.
    corr_matrix = data.corr().abs()
    too_high = np.triu(corr_matrix.to_numpy() > corr_threshold, k=1)
    redundant = corr_matrix.columns[too_high.any(axis=1)]
    data = data.drop(columns=redundant)

    X = StandardScaler().fit_transform(data)
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    return x_train, x_test, y_train, y_test, X, Y

# Result evaluation

In [45]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def accuracy(y_predict, y_test):
    """Return the fraction of predictions that equal the true labels.

    Both arguments are converted to NumPy arrays first, so plain lists and
    tuples are compared element by element (two Python lists compared with
    ``==`` would otherwise collapse to a single True/False).

    Args:
        y_predict: Predicted labels (array-like).
        y_test: True labels (array-like) of the same shape.

    Returns:
        The mean of the element-wise equality, a float in [0, 1].

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    predicted = np.asarray(y_predict)
    actual = np.asarray(y_test)
    if predicted.shape != actual.shape:
        # Broadcasting mismatched shapes would silently yield a wrong score.
        raise ValueError(
            "shape mismatch: y_predict {} vs y_test {}".format(
                predicted.shape, actual.shape))
    return np.mean(predicted == actual)

# 1-cross-validation (single hold-out train/test split)

In [35]:
import numpy as np
# List of models
from sklearn import svm
from sklearn.linear_model import SGDClassifier  
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression


def models_1cv(x_train, x_test, y_train, y_test):
    """Fit each candidate classifier once and return the best one's title and score.

    Every model is fitted on ``(x_train, y_train)`` and scored with its own
    ``score`` method on ``(x_test, y_test)``.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        ``(title, score)`` of the model with the highest test score; on a
        tie the model listed first wins.
    """
    # The random forest must be a classifier: a regressor's ``score`` is R^2,
    # which is not comparable with the accuracy the other models report.
    # Imported locally because the file-level imports only provide the
    # regressor.
    from sklearn.ensemble import RandomForestClassifier

    # Titles are kept byte-for-byte for existing callers.  Note that
    # GaussianNB is a naive Bayes classifier and LogisticRegression is a
    # logit (not probit) model.
    candidates = [
        ('SVC with linear kernel', svm.SVC(kernel='linear')),
        ('SVC with polynomial (degree 2) kernel',
         svm.SVC(kernel='poly', degree=2, gamma='auto')),
        ('SVC with RBF kernel', svm.SVC(kernel='rbf', gamma='auto')),
        ('SVC with sigmoid kernel', svm.SVC(kernel='sigmoid', gamma=1./150)),
        ('Stochastic Gradient Descent', SGDClassifier()),
        ('Desicion Trees', DecisionTreeClassifier()),
        ('Bayesien Network: Gnb', GaussianNB()),
        ('Random Forest',
         RandomForestClassifier(n_estimators=1000, random_state=42)),
        ('Neural Network',
         MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2),
                       random_state=42)),
        ('Probit model', LogisticRegression(random_state=0)),
    ]

    best_title, best_score = None, None
    for title, clf in candidates:
        score = clf.fit(x_train, y_train).score(x_test, y_test)
        # Strict comparison keeps the first model on ties.
        if best_score is None or score > best_score:
            best_title, best_score = title, score

    return best_title, best_score

In [None]:
# Test on dataset 1: load the banknote CSV with "classification" as the label
# column; row 4 is passed as `lp`, the reference row preprocessing() inspects
# to find string-valued feature columns.
x_train, x_test, y_train, y_test, X, Y = preprocessing("data_banknote_authentification.csv", "classification", 4)

In [86]:
# Test on dataset 2: same call on the kidney-disease CSV.
# NOTE(review): the recorded output of this cell is KeyError: 'classification',
# i.e. no such column was found after parsing. Presumably this file is not
# ';'-separated or names its label column differently -- verify the CSV.
x_train, x_test, y_train, y_test, X, Y = preprocessing("kidney_disease.csv", "classification", 4)


KeyError: 'classification'

# N-fold cross-validation

In [60]:
# List of models
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.linear_model import SGDClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

def models_Ncv(X, Y, N):
    """Score each candidate classifier by N-fold cross-validation and return the best.

    Each model's score is the mean of ``cross_val_score(clf, X, Y, cv=N)``.

    Args:
        X: Feature matrix.
        Y: Labels.
        N: Value passed as ``cv`` to ``cross_val_score``.

    Returns:
        ``(title, mean_score)`` of the model with the highest mean
        cross-validation score; on a tie the model listed first wins.
    """
    # The random forest must be a classifier: cross_val_score falls back on
    # the estimator's own ``score``, which is R^2 for a regressor and so not
    # comparable with the accuracy of the other models.  Imported locally
    # because the file-level imports only provide the regressor.
    from sklearn.ensemble import RandomForestClassifier

    # Titles are kept byte-for-byte for existing callers.  Note that
    # GaussianNB is a naive Bayes classifier and LogisticRegression is a
    # logit (not probit) model.
    candidates = [
        ('SVC with linear kernel', svm.SVC(kernel='linear')),
        ('SVC with polynomial (degree 2) kernel',
         svm.SVC(kernel='poly', degree=2, gamma='auto')),
        ('SVC with RBF kernel', svm.SVC(kernel='rbf', gamma='auto')),
        ('SVC with sigmoid kernel', svm.SVC(kernel='sigmoid', gamma=1./150)),
        ('Stochastic Gradient Descent', SGDClassifier()),
        ('Desicion Trees', DecisionTreeClassifier()),
        ('Bayesien Network: Gnb', GaussianNB()),
        ('Random Forest',
         RandomForestClassifier(n_estimators=1000, random_state=42)),
        ('Neural Network',
         MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2),
                       random_state=42)),
        ('Probit model', LogisticRegression(random_state=0)),
    ]

    best_title, best_score = None, None
    for title, clf in candidates:
        score = np.mean(cross_val_score(clf, X, Y, cv=N))
        # Strict comparison keeps the first model on ties.
        if best_score is None or score > best_score:
            best_title, best_score = title, score

    return best_title, best_score



In [61]:
# Select the best model by 2-fold cross-validation on the X, Y currently in
# the notebook namespace.
models_Ncv(X,Y,2)

('SVC with RBF kernel', 0.9781521466977309)

# Model Mean

In [46]:
# List of models
from sklearn import svm
from sklearn.linear_model import SGDClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression


def model_mean(x_train, x_test, y_train, y_test):
    """Compare the individual classifiers with their averaged ("Model Mean") ensemble.

    Every model is fitted on the training split and scored on the test split
    with its own ``score`` method.  The ensemble prediction is the rounded
    element-wise mean of all models' predictions, scored with ``accuracy``.

    Assumes numeric 0/1 labels, since predictions are averaged and rounded
    (TODO confirm for other label encodings).  An exact 0.5 mean rounds to 0
    because ``np.round`` rounds halves to the nearest even value.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        ``(title, score)`` of the best entry among the individual models and
        the ensemble; on a tie the entry listed first wins.
    """
    # The random forest must be a classifier: a regressor's ``score`` is R^2
    # (not accuracy) and its continuous predictions would enter the vote with
    # fractional weight.  Imported locally because the file-level imports
    # only provide the regressor.
    from sklearn.ensemble import RandomForestClassifier

    # Titles are kept byte-for-byte for existing callers.  Note that
    # GaussianNB is a naive Bayes classifier and LogisticRegression is a
    # logit (not probit) model.
    candidates = [
        ('SVC with linear kernel', svm.SVC(kernel='linear')),
        ('SVC with polynomial (degree 2) kernel',
         svm.SVC(kernel='poly', degree=2, gamma='auto')),
        ('SVC with RBF kernel', svm.SVC(kernel='rbf', gamma='auto')),
        ('SVC with sigmoid kernel', svm.SVC(kernel='sigmoid', gamma=1./150)),
        ('Stochastic Gradient Descent', SGDClassifier()),
        ('Desicion Trees', DecisionTreeClassifier()),
        ('Bayesien Network: Gnb', GaussianNB()),
        ('Random Forest',
         RandomForestClassifier(n_estimators=1000, random_state=42)),
        ('Neural Network',
         MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2),
                       random_state=42)),
        ('Probit model', LogisticRegression(random_state=0)),
    ]

    titles = [title for title, _ in candidates]
    fitted = [clf.fit(x_train, y_train) for _, clf in candidates]
    predicts = [clf.predict(x_test) for clf in fitted]
    scores = [clf.score(x_test, y_test) for clf in fitted]

    # Average over however many models are listed instead of a hard-coded 10,
    # then round to the nearest label.
    mean_predict = np.round(np.mean(predicts, axis=0))

    titles.append('Model Mean')
    scores.append(accuracy(mean_predict, y_test))

    best = scores.index(max(scores))
    return titles[best], scores[best]

In [47]:
# Compare the individual models and their averaged ensemble on the current
# train/test split.
model_mean(x_train, x_test, y_train, y_test)

529     0
243     0
1310    1
664     0
745     0
       ..
1095    1
1130    1
1294    1
860     1
1126    1
Name: class, Length: 1098, dtype: int64


('SVC with RBF kernel', 0.9709090909090909)