<h1>Machine learning project</h1>
<hr>
<p>This Jupyter notebook resume all our programmation work on binary classification, the Banknote Authentication Dataset and the Kidney Disease Dataset.</p>
<p>Students:<br>
    <li>Ettoré Hidoux</li>
    <li>Agathe Fernandez Machado</li>
    <li>Yasmine Diouri</li>
    <li>Clément Mathé</li></p>

## Data Cleaning:

In [33]:
# Libraries: Standard ones
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import random as rnd
    # Libraries: scikit learn for preprocessing 
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split

def preprocessing(abcd_csv, clas, lp, sep):
    #abcd_csv is data name
    #clas is the class namme of the data
    #lp is yhe number of a full row 
    #sep is the symbol use as separator for the data
    
    # Load the data: data_banknote_authentification
    data = pd.read_csv(abcd_csv,sep=sep)
    
    # X/Y separation
    # transform class column from string into boolean if it's necessary
    if isinstance(data[clas][lp], str):
        Y = np.multiply([data[clas]==data[clas][0]],1)[0]
    else:
        Y = data[clas]
    data.drop(clas, 1, inplace=True)
    
    #transform data column from string into boolean if it's necessary
    for c in data:
        if isinstance(data[c][lp], str):
            a = np.multiply([data[c] == data[c][lp]],1)
            data.drop(c, 1, inplace=True)
            data[c] = a[0]
        data[c] = np.nan_to_num(data[c], copy=True, nan=data[c].median())
    
    # Correlation matrix of the data columns 
    corr_matrix = data.corr().abs()
    high_corr_var=np.where(corr_matrix>0.75)
    high_corr_var=np.array([(corr_matrix.columns[x],corr_matrix.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y])
    
    # Suppresion of correlate columns
    for i in range(len(high_corr_var)):
        c = high_corr_var[i][0]
        data.drop(c, 1, inplace=True)
        
    X = data
    X = StandardScaler().fit_transform(X)
    
    # Creation of a dataset to train and another to test
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)
    return  x_train, x_test, y_train, y_test, X, Y

In [34]:
# test data_set1
x_train1, x_test1, y_train1, y_test1, X1, Y1 = preprocessing("data_banknote_authentication.csv", "class", 4, ";")

In [35]:
#test data_set2
x_train2, x_test2, y_train2, y_test2, X2, Y2 = preprocessing("kidney_disease.csv", "classification", 4, ',')


### Result evaluation

In [10]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# the function that give the percentage of accuracy of the function (the same as .score())
def accuracy(y_predict, y_test):
    return np.mean([y_predict==y_test])

## 1-cross-validation

In [3]:
import numpy as np

# Models import list
from sklearn import svm
from sklearn.linear_model import SGDClassifier  
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

# model could take the value "best" or "scores"
def models_1cv(x_train, x_test, y_train, y_test, model): 
    # Models list
    models = [svm.SVC(kernel='linear'),
              svm.SVC(kernel='poly', degree=2, gamma='auto'),
              svm.SVC(kernel='rbf', gamma='auto'),
              svm.SVC(kernel='sigmoid', gamma=1./150),
              SGDClassifier(),
              DecisionTreeClassifier(),
              GaussianNB(),
              RandomForestRegressor(n_estimators = 1000, random_state = 42),
              MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,2), random_state=42),
              LogisticRegression(random_state=0)]
    # Models apply the train dataset list
    models = [clf.fit(x_train, y_train) for clf in models]
    # Models score list
    scores = [clf.score(x_test,y_test) for clf in models]

    # Models name list 
    titles = ['SVC with linear kernel',
              'SVC with polynomial (degree 2) kernel',
              'SVC with RBF kernel',
              'SVC with sigmoid kernel',
              'Stochastic Gradient Descent',
              'Desicion Trees',
              'Bayesien Network: Gnb',
              'Random Forest',
              'Neural Network',
              'Probit model']
    
    if model == 'best':
        # return the name and the score of the method that obtain the best result 
        return titles[scores.index(max(scores))], max(scores)
    if model == 'scores':
        # return the name and the score of all the method
        return [(titles[i],scores[i]) for i in range(10)]
    

In [15]:
models_1cv(x_train1, x_test1, y_train1, y_test1, 'best')

('SVC with RBF kernel', 0.9709090909090909)

In [36]:
models_1cv(x_train2, x_test2, y_train2, y_test2, 'best')

('SVC with RBF kernel', 0.9875)

In [43]:
models_1cv(x_train1, x_test1, y_train1, y_test1, 'scores')

[('SVC with linear kernel', 0.9090909090909091),
 ('SVC with polynomial (degree 2) kernel', 0.7236363636363636),
 ('SVC with RBF kernel', 0.9709090909090909),
 ('SVC with sigmoid kernel', 0.8909090909090909),
 ('Stochastic Gradient Descent', 0.8872727272727273),
 ('Desicion Trees', 0.9563636363636364),
 ('Bayesien Network: Gnb', 0.8036363636363636),
 ('Random Forest', 0.9055074922855927),
 ('Neural Network', 0.9672727272727273),
 ('Probit model', 0.9054545454545454)]

## N-cross-validation

In [17]:
# Models import list
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.linear_model import SGDClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

# model could take the value "best" or "scores"
def models_Ncv(X, Y, N, model):
    # Models list
    models = [svm.SVC(kernel='linear'),
              svm.SVC(kernel='poly', degree=2, gamma='auto'),
              svm.SVC(kernel='rbf', gamma='auto'),
              svm.SVC(kernel='sigmoid', gamma=1./150),
              SGDClassifier(),
              DecisionTreeClassifier(),
              GaussianNB(),
              RandomForestRegressor(n_estimators = 1000, random_state = 42),
              MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,2), random_state=42),
              LogisticRegression(random_state=0)]
    # Models score list with a N-cross-validation (the method is try N times each)
    scores = [np.mean(cross_val_score(clf, X, Y, cv=N)) for clf in models]
    
    # Models name list
    titles = ['SVC with linear kernel',
              'SVC with polynomial (degree 2) kernel',
              'SVC with RBF kernel',
              'SVC with sigmoid kernel',
              'Stochastic Gradient Descent',
              'Desicion Trees',
              'Bayesien Network: Gnb',
              'Random Forest',
              'Neural Network',
              'Probit model']
    
    if model == 'best':
        # return the name and the score of the method that obtain the best result 
        return titles[scores.index(max(scores))], max(scores)
    if model == 'scores':
        # return the name and the score of all the method
        return [(titles[i],scores[i]) for i in range(10)]

In [18]:
models_Ncv(X1,Y1,10, 'best')

('SVC with RBF kernel', 0.9832487041150959)

In [37]:
models_Ncv(X2,Y2,10, 'best')

('SVC with RBF kernel', 0.9875)

In [45]:
scores_Ncv(X1,Y1,10, 'scores')

[('SVC with linear kernel', 0.9082196128213266),
 ('SVC with polynomial (degree 2) kernel', 0.7377975245953665),
 ('SVC with RBF kernel', 0.9832487041150959),
 ('SVC with sigmoid kernel', 0.8965672273352375),
 ('Stochastic Gradient Descent', 0.8936898339151593),
 ('Desicion Trees', 0.9628530625198349),
 ('Bayesien Network: Gnb', 0.8325187771077964),
 ('Random Forest', 0.0922050889032258),
 ('Neural Network', 0.9766952290278219),
 ('Probit model', 0.9067703374590078)]

## Model Mean

In [20]:
# Models import list
from sklearn import svm
from sklearn.linear_model import SGDClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression


def model_mean(x_train, x_test, y_train, y_test, model):
    # Models list
    models = [svm.SVC(kernel='linear'),
              svm.SVC(kernel='poly', degree=2, gamma='auto'),
              svm.SVC(kernel='rbf', gamma='auto'),
              svm.SVC(kernel='sigmoid', gamma=1./150),
              SGDClassifier(),
              DecisionTreeClassifier(),
              GaussianNB(),
              RandomForestRegressor(n_estimators = 1000, random_state = 42),
              MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,2), random_state=42),
              LogisticRegression(random_state=0)]
    # Models apply the train dataset list
    models = [clf.fit(x_train, y_train) for clf in models]
    # Models predictions for the input x_test
    predicts = [clf.predict(x_test) for clf in models]
    # Models score list
    scores = [clf.score(x_test,y_test) for clf in models]
    # Models predictions mean 
    mean_predict = sum(p for p in predicts)/10
    mean_predict = [np.round(mean_predict[i]) for i in range(len(mean_predict))]
    # Add model mean score to scores
    scores.append(accuracy(mean_predict, y_test))

    # Models name list
    titles = ['SVC with linear kernel',
              'SVC with polynomial (degree 2) kernel',
              'SVC with RBF kernel',
              'SVC with sigmoid kernel',
              'Stochastic Gradient Descent',
              'Desicion Trees',
              'Bayesien Network: Gnb',
              'Random Forest',
              'Neural Network',
              'Probit model',
              'Model Mean']

    if model == 'best':
        # return the name and the score of the method that obtain the best result 
        return titles[scores.index(max(scores))], max(scores)
    if model == 'scores':
        # return the name and the score of all the method
        return [(titles[i],scores[i]) for i in range(11)]

In [21]:
model_mean(x_train1, x_test1, y_train1, y_test1, 'best')

('SVC with RBF kernel', 0.9709090909090909)

In [38]:
model_mean(x_train2, x_test2, y_train2, y_test2, 'best')

('Model Mean', 1.0)

In [49]:
model_mean(x_train1, x_test1, y_train1, y_test1, 'scores')

[('SVC with linear kernel', 0.9090909090909091),
 ('SVC with polynomial (degree 2) kernel', 0.7236363636363636),
 ('SVC with RBF kernel', 0.9709090909090909),
 ('SVC with sigmoid kernel', 0.8909090909090909),
 ('Stochastic Gradient Descent', 0.8945454545454545),
 ('Desicion Trees', 0.9527272727272728),
 ('Bayesien Network: Gnb', 0.8036363636363636),
 ('Random Forest', 0.9055074922855927),
 ('Neural Network', 0.9672727272727273),
 ('Probit model', 0.9054545454545454),
 ('Model Mean', 0.9345454545454546)]

# GLOBAL FUNCTION

In [None]:
# Libraries: Standard ones
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random as rnd
# Libraries: scikit learn for preprocessing 
from sklearn.preprocessing import StandardScalefrom sklearn.model_selection import train_test_split
# Models import list
from sklearn import svm
from sklearn.linear_model import SGDClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

def finalFunction(abcd_csv, clas, lp, sep, N, model):
    # Data cleaning 
    x_train, x_test, y_train, y_test, X, Y = preprocessing(abcd_csv, clas, lp, sep)
    # Show the result 
    if N == 1:
        return model_mean(x_train, x_test, y_train, y_test, model)
    if N > 1:
        return models_Ncv(X, Y, N, model)
