In [192]:
import warnings
warnings.filterwarnings('ignore')

In [193]:
def test_classifiers(data, withScaller=False, withMinMaxScaller=False):
    scores = {}
    scaled_features = data.drop('Outcome', axis=1)
    
    from sklearn.model_selection import cross_val_score

    # min max scaller

    if withMinMaxScaller:
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        scaler.fit(data.drop('Outcome', axis=1))
        scaled_features = scaler.transform(data.drop('Outcome', axis=1))

    # standard scaller

    if withScaller:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        scaler.fit(data.drop('Outcome', axis=1))
        scaled_features = scaler.transform(data.drop('Outcome', axis=1))

    #cross validation

    from sklearn.model_selection import train_test_split
    X = scaled_features
    y = data['Outcome']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=121)

    #SVC

    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import accuracy_score

    #     param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}
    #     grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
    #     grid.fit(X_train,y_train)
    #     grid.best_params_
    #     grid.best_estimator_
    #     grid_predictions = grid.predict(X_test)

    svc = SVC(gamma='auto')
    svc.fit(X_train, y_train)
    cross_val_score(svc, X_train, y_train, cv = 10)
    predictions = svc.predict(X_test)
    scores['SVC'] = accuracy_score(y_test, predictions)

    # KNN

    from sklearn.neighbors import KNeighborsClassifier

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    scores['KNN'] = accuracy_score(y_test, predictions)

    ##Decision tree

    from sklearn.tree import DecisionTreeClassifier
    dtree = DecisionTreeClassifier()
    dtree.fit(X_train, y_train)
    predictions = dtree.predict(X_test)
    scores['DecTree'] = accuracy_score(y_test, predictions)

    # Random forest

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import accuracy_score
    
#     param_grid = {"max_depth": [3, None],
#               "max_features": [1, 2, 3, 4, 5, 6, 7],
#               "min_samples_split": [2, 3, 7, 10],
#               "bootstrap": [True, False],
#               "criterion": ["gini", "entropy"]}
#     grid = GridSearchCV(RandomForestClassifier(n_estimators=100),param_grid,cv=5)
#     grid.fit(X_train,y_train)
#     print(grid.best_params_)
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(X_train, y_train)
    predictions = rfc.predict(X_test)
    scores['RandomForest'] = accuracy_score(y_test, predictions)

    # Logistic regression

    from sklearn.linear_model import LogisticRegression
    logmodel = LogisticRegression(solver='liblinear')
    logmodel.fit(X_train, y_train)
    predictions = logmodel.predict(X_test)
    scores['LogisticReg'] = accuracy_score(y_test, predictions)

    # Gaussian naive bayes

    from sklearn.naive_bayes import GaussianNB
    gnbmodel = GaussianNB()
    gnbmodel.fit(X_train, y_train)
    predictions = gnbmodel.predict(X_test)
    scores['Gaussian'] = accuracy_score(y_test, predictions)
    
    #Ada boost

    from sklearn.ensemble import AdaBoostClassifier
    ab = AdaBoostClassifier()
    ab.fit(X_train, y_train)
    predictions = ab.predict(X_test)
    scores['AdaBoost'] = accuracy_score(y_test, predictions)

    return scores

In [194]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
%matplotlib inline
import json

from os import listdir
from os.path import isfile, join
mypath = './tests'
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]

#files = ['diabetes_median_log.csv', 'diabetes_median.csv']
scores = {}
bestScores = {}

for i in files:
    data1 = pd.read_csv('tests/' + i)
    
    scores[i + ' bezScalera'] = test_classifiers(data1)
    scores[i + ' standardScaller'] = test_classifiers(data1, withScaller=True)
    scores[i + ' minMaxScaller']  = test_classifiers(data1, withMinMaxScaller=True)

for i in files:
    for j in [' bezScalera', ' standardScaller', ' minMaxScaller']:
        
        tempScores = scores[i + j]
    
        print('Test dla pliku: ' + i)
        print('\n')
        print(j)
        print(json.dumps(tempScores, indent=1))
        print('Najlepszy klasyfikator w zbiorze: ')
        best = max(tempScores, key=tempScores.get)
        print(best + ' dopasowanie: ' + str(tempScores[best]))
        bestScores[i + j] = tempScores[best]
        print('\n')
        
print('Najlepszy wynik: ')
best = max(bestScores, key=bestScores.get)
print(best + ' dopasowanie: ' + str(bestScores[best]))


Test dla pliku: diabetes.csv


 bezScalera
{
 "SVC": 0.645021645021645,
 "KNN": 0.7662337662337663,
 "DecTree": 0.7359307359307359,
 "RandomForest": 0.8051948051948052,
 "LogisticReg": 0.8138528138528138,
 "Gaussian": 0.7835497835497836,
 "AdaBoost": 0.7748917748917749
}
Najlepszy klasyfikator w zbiorze: 
LogisticReg dopasowanie: 0.8138528138528138


Test dla pliku: diabetes.csv


 standardScaller
{
 "SVC": 0.7878787878787878,
 "KNN": 0.7922077922077922,
 "DecTree": 0.7662337662337663,
 "RandomForest": 0.7878787878787878,
 "LogisticReg": 0.8138528138528138,
 "Gaussian": 0.7835497835497836,
 "AdaBoost": 0.7748917748917749
}
Najlepszy klasyfikator w zbiorze: 
LogisticReg dopasowanie: 0.8138528138528138


Test dla pliku: diabetes.csv


 minMaxScaller
{
 "SVC": 0.7619047619047619,
 "KNN": 0.7792207792207793,
 "DecTree": 0.7705627705627706,
 "RandomForest": 0.8181818181818182,
 "LogisticReg": 0.7878787878787878,
 "Gaussian": 0.7835497835497836,
 "AdaBoost": 0.7748917748917749
}
Najlepszy kl