In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def classify(clf, data):
    scaled_features = data.drop('Outcome', axis=1)

    #cross validation

    X = scaled_features
    y = data['Outcome']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=121)

    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return accuracy_score(y_test, predictions)

def test_classifiers(data, withScaller=False, withMinMaxScaller=False):
    scores = {}
    
    from sklearn.model_selection import cross_val_score

    # min max scaller

    if withMinMaxScaller:
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        scaler.fit(data.drop('Outcome', axis=1))
        scaled_features = scaler.transform(data.drop('Outcome', axis=1))

    # standard scaller

    if withScaller:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        scaler.fit(data.drop('Outcome', axis=1))
        scaled_features = scaler.transform(data.drop('Outcome', axis=1))

    #SVC

    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV

    svc = SVC(gamma='auto')
    scores['SVC'] = classify(svc, data)

    # KNN

    from sklearn.neighbors import KNeighborsClassifier

    knn = KNeighborsClassifier(n_neighbors=5)
    scores['KNN'] = classify(knn, data)

    ##Decision tree

    from sklearn.tree import DecisionTreeClassifier
    dtree = DecisionTreeClassifier()
    scores['DecTree'] = classify(dtree, data)

    # Random forest

    from sklearn.ensemble import RandomForestClassifier

    rfc = RandomForestClassifier(n_estimators=100)
    scores['RandomForest'] = classify(rfc, data)

    # Logistic regression

    from sklearn.linear_model import LogisticRegression
    logmodel = LogisticRegression(solver='liblinear')
    scores['LogisticReg'] = classify(logmodel, data)

    # Gaussian naive bayes

    from sklearn.naive_bayes import GaussianNB
    gnbmodel = GaussianNB()
    scores['Gaussian'] = classify(gnbmodel, data)
    
    #Ada boost

    from sklearn.ensemble import AdaBoostClassifier
    ab = AdaBoostClassifier()
    scores['AdaBoost'] = classify(ab, data)
    
    # grid search for random forest
    
#     from sklearn.model_selection import GridSearchCV
#     from sklearn.ensemble import RandomForestClassifier
    
#     scaled_features = data.drop('Outcome', axis=1)

#     #cross validation

#     X = scaled_features
#     y = data['Outcome']
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.3, random_state=121)

#     param_grid = {"max_depth": [1, 2, 3, 5, 10, 20, None],
#           "max_features": [1, 2, 3, 4, 5, 6, 7],
#           "min_samples_split": [2, 3, 7, 10, 20],
#           "bootstrap": [True, False],
#           "criterion": ["gini", "entropy"]}
#     grid = GridSearchCV(RandomForestClassifier(n_estimators=100, n_jobs = 4),param_grid,cv=5, n_jobs = -1)
#     grid.fit(X_train,y_train)
#     print(grid.best_params_)
#     predictions = grid.predict(X_test)
#     scores['RandomForest'] = accuracy_score(y_test, predictions)

    return scores

In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
%matplotlib inline
import json

from os import listdir
from os.path import isfile, join
mypath = './tests'
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]

#files = ['diabetes_mean.csv']

scores = {}
bestScores = {}

for i in files:
    data1 = pd.read_csv('tests/' + i)
    
    scores[i + ' bezScalera'] = test_classifiers(data1)
    scores[i + ' standardScaller'] = test_classifiers(data1, withScaller=True)
    scores[i + ' minMaxScaller']  = test_classifiers(data1, withMinMaxScaller=True)

for i in files:
    for j in [' bezScalera', ' standardScaller', ' minMaxScaller']:
    #for j in [' standardScaller']:
        
        tempScores = scores[i + j]
    
        print('Test dla pliku: ' + i)
        print('\n')
        print(j)
        print(json.dumps(tempScores, indent=1))
        print('Najlepszy klasyfikator w zbiorze: ')
        best = max(tempScores, key=tempScores.get)
        print(best + ' dopasowanie: ' + str(tempScores[best]))
        bestScores[i + j] = tempScores[best]
        print('\n')
        
print('Najlepszy wynik: ')
best = max(bestScores, key=bestScores.get)
print(best + ' dopasowanie: ' + str(bestScores[best]))


Test dla pliku: diabetes.csv


 bezScalera
{
 "SVC": 0.645021645021645,
 "KNN": 0.7662337662337663,
 "DecTree": 0.7662337662337663,
 "RandomForest": 0.8181818181818182,
 "LogisticReg": 0.8138528138528138,
 "Gaussian": 0.7835497835497836,
 "AdaBoost": 0.7748917748917749
}
Najlepszy klasyfikator w zbiorze: 
RandomForest dopasowanie: 0.8181818181818182


Test dla pliku: diabetes.csv


 standardScaller
{
 "SVC": 0.645021645021645,
 "KNN": 0.7662337662337663,
 "DecTree": 0.7748917748917749,
 "RandomForest": 0.8181818181818182,
 "LogisticReg": 0.8138528138528138,
 "Gaussian": 0.7835497835497836,
 "AdaBoost": 0.7748917748917749
}
Najlepszy klasyfikator w zbiorze: 
RandomForest dopasowanie: 0.8181818181818182


Test dla pliku: diabetes.csv


 minMaxScaller
{
 "SVC": 0.645021645021645,
 "KNN": 0.7662337662337663,
 "DecTree": 0.7575757575757576,
 "RandomForest": 0.8181818181818182,
 "LogisticReg": 0.8138528138528138,
 "Gaussian": 0.7835497835497836,
 "AdaBoost": 0.7748917748917749
}
Najlepszy kl