In [12]:
import warnings
warnings.filterwarnings('ignore')

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def classify(clf, data, withMinMaxScaller = False, withScaller = False):
    scaled_features = data.drop('Outcome', axis=1)
    
    # min max scaller

    if withMinMaxScaller:
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        scaler.fit(data.drop('Outcome', axis=1))
        scaled_features = scaler.transform(data.drop('Outcome', axis=1))

    # standard scaller

    if withScaller:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        scaler.fit(data.drop('Outcome', axis=1))
        scaled_features = scaler.transform(data.drop('Outcome', axis=1))

    #cross validation

    X = scaled_features
    y = data['Outcome']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=121)

    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return accuracy_score(y_test, predictions)

def test_classifiers(data, withScaller=False, withMinMaxScaller=False):
    scores = {}
        
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import AdaBoostClassifier

    classifiers = [
        SVC(gamma = 'auto'),
        KNeighborsClassifier(n_neighbors=5),
        DecisionTreeClassifier(),
        LogisticRegression(solver='liblinear'),
        GaussianNB(),
        AdaBoostClassifier(),        
    ]
    classNames = [
        'SVC',
        'KNN',
        'DecisionTree',
        'RandomForest',
        'LogisticRegression',
        'GaussianNaive',
        'AdaBoost',
    ]
    
    for name, classif in zip(classNames, classifiers):
        scores[name] = classify(classif, data, withMinMaxScaller, withScaller)
    
    # grid search for random forest
    
#     from sklearn.model_selection import GridSearchCV
#     from sklearn.ensemble import RandomForestClassifier
    
#     scaled_features = data.drop('Outcome', axis=1)

#     #cross validation

#     X = scaled_features
#     y = data['Outcome']
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.3, random_state=121)

#     param_grid = {"max_depth": [1, 2, 3, 5, 10, 20, None],
#           "max_features": [1, 2, 3, 4, 5, 6, 7],
#           "min_samples_split": [2, 3, 7, 10, 20],
#           "bootstrap": [True, False],
#           "criterion": ["gini", "entropy"]}
#     grid = GridSearchCV(RandomForestClassifier(n_estimators=100, n_jobs = 4),param_grid,cv=5, n_jobs = -1)
#     grid.fit(X_train,y_train)
#     print(grid.best_params_)
#     predictions = grid.predict(X_test)
#     scores['RandomForest'] = accuracy_score(y_test, predictions)

    return scores

In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
%matplotlib inline
import json

from os import listdir
from os.path import isfile, join
mypath = './tests'
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]

#files = ['diabetes_mean.csv']

scores = {}
bestScores = {}

for i in files:
    data1 = pd.read_csv('tests/' + i)
    
    scores[i + ' bezScalera'] = test_classifiers(data1)
    scores[i + ' standardScaller'] = test_classifiers(data1, withScaller=True)
    scores[i + ' minMaxScaller']  = test_classifiers(data1, withMinMaxScaller=True)

for i in files:
    for j in [' bezScalera', ' standardScaller', ' minMaxScaller']:
    #for j in [' standardScaller']:
        
        tempScores = scores[i + j]
    
        print('Test dla pliku: ' + i)
        print('\n')
        print(j)
        print(json.dumps(tempScores, indent=1))
        print('Najlepszy klasyfikator w zbiorze: ')
        best = max(tempScores, key=tempScores.get)
        print(best + ' dopasowanie: ' + str(tempScores[best]))
        bestScores[i + j] = tempScores[best]
        print('\n')
        
print('Najlepszy wynik: ')
best = max(bestScores, key=bestScores.get)
print(best + ' dopasowanie: ' + str(bestScores[best]))


Test dla pliku: diabetes_average.csv


 bezScalera
{
 "SVC": 0.645021645021645,
 "KNN": 0.7532467532467533,
 "DecisionTree": 0.7619047619047619,
 "RandomForest": 0.7965367965367965,
 "LogisticRegression": 0.7619047619047619,
 "GaussianNaive": 0.7748917748917749
}
Najlepszy klasyfikator w zbiorze: 
RandomForest dopasowanie: 0.7965367965367965


Test dla pliku: diabetes_average.csv


 standardScaller
{
 "SVC": 0.7792207792207793,
 "KNN": 0.7965367965367965,
 "DecisionTree": 0.7532467532467533,
 "RandomForest": 0.7965367965367965,
 "LogisticRegression": 0.7619047619047619,
 "GaussianNaive": 0.7748917748917749
}
Najlepszy klasyfikator w zbiorze: 
KNN dopasowanie: 0.7965367965367965


Test dla pliku: diabetes_average.csv


 minMaxScaller
{
 "SVC": 0.8051948051948052,
 "KNN": 0.7965367965367965,
 "DecisionTree": 0.7272727272727273,
 "RandomForest": 0.8095238095238095,
 "LogisticRegression": 0.7619047619047619,
 "GaussianNaive": 0.7748917748917749
}
Najlepszy klasyfikator w zbiorze: 
RandomFo