In [37]:
import warnings
# Install a blanket 'ignore' filter so no warning of any category is shown
# by this kernel. Note that this also hides warnings that may matter while
# tuning models; comment it out when debugging.
warnings.filterwarnings('ignore')

In [121]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV
import numpy as np

# classifiers
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
    
def scaleData(data, withMinMaxScaller = False, withScaller = False):
    """Return the feature columns of ``data``, optionally rescaled.

    The 'Outcome' column is dropped and every remaining column is treated
    as a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains an 'Outcome' column.
    withMinMaxScaller : bool
        When true, rescale the features with ``MinMaxScaler``.
    withScaller : bool
        When true, rescale the features with ``StandardScaler``. If both
        flags are set, this result replaces the min-max one.

    Returns
    -------
    pandas.DataFrame or array
        The untouched feature frame when no flag is set, otherwise the
        output of the scaler's ``transform`` call.
    """
    # Feature matrix: everything except the label column.
    features = data.drop('Outcome', axis=1)
    result = features

    # Min-max scaling.
    if withMinMaxScaller:
        from sklearn.preprocessing import MinMaxScaler
        min_max = MinMaxScaler()
        min_max.fit(features)
        result = min_max.transform(features)

    # Standardisation; evaluated last, so it wins when both flags are set.
    if withScaller:
        from sklearn.preprocessing import StandardScaler
        standard = StandardScaler()
        standard.fit(features)
        result = standard.transform(features)

    return result

def gridSearch(clf, params_grid, X, y):
    """Tune ``clf`` with a grid search and score it on a hold-out split.

    Parameters
    ----------
    clf : estimator
        Classifier handed to ``GridSearchCV``.
    params_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    X, y
        Features and labels; 30% of the rows are held out for the final
        score and the search only sees the remaining 70%.

    Returns
    -------
    dict
        ``'bestParams'``: the search's ``best_params_``;
        ``'score'``: accuracy of the search's predictions on the held-out rows.
    """
    # Fixed seed so every classifier is evaluated on the same split.
    split = train_test_split(X, y, test_size=0.3, random_state=121)
    features_train, features_test, labels_train, labels_test = split

    search = GridSearchCV(
        clf,
        params_grid,
        cv=5,
        n_jobs=-1,
        scoring='accuracy',
    )
    search.fit(features_train, labels_train)

    holdout_predictions = search.predict(features_test)
    holdout_accuracy = accuracy_score(labels_test, holdout_predictions)

    return {'bestParams': search.best_params_, 'score': holdout_accuracy}

def classify(clf, data, params_grid = None, withMinMaxScaller = False, withScaller = False):
    """Scale the features of ``data`` and grid-search ``clf`` on them.

    Parameters
    ----------
    clf : estimator
        Classifier to tune.
    data : pandas.DataFrame
        Table with an 'Outcome' label column; all other columns are features.
    params_grid : dict, optional
        Parameter grid for the search. ``None`` (the default) means an
        empty grid, i.e. the classifier is evaluated with its own settings.
    withMinMaxScaller, withScaller : bool
        Scaling switches, forwarded unchanged to ``scaleData``.

    Returns
    -------
    dict
        Whatever ``gridSearch`` returns (best parameters and hold-out score).

    Notes
    -----
    ``scaleData`` fits its scaler on every row of ``data`` and the
    train/test split happens afterwards inside ``gridSearch``, so the
    held-out rows contribute to the scaling statistics.
    """
    # Use None as the "no grid" marker instead of a shared mutable default.
    if params_grid is None:
        params_grid = {}

    # Feature matrix (optionally scaled) and label vector.
    X = scaleData(data, withMinMaxScaller, withScaller)
    y = data['Outcome']

    return gridSearch(clf, params_grid, X, y)
    
    ###Cross Validation
    #     predictions = cross_val_predict(clf, X,y, cv=k)

    #     # take mean of scores and multiply by 100 to get percents

    #     return (cross_val_score(clf, X, y, scoring='accuracy', cv = k)).mean() * 100

    ### Without cross-validation

    #     X_train, X_test, y_train, y_test = train_test_split(
    #         X, y, test_size=0.3, random_state=121)

    #     clf.fit(X_train, y_train)
    #     predictions = clf.predict(X_test)
    #     return accuracy_score(y_test, predictions)

def test_classifiers(data, withScaller=False, withMinMaxScaller=False):
    """Grid-search every candidate classifier on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset forwarded to ``classify`` (expects an 'Outcome' column).
    withScaller : bool
        Forwarded to ``classify``; enables standard scaling.
    withMinMaxScaller : bool
        Forwarded to ``classify``; enables min-max scaling.

    Returns
    -------
    dict
        Maps each classifier name to the result ``classify`` produced for it.
    """
    # Seed for the estimators that draw random numbers, so repeated runs
    # give identical scores. Same value as the split seed in gridSearch.
    seed = 121

    # One entry per candidate: (display name, estimator, parameter grid).
    # Keeping the three together prevents names, estimators and grids from
    # drifting out of sync.
    candidates = [
        ('SVC', SVC(gamma = 'auto'), {
            'kernel': ['linear', 'rbf'],
            'gamma': ['auto', 0.01, 0.1, 0.5, 1, 2, 10],
            'C': [0.001, 0.01, 0.1, 1, 10],
        }),
        ('KNN', KNeighborsClassifier(n_neighbors=5), {
            'n_neighbors': list(range(1,5)),
            'weights': ["uniform", "distance"],
        }),
        ('DecisionTree', DecisionTreeClassifier(random_state=seed), {
            'max_depth': np.arange(3, 10),
        }),
        ('RandomForest', RandomForestClassifier(random_state=seed), {
            "max_depth": [2, 3, None],
            "max_features": [1, 2, 6, 7],
            "min_samples_split": [2, 3],
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"],
            "n_estimators": [10, 100],
        }),
        ('LogisticRegression', LogisticRegression(solver='liblinear', random_state=seed), {
            'C': np.logspace(-3,3,7),
            'penalty': ['l1','l2'],
        }),
        # The last two are evaluated with their default settings only;
        # an empty grid means nothing is tuned.
        ('GaussianNaive', GaussianNB(), {}),
        ('AdaBoost', AdaBoostClassifier(random_state=seed), {}),
    ]

    scores = {}
    for name, classif, grid in candidates:
        print('Calculate for: ' + name)
        scores[name] = classify(classif, data, grid, withMinMaxScaller, withScaller)

    return scores

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
%matplotlib inline
import json

from os import listdir
from os.path import isfile, join

# Collect every regular file in the tests directory (sub-directories are
# skipped). listdir returns names in arbitrary order, so sort them to make
# the processing order, and therefore the printed report, reproducible.

mypath = './tests'
files = sorted(f for f in listdir(mypath) if isfile(join(mypath, f)))

# For a quick single-file run, override the list instead:
#files = ['diabetes_mean.csv']

# Per-file results, and the best classifier per (file, scaler) pair.
scores = []
bestScores = []

def getDiggedData(diction, dataToExtract):
    """Pull one field out of every entry of a nested mapping.

    Given ``{name: {field: value, ...}, ...}`` return ``{name: value}`` for
    the field named by ``dataToExtract``.
    """
    return {key: diction[key][dataToExtract] for key in diction}

   
# Evaluate every classifier on every dataset file under each scaling mode.
# Scaler labels: 'bez' (Polish for "without") = no scaling,
# 'std' = StandardScaler, 'minMax' = MinMaxScaler.
for fileName in files:
    # Build the path from mypath so it always matches the directory listed above.
    data1 = pd.read_csv(join(mypath, fileName))
    scoresPerFile = []

    for scale in ['bez', 'std', 'minMax']:
        # Pass both flags by keyword; previously only the 'std' flag was
        # ever set, so the 'minMax' run silently repeated the unscaled one.
        result = test_classifiers(
            data1,
            withScaller=(scale == 'std'),
            withMinMaxScaller=(scale == 'minMax'),
        )
        scoresPerFile.append({
            'scaler': scale,
            'clf': result,
        })

    scores.append({
        'file': fileName,
        'scores': scoresPerFile,
    })


def _toJsonable(value):
    """Convert numpy scalars to plain Python values for json.dumps."""
    # Best parameters chosen from numpy grids (e.g. np.arange) are numpy
    # scalars such as np.int64, which the json module refuses to encode.
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError('Object of type ' + type(value).__name__ + ' is not JSON serializable')


print(json.dumps(scores, indent=1, default=_toJsonable))

# Report the best classifier for every (file, scaler) combination.
# The loop variables deliberately do not reuse the name `scores`, so the
# result list stays intact after this cell has run.
for fileEntry in scores:
    print('Test dla pliku: ' + fileEntry['file'])

    for scalerEntry in fileEntry['scores']:
        print(' przy uzyciu scalera: ' + scalerEntry['scaler'] + '\n')
        print('Najlepszy klasyfikator w zbiorze: ')
        allClfScores = getDiggedData(scalerEntry['clf'], 'score')
        # Name of the classifier with the highest hold-out accuracy.
        best = max(allClfScores, key=allClfScores.get)
        print(best + ' dopasowanie: ' + str(allClfScores[best]) + '\n')
        bestScores.append({
            'file': fileEntry['file'],
            'scaler': scalerEntry['scaler'],
            'score': allClfScores[best],
            'params': scalerEntry['clf'][best]['bestParams'],
        })

print('Najlepszy wynik: ')
# default=None keeps the cell from crashing when no file was processed.
print(max(bestScores, key=lambda item: item['score'], default=None))

Calculate for: SVC
