# Load raw data from the dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import cross_val_score,validation_curve,ShuffleSplit,learning_curve
from sklearn.dummy import DummyClassifier
from sklearn import svm,tree
from sklearn.svm import LinearSVC,SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.metrics import confusion_matrix
import itertools
import os
from pylab import *

In [2]:
def LetsTraining(X,y, estimator, param=None, param_grid=None, scaler=None, n_splits=20):
    """
    Repeatedly train and evaluate 'estimator' on random 80/20 train/test splits.

    For each of the n_splits iterations the data is split with
    train_test_split (test_size=0.20, random_state = iteration index), the
    scaler is fitted on the training part only and then applied to the test
    part (so no test-set statistics leak into the normalization), a fresh
    estimator is fitted, and its train score, test score and confusion matrix
    are recorded. The mean and standard deviation of the scores are printed.
    ----------------------------------------------------------------------
    X: pd DataFrame
    y: pd DataFrame or Series

    estimator: classifier class (not an instance), like SVC, LogisticRegression,
        LinearSVC, etc. It is called with no arguments to build a new model
        for every split.

    param: dict of parameters passed to estimator.set_params, e.g. {'C': 1}.
        Default: None (estimator defaults are used).

    param_grid: currently unused; grid search is not performed. Kept so that
        existing callers passing it keep working.

    scaler: normalization object providing fit_transform/transform.
        Default: None, meaning a new MinMaxScaler is created for this call.
        A scaler passed in by the caller is refitted in place.

    n_splits: number of iterations to run and average, Default: 20
    ----------------------------------------------------------------------
    Output:
    clf : the classifier with the best test score over all splits
        (the earliest split wins ties)
    cache: dictionary including 'trainmean','trainstd','testmean','teststd',
        'confusion_matrix' (list with one matrix per split) and 'scaler'
        (fitted on the training part of the split that produced clf)

    Raises ValueError if n_splits is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1, got {}".format(n_splits))

    # Build the default scaler per call. A default instance in the signature
    # would be shared by every call, so a later call would silently refit the
    # scaler stored in the cache returned by an earlier call.
    if scaler is None:
        scaler = MinMaxScaler()

    scores = []
    trainscores = []
    cnf_matrixes = []
    best_clf = None
    best_score = None
    best_seed = None

    for i in range(n_splits):
        # The iteration index is the seed, so every split is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=i)

        # Fit the normalization on the training part only, then apply it to
        # the test part.
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Fresh, unfitted model for every split.
        clf = estimator()
        if param is not None:
            clf = clf.set_params(**param)
        clf.fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        trainscores.append(clf.score(X_train, y_train))
        cnf_matrixes.append(confusion_matrix(y_test, clf.predict(X_test)))

        # Strict '>' keeps the earliest split on ties.
        if best_score is None or test_score > best_score:
            best_clf = clf
            best_score = test_score
            best_seed = i

    # The scaler object was refitted on every split and now reflects the last
    # one. Refit it on the training part of the best split (reproducible via
    # its seed) so the returned classifier and scaler belong together.
    if best_seed != n_splits - 1:
        X_train, _, _, _ = train_test_split(X, y, test_size=0.20, random_state=best_seed)
        scaler.fit_transform(X_train)

    train_mean = np.mean(trainscores)
    train_std = np.std(trainscores)
    test_mean = np.mean(scores)
    test_std = np.std(scores)

    print("The training model is {} with parmameter {}\n".format(str(estimator),str(param)))
    print("The training score is: {:.2f} +- {:.2f}".format(train_mean,train_std))
    print("The test score is: {:.2f} +- {:.2f}\n".format(test_mean,test_std))

    cache = {
        'trainmean': train_mean,
        'trainstd': train_std,
        'testmean': test_mean,
        'teststd': test_std,
        'confusion_matrix': cnf_matrixes,
        'scaler': scaler,
    }
    return best_clf, cache
#param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}#X_cor_select
# L1-regularized logistic regression evaluated with the default scaler and
# number of splits. The solver is set explicitly because the default solver
# in recent scikit-learn releases ('lbfgs') does not support the 'l1' penalty
# and raises ValueError; 'liblinear' was the previous default and supports it.
LetsTraining(X,y,LogisticRegression,param={'C': 10,'penalty':'l1','solver':'liblinear'})