# Machine Learing, Assignment 5
Authors: Jianjun Du, Bo Huang

In [70]:
import numpy as np
import pandas as pd
# model selection related
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
# different models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.naive_bayes import GaussianNB
# plot 
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

In [2]:
def clean(data):
    # remove all the rows with nulls
    data=data.dropna()
        
    # transfer all the classes labels to numbers, but not create extra dummy variables
    classes=data.iloc[:,-1]
    y=pd.Categorical(classes).codes
    y=pd.DataFrame(y)
    y.columns=['class']
        
    # process feature variables
    X=data.iloc[:,:-1]
        
    # process the numeric variables
    numeric=X.select_dtypes(include=['int64','float64'])
    normalized=(numeric-numeric.mean())/numeric.std()
    # process the categorical variables
    categorical=X.select_dtypes(include=['object'])
    if categorical.shape[1]!=0:
        dummies=pd.get_dummies(categorical,drop_first=True)
            
    # combine the processed variables
    if categorical.shape[1]!=0 and numeric.shape[1]!=0:
        X=pd.concat([normalized,dummies],axis=1)
    elif categorical.shape[1]==0:
        X=normalized
    else:
        X=dummies
            
    return X,y

In [22]:
def model_parameters(X_train,y_train,model,tuned_parameters):
    
    scores = ['accuracy', 'precision_micro']

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(model, tuned_parameters, cv=5,
                           scoring='%s' % score)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

In [17]:
data=pd.read_csv('iris.csv')
X,y=clean(data)
y=y.values.flatten()
X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.2, random_state=42)
    

In [23]:
# decision tree

# Set the parameters by cross-validation
tuned_parameters = [{'criterion': ['gini'], 'min_samples_leaf':[1,3,5]},
                    {'criterion': ['entropy'], 'min_samples_leaf': [1,3,5]}]

model_parameters(X_train,y_train,DecisionTreeClassifier(),tuned_parameters)

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'criterion': 'entropy', 'min_samples_leaf': 3}

Grid scores on development set:

0.933 (+/-0.100) for {'criterion': 'gini', 'min_samples_leaf': 1}
0.950 (+/-0.134) for {'criterion': 'gini', 'min_samples_leaf': 3}
0.950 (+/-0.134) for {'criterion': 'gini', 'min_samples_leaf': 5}
0.933 (+/-0.114) for {'criterion': 'entropy', 'min_samples_leaf': 1}
0.958 (+/-0.129) for {'criterion': 'entropy', 'min_samples_leaf': 3}
0.950 (+/-0.134) for {'criterion': 'entropy', 'min_samples_leaf': 5}

# Tuning hyper-parameters for precision_micro

Best parameters set found on development set:

{'criterion': 'gini', 'min_samples_leaf': 3}

Grid scores on development set:

0.933 (+/-0.100) for {'criterion': 'gini', 'min_samples_leaf': 1}
0.958 (+/-0.129) for {'criterion': 'gini', 'min_samples_leaf': 3}
0.950 (+/-0.134) for {'criterion': 'gini', 'min_samples_leaf': 5}
0.933 (+/-0.114) for {'criterion': 'entropy', 'min_sam

In [26]:
# Perceptron

tuned_parameters=[{'penalty':['l1','l2'],'alpha':[0.0001,0.0005,0.001,0.002,0.005,0.01]}]
model_parameters(X_train,y_train,Perceptron(),tuned_parameters)

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'alpha': 0.002, 'penalty': 'l1'}

Grid scores on development set:

0.800 (+/-0.129) for {'alpha': 0.0001, 'penalty': 'l1'}
0.825 (+/-0.163) for {'alpha': 0.0001, 'penalty': 'l2'}
0.858 (+/-0.237) for {'alpha': 0.0005, 'penalty': 'l1'}
0.858 (+/-0.189) for {'alpha': 0.0005, 'penalty': 'l2'}
0.867 (+/-0.166) for {'alpha': 0.001, 'penalty': 'l1'}
0.842 (+/-0.203) for {'alpha': 0.001, 'penalty': 'l2'}
0.883 (+/-0.120) for {'alpha': 0.002, 'penalty': 'l1'}
0.867 (+/-0.203) for {'alpha': 0.002, 'penalty': 'l2'}
0.858 (+/-0.237) for {'alpha': 0.005, 'penalty': 'l1'}
0.842 (+/-0.145) for {'alpha': 0.005, 'penalty': 'l2'}
0.850 (+/-0.155) for {'alpha': 0.01, 'penalty': 'l1'}
0.800 (+/-0.237) for {'alpha': 0.01, 'penalty': 'l2'}

# Tuning hyper-parameters for precision_micro

Best parameters set found on development set:

{'alpha': 0.002, 'penalty': 'l1'}

Grid scores on development set:

0.800 (+/-0.129) for

In [39]:
# Neural network

tuned_parameters=[{'hidden_layer_sizes':[(100,),(10,20,5)],\
                   'activation':['relu','logistic'],'tol':[0.001],'max_iter':[1000]}]

model_parameters(X_train,y_train,MLPClassifier(),tuned_parameters)

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'activation': 'relu', 'hidden_layer_sizes': (100,), 'max_iter': 1000, 'tol': 0.001}

Grid scores on development set:

0.950 (+/-0.122) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'max_iter': 1000, 'tol': 0.001}
0.792 (+/-0.470) for {'activation': 'relu', 'hidden_layer_sizes': (10, 20, 5), 'max_iter': 1000, 'tol': 0.001}
0.892 (+/-0.088) for {'activation': 'logistic', 'hidden_layer_sizes': (100,), 'max_iter': 1000, 'tol': 0.001}
0.342 (+/-0.022) for {'activation': 'logistic', 'hidden_layer_sizes': (10, 20, 5), 'max_iter': 1000, 'tol': 0.001}

# Tuning hyper-parameters for precision_micro

Best parameters set found on development set:

{'activation': 'relu', 'hidden_layer_sizes': (100,), 'max_iter': 1000, 'tol': 0.001}

Grid scores on development set:

0.942 (+/-0.125) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'max_iter': 1000, 'tol': 0.001}
0.742 (+/-0.455) for {'activatio

In [40]:
# SVM

tuned_parameters=[{'C':[0.01,0.1,1,10,50],'kernel':['rbf','linear','poly'],'gamma':['auto',0.05,0.1,0.2]}]

model_parameters(X_train,y_train,SVC(),tuned_parameters)

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}

Grid scores on development set:

0.408 (+/-0.279) for {'C': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}
0.850 (+/-0.153) for {'C': 0.01, 'gamma': 'auto', 'kernel': 'linear'}
0.492 (+/-0.170) for {'C': 0.01, 'gamma': 'auto', 'kernel': 'poly'}
0.408 (+/-0.279) for {'C': 0.01, 'gamma': 0.05, 'kernel': 'rbf'}
0.850 (+/-0.153) for {'C': 0.01, 'gamma': 0.05, 'kernel': 'linear'}
0.400 (+/-0.246) for {'C': 0.01, 'gamma': 0.05, 'kernel': 'poly'}
0.408 (+/-0.279) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
0.850 (+/-0.153) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'linear'}
0.400 (+/-0.246) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'poly'}
0.408 (+/-0.279) for {'C': 0.01, 'gamma': 0.2, 'kernel': 'rbf'}
0.850 (+/-0.153) for {'C': 0.01, 'gamma': 0.2, 'kernel': 'linear'}
0.400 (+/-0.246) for {'C': 0.01, 'gamma': 0.2, 'kernel': 'poly'}
0.867 (+/-0.109) for {'C': 0.1, 'gamma':

In [45]:
# Naive Bayes

tuned_parameters=[{'priors':[(0.3,0.5,0.2),None]}]

model_parameters(X_train,y_train,GaussianNB(),tuned_parameters)

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'priors': (0.3, 0.5, 0.2)}

Grid scores on development set:

0.950 (+/-0.122) for {'priors': (0.3, 0.5, 0.2)}
0.942 (+/-0.125) for {'priors': None}

# Tuning hyper-parameters for precision_micro

Best parameters set found on development set:

{'priors': (0.3, 0.5, 0.2)}

Grid scores on development set:

0.950 (+/-0.122) for {'priors': (0.3, 0.5, 0.2)}
0.942 (+/-0.125) for {'priors': None}



In [52]:
# Logistic Regression

tuned_parameters=[{'penalty':['l1','l2'],'C':[200,150,100,20,1,0.5]}]

model_parameters(X_train,y_train,LogisticRegression(),tuned_parameters)

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'C': 100, 'penalty': 'l2'}

Grid scores on development set:

0.950 (+/-0.122) for {'C': 200, 'penalty': 'l1'}
0.950 (+/-0.122) for {'C': 200, 'penalty': 'l2'}
0.950 (+/-0.122) for {'C': 150, 'penalty': 'l1'}
0.950 (+/-0.122) for {'C': 150, 'penalty': 'l2'}
0.950 (+/-0.122) for {'C': 100, 'penalty': 'l1'}
0.958 (+/-0.091) for {'C': 100, 'penalty': 'l2'}
0.950 (+/-0.122) for {'C': 20, 'penalty': 'l1'}
0.942 (+/-0.155) for {'C': 20, 'penalty': 'l2'}
0.900 (+/-0.184) for {'C': 1, 'penalty': 'l1'}
0.875 (+/-0.146) for {'C': 1, 'penalty': 'l2'}
0.900 (+/-0.151) for {'C': 0.5, 'penalty': 'l1'}
0.833 (+/-0.189) for {'C': 0.5, 'penalty': 'l2'}

# Tuning hyper-parameters for precision_micro

Best parameters set found on development set:

{'C': 100, 'penalty': 'l2'}

Grid scores on development set:

0.950 (+/-0.122) for {'C': 200, 'penalty': 'l1'}
0.950 (+/-0.122) for {'C': 200, 'penalty': 'l2'}
0.950 (+/-0.12

In [53]:
# K nearest Neighbors

tuned_parameters=[{'n_neighbors':[3]}]

model_parameters(X_train,y_train,KNeighborsClassifier(),tuned_parameters)

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'n_neighbors': 3}

Grid scores on development set:

0.942 (+/-0.124) for {'n_neighbors': 3}

# Tuning hyper-parameters for precision_micro

Best parameters set found on development set:

{'n_neighbors': 3}

Grid scores on development set:

0.942 (+/-0.124) for {'n_neighbors': 3}



In [54]:
# Bagging

tuned_parameters=[{'base_estimator':[None,KNeighborsClassifier(),LogisticRegression()]}]

model_parameters(X_train,y_train,BaggingClassifier(),tuned_parameters)

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'base_estimator': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')}

Grid scores on development set:

0.933 (+/-0.100) for {'base_estimator': None}
0.942 (+/-0.125) for {'base_estimator': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')}
0.867 (+/-0.171) for {'base_estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)}

# Tuning hyper-parameters for precision_micro

Best parameters set found on development set:

{'base_estimator': KNeighborsClassifier(alg

In [55]:
# Random forest

tuned_parameters=[{'n_estimators':[5,10,20,50],'max_features':['auto','sqrt','log2'],'min_samples_leaf':[1,2,5,10]}]

model_parameters(X_train,y_train,RandomForestClassifier(),tuned_parameters)

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'max_features': 'auto', 'min_samples_leaf': 10, 'n_estimators': 20}

Grid scores on development set:

0.933 (+/-0.114) for {'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 5}
0.942 (+/-0.113) for {'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 10}
0.942 (+/-0.113) for {'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 20}
0.942 (+/-0.113) for {'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 50}
0.942 (+/-0.155) for {'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 5}
0.958 (+/-0.091) for {'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 10}
0.942 (+/-0.086) for {'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 20}
0.958 (+/-0.091) for {'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 50}
0.958 (+/-0.129) for {'max_features': 'auto', 'min_samples_leaf': 5, 'n_estimators': 5}
0.958 (+/-0.

In [59]:
# Adaboost

tuned_parameters=[{'base_estimator':[DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3),RandomForestClassifier(min_samples_leaf=5, n_estimators=20)]}]

model_parameters(X_train,y_train,AdaBoostClassifier(),tuned_parameters)

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')}

Grid scores on development set:

0.942 (+/-0.067) for {'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')}
0.925 (+/-0.144) for {'base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
  

In [61]:
# Gradient Boost
tuned_parameters=[{'n_estimators':[5,10,20,50],'max_features':['auto','sqrt','log2'],'min_samples_leaf':[1,2,5,10]}]

model_parameters(X_train,y_train,GradientBoostingClassifier(),tuned_parameters)


# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'max_features': 'auto', 'min_samples_leaf': 5, 'n_estimators': 10}

Grid scores on development set:

0.950 (+/-0.123) for {'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 5}
0.942 (+/-0.113) for {'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 10}
0.942 (+/-0.113) for {'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 20}
0.933 (+/-0.100) for {'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 50}
0.950 (+/-0.134) for {'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 5}
0.958 (+/-0.129) for {'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 10}
0.950 (+/-0.122) for {'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 20}
0.933 (+/-0.100) for {'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 50}
0.950 (+/-0.134) for {'max_features': 'auto', 'min_samples_leaf': 5, 'n_estimators': 5}
0.967 (+/-0.0

In [75]:
# test

clf1=DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)
clf2=Perceptron(alpha=0.002, penalty='l1')
clf3=MLPClassifier(activation='relu', hidden_layer_sizes=(100,), max_iter=1000)
clf4=SVC(C=1, gamma='auto',kernel='rbf')
clf5=GaussianNB(priors=(0.3, 0.5, 0.2))
clf6=LogisticRegression(C=100, penalty='l2')
clf7=KNeighborsClassifier(n_neighbors=3)
clf8=BaggingClassifier(base_estimator=KNeighborsClassifier())
clf9=RandomForestClassifier(max_features='auto', min_samples_leaf=10, n_estimators=20)
clf10=AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
clf11=GradientBoostingClassifier(max_features='auto', min_samples_leaf=5, n_estimators=10)

clf=[]
clf.append(clf1)
clf.append(clf2)
clf.append(clf3)
clf.append(clf4)
clf.append(clf5)
clf.append(clf6)
clf.append(clf7)
clf.append(clf8)
clf.append(clf9)
clf.append(clf10)
clf.append(clf11)

print('f1_score','precision','accuracy')
for model in clf:
    model.fit(X_train,y_train)
    y_predicted=model.predict(X_test)
    print(f1_score(y_test,y_predicted,average='macro'),precision_score(y_test,y_predicted,average='macro'),\
                    accuracy_score(y_test,y_predicted))


f1_score precision accuracy
1.0 1.0 1.0
0.933333333333 0.939393939394 0.933333333333
1.0 1.0 1.0
1.0 1.0 1.0
1.0 1.0 1.0
1.0 1.0 1.0
1.0 1.0 1.0
1.0 1.0 1.0
1.0 1.0 1.0
1.0 1.0 1.0
1.0 1.0 1.0
