In [None]:
#1: Verify imports
from sklearn.datasets import fetch_openml
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

from sklearn.neural_network import MLPClassifier
import warnings
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm

import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
#allow plots to appear within the notebook:
%matplotlib inline

print("All imports are good")

In [None]:
#2: Get the data
# Each dataset is fetched from OpenML by name and split 80/20 into train and
# test partitions with a fixed seed so the split is repeatable.
# NOTE(review): no dataset version is passed to fetch_openml, so the version
# that gets downloaded is left to the library default - consider pinning one.

# Dataset 1: "phoneme"
dataset1 = fetch_openml(name='phoneme')
X_ds1, y_ds1 = dataset1.data, dataset1.target  # feature matrix, label vector
X_Train_ds1, X_Test_ds1, y_Train_ds1, y_Test_ds1 = train_test_split(
    X_ds1, y_ds1, test_size=0.2, random_state=0
)

# Dataset 2: "credit-g"
dataset2 = fetch_openml(name='credit-g')
X_ds2, y_ds2 = dataset2.data, dataset2.target  # feature matrix, label vector
X_Train_ds2, X_Test_ds2, y_Train_ds2, y_Test_ds2 = train_test_split(
    X_ds2, y_ds2, test_size=0.2, random_state=0
)

print("Data gotten")

In [None]:
#3 Explore Dataset 1
#3.1 Decision Trees
# Sweep the number of cross-validation folds (2..10) for an unpruned decision
# tree on the dataset 1 training split. The fold count with the highest mean
# score is kept in cv_ds1 and reused by the later dataset 1 sweeps.
# random_state is pinned (as for the other seeded calls in this notebook) so
# that re-running the cell reproduces the same scores.
dtc = tree.DecisionTreeClassifier(random_state=0)
scores = []
cross_val_sizes = range(2,11)
cv_ds1 = 0
best_score = 0
for i in cross_val_sizes:
    score = cross_val_score(dtc, X_Train_ds1, y_Train_ds1, cv=i).mean()
    print(str(i) + " : " + str(score))
    scores.append(score)
    # Strict '>' keeps the smallest fold count when several tie for the best.
    if score > best_score:
        best_score = score
        cv_ds1 = i
print("Decision Trees dataset 1 best score = " + str(best_score) + " at cv = " + str(cv_ds1))

# Plot on a dedicated figure so this curve is not drawn on top of other plots
# produced in the same cell (they have different x-axes and labels).
fig, ax = plt.subplots()
ax.plot(cross_val_sizes, scores)
ax.set_xlabel("Cross validation size - dataset 1 (phoneme)")
ax.set_ylabel("Cross validation score")
plt.show()

#3.1.1 Decision Tree with pre-pruning
# Sweep max_depth over 2..20 using cv_ds1 folds on the dataset 1 training
# split and remember the depth with the highest mean cross-validation score
# in max_depth_ds1.
# NOTE(review): the section title and the printed summary say "Decision Tree",
# but the estimator fitted here is RandomForestClassifier - confirm which one
# is intended before relying on these numbers.
max_depth_sizes = range(2,21)
scores = []
max_depth_ds1 = 0
best_score = 0
for md in max_depth_sizes:
    dtc = RandomForestClassifier(random_state=0, max_depth=md)
    score = cross_val_score(dtc, X_Train_ds1, y_Train_ds1, cv=cv_ds1).mean()
    print(str(md) + " : " + str(score))
    scores.append(score)
    # Strict '>' keeps the shallowest depth when several tie for the best.
    if score > best_score:
        best_score = score
        max_depth_ds1 = md
print("Decision Trees dataset 1 with pre-pruning best score = " + str(best_score) \
      + " at max_depth = " + str(max_depth_ds1))

# Plot on a dedicated figure so this curve is not drawn on top of other plots
# produced in the same cell (they have different x-axes and labels).
fig, ax = plt.subplots()
ax.plot(max_depth_sizes, scores)
ax.set_xlabel("Max depth size on RandomForestClassifier")
ax.set_ylabel("Score")
plt.show()

#3.1.2 Decision Tree with post-pruning 
# Sweep ccp_alpha over the integers 0..10 using cv_ds1 folds on the dataset 1
# training split and remember the value with the highest mean
# cross-validation score in ccp_alpha_ds1.
# NOTE(review): the section title and the printed summary say "Decision Tree",
# but the estimator fitted here is RandomForestClassifier - confirm which one
# is intended.
# NOTE(review): integer alphas of 1 and above are probably far larger than any
# effective alpha in these trees, so those runs likely prune everything back
# to the root and score identically - consider a finer grid of small values.
ccp_alpha_sizes = range(0,11)
scores = []
ccp_alpha_ds1 = 0
best_score = 0
for ccpa in ccp_alpha_sizes:
    dtc = RandomForestClassifier(random_state=0, ccp_alpha=ccpa)
    score = cross_val_score(dtc, X_Train_ds1, y_Train_ds1, cv=cv_ds1).mean()
    print(str(ccpa) + " : " + str(score))
    scores.append(score)
    # Strict '>' keeps the smallest alpha when several tie for the best.
    if score > best_score:
        best_score = score
        ccp_alpha_ds1 = ccpa
print("Decision Trees dataset 1 with post-pruning best score = " + str(best_score) \
      + " at ccp_alpha = " + str(ccp_alpha_ds1))

# Plot on a dedicated figure so this curve is not drawn on top of other plots
# produced in the same cell (they have different x-axes and labels).
fig, ax = plt.subplots()
ax.plot(ccp_alpha_sizes, scores)
ax.set_xlabel("CCP alpha size on RandomForestClassifier")
ax.set_ylabel("Score")
plt.show()

#3.2 Neural Networks
#3.3 Boosting
#3.4 Support Vector Machines
#3.5 K-Nearest Neighbors

#4 Explore Dataset 2
# Sweep the number of cross-validation folds (2..10) for an unpruned decision
# tree on the dataset 2 training split. The fold count with the highest mean
# score is kept in cv_ds2.
# random_state is pinned (as for the other seeded calls in this notebook) so
# that re-running the cell reproduces the same scores.
dtc = tree.DecisionTreeClassifier(random_state=0)
scores = []
cross_val_sizes = range(2,11)
cv_ds2 = 0
best_score = 0
for i in cross_val_sizes:
    score = cross_val_score(dtc, X_Train_ds2, y_Train_ds2, cv=i).mean()
    print(str(i) + " : " + str(score))
    scores.append(score)
    # Strict '>' keeps the smallest fold count when several tie for the best.
    if score > best_score:
        best_score = score
        cv_ds2 = i
print("Decision Trees dataset 2 best score = " + str(best_score) + " at cv = " + str(cv_ds2))

# Plot on a dedicated figure so this curve is not drawn on top of other plots
# produced in the same cell. The x-axis label names dataset 2 (credit-g),
# which is the data actually scored above.
fig, ax = plt.subplots()
ax.plot(cross_val_sizes, scores)
ax.set_xlabel("Cross validation size - dataset 2 (credit-g)")
ax.set_ylabel("Cross validation score")
plt.show()

#4.1 Decision Trees
#4.2 Neural Networks
#4.3 Boosting
#4.4 Support Vector Machines
#4.5 K-Nearest Neighbors

