# Homework 1

In [3]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from IPython.display import Image  

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
import seaborn as sns

### Data Preprocessing

Question 1: Remove the rows with missing labels (`label`) and the rows with more than 7 missing features. Report the number of remaining rows. (2 marks)

In [4]:
# load data
# Read the raw customer table from a CSV path given relative to the working directory.
dat = pd.read_csv('data/customer_data.csv')
# The bare last expression makes the first rows the cell's displayed output.
dat.head()

Unnamed: 0,label,id,fea_1,fea_2,fea_3,fea_4,fea_5,fea_6,fea_7,fea_8,fea_9,fea_10,fea_11
0,0.0,59004779,4.0,1277.0,1.0,113000.0,2.0,8.0,-1.0,100.0,3.0,341759.0,207.17384
1,0.0,58990862,7.0,1298.0,1.0,110000.0,2.0,11.0,-1.0,101.0,5.0,72001.0,
2,1.0,58995168,7.0,1335.5,1.0,151000.0,2.0,11.0,5.0,110.0,3.0,60084.0,
3,0.0,54987320,7.0,,2.0,59000.0,2.0,11.0,5.0,108.0,4.0,450081.0,197.403141
4,0.0,59005995,6.0,1217.0,3.0,56000.0,2.0,6.0,-1.0,100.0,3.0,60091.0,


In [5]:
print('Original Rows: %s' % len(dat))

# Rows without a label are discarded first.
dat_remove_label = dat.dropna(subset=['label'])
print('after remove NA-label row: %s' % len(dat_remove_label))

# Skip the first two columns (presumably `label` and `id` -- confirm against the
# CSV header) so that only the feature columns are counted below.
dat_remove_7_missing = dat_remove_label.iloc[:, 2:]

# `dropna(thresh=k)` keeps a row only if it has at least k non-missing values.
# "More than 7 missing" must be dropped, so a row survives when it has at most
# 7 missing values, i.e. at least (number of columns - 7) present values.
max_missing_features = 7
min_present_features = dat_remove_7_missing.shape[1] - max_missing_features
dat_remove_7_missing = dat_remove_7_missing.dropna(thresh=min_present_features)
print('after drop more than 7 feature: %s' % len(dat_remove_7_missing))

Original Rows: 1124
after remove NA-label row: 1110
after drop more than 7 feature: 1095


Question 2: Remove features with more than 50% missing values. For the other features with missing values, fill the missing entries with the mean of the corresponding feature. Report the removed features (if any) and the standard deviation of the features that had missing values, after filling. (2 marks)

In [6]:
columns_before = dat_remove_7_missing.columns
print('original columns: %s' % len(columns_before))

# A column is kept only when its count of non-missing values reaches half the row count.
min_non_missing = 0.5 * len(dat_remove_7_missing)
dat_remove_50pct = dat_remove_7_missing.dropna(axis='columns', thresh=min_non_missing)

# Remaining gaps are filled column-wise with that column's mean.
column_means = dat_remove_50pct.mean()
dat_fill_mean = dat_remove_50pct.fillna(column_means)
print('after remove feature has 50pct  NA: %s' % len(dat_remove_50pct.columns))

# Report every column that was present before the filter but is absent afterwards.
for feature in columns_before:
    if feature in dat_fill_mean.columns:
        continue
    print('removed feature is %s' % feature)

# Displayed as the cell output: per-column standard deviation after filling.
dat_fill_mean.std()

original columns: 11
after remove feature has 50pct  NA: 10
removed feature is fea_11


fea_1          1.378574
fea_2         48.151339
fea_3          0.876765
fea_4      89256.523379
fea_5          0.260353
fea_6          2.676198
fea_7          2.970648
fea_8         11.977444
fea_9          0.857937
fea_10    152455.809399
dtype: float64

### Decision Trees

Question 3: Train Decision Tree model on train data for criterions = {’gini’, ’entropy’} and report the accuracies on the validation data. Select the best criterion and report the accuracy on the test data. (1 mark)

Question 4: Use the criterion selected above to train a Decision Tree model on the train data for min_samples_split = {2, 5, 10, 20} and report the accuracies on the validation data. Select the best parameter and report the accuracy on the test data. (2 marks)

Question 5: Use the parameters selected above (Q3 and Q4) to train a Decision Tree model using the first 50, 100, 200, 400, 600 and 704 samples from the train data. Keep the validation set unchanged during this analysis. Report and plot the accuracies on the validation data. (2 marks)

In [7]:
# Load the train/test feature and label tables. `index_col=0` turns the first CSV
# column into the DataFrame index instead of leaving it as a data column.
# NOTE(review): no separate validation split is loaded in this cell, although the
# questions above ask for accuracies on validation data -- confirm where it comes from.
train_feature = pd.read_csv('data/customer_data_train.csv', index_col=0)
train_label = pd.read_csv('data/customer_data_train_labels.csv', index_col=0)

test_feature = pd.read_csv('data/customer_data_test.csv', index_col=0)
test_label = pd.read_csv('data/customer_data_test_labels.csv', index_col=0)

# The bare last expression displays the training features as the cell output.
train_feature

Unnamed: 0,fea_1,fea_2,fea_3,fea_4,fea_5,fea_6,fea_7,fea_8,fea_9,fea_10
946,7.0,1275.500000,3.0,136000.0,2.0,11.0,5.0,113.0,4.0,151304.0
392,7.0,1304.000000,3.0,63000.0,2.0,11.0,5.0,110.0,4.0,60095.0
510,7.0,1296.500000,1.0,76000.0,2.0,11.0,10.0,113.0,4.0,72001.0
875,5.0,1250.000000,3.0,78000.0,2.0,15.0,5.0,82.0,5.0,350092.0
420,7.0,1257.500000,3.0,95000.0,1.0,11.0,4.0,111.0,4.0,450015.0
...,...,...,...,...,...,...,...,...,...,...
567,5.0,1230.500000,3.0,61000.0,2.0,15.0,5.0,109.0,4.0,60020.0
936,4.0,1305.500000,1.0,128000.0,2.0,8.0,3.0,96.0,3.0,60042.0
534,7.0,1284.180818,2.0,70000.0,2.0,11.0,9.0,110.0,4.0,151304.0
734,5.0,1191.500000,3.0,61000.0,2.0,15.0,-1.0,111.0,4.0,72001.0


In [8]:
def visualize(clf, feature_names, label_names, figsize=(60, 30)):
    """Draw a decision tree on a freshly created matplotlib figure.

    Parameters
    ----------
    clf
        The tree estimator handed to ``plot_tree``.
    feature_names
        Forwarded to ``plot_tree`` as ``feature_names``.
    label_names
        Forwarded to ``plot_tree`` as ``class_names``.
    figsize : tuple, optional
        Figure size passed to ``plt.figure``. Defaults to ``(60, 30)``, the
        size this helper previously hard-coded.

    Returns
    -------
    The value returned by ``plot_tree``.
    """
    # Open a new figure first so the tree is drawn on a canvas of the requested size.
    plt.figure(figsize=figsize)

    return plot_tree(clf, feature_names=feature_names, class_names=label_names, filled=True)

In [9]:
criterions = ['gini', 'entropy']
min_split = [2, 5, 10, 20]
samples_cut = [50, 100, 200, 400, 600, 704]

# Every (criterion, min_samples_split, training-prefix size) combination, listed in
# the order that nested loops over the three lists would visit them.
param_grid = [(c, m, n) for c in criterions for m in min_split for n in samples_cut]

# NOTE(review): each model is scored on test_feature/test_label, while Questions 3-5
# ask for accuracies on validation data -- confirm which split is intended here.
for cri, minsplit, cut in param_grid:
    # Fit on the first `cut` training rows only; random_state is pinned to 34.
    clf = DecisionTreeClassifier(criterion=cri, min_samples_split=minsplit, random_state=34)
    clf.fit(train_feature[:cut], train_label[:cut])
    predictions = clf.predict(test_feature)
    accuracy = accuracy_score(test_label, predictions)

    settings = {'cri': cri, 'minsplit': minsplit, 'cut': cut}
    print('criterions %(cri)s, min_split %(minsplit)s, samples_cut %(cut)s' % settings)
    print('accuracy = %s' % str(accuracy))

criterions gini, min_split 2, samples_cut 50
accuracy = 0.6606334841628959
criterions gini, min_split 2, samples_cut 100
accuracy = 0.7149321266968326
criterions gini, min_split 2, samples_cut 200
accuracy = 0.6923076923076923
criterions gini, min_split 2, samples_cut 400
accuracy = 0.6606334841628959
criterions gini, min_split 2, samples_cut 600
accuracy = 0.669683257918552
criterions gini, min_split 2, samples_cut 704
accuracy = 0.7013574660633484
criterions gini, min_split 5, samples_cut 50
accuracy = 0.6515837104072398
criterions gini, min_split 5, samples_cut 100
accuracy = 0.7330316742081447
criterions gini, min_split 5, samples_cut 200
accuracy = 0.7013574660633484
criterions gini, min_split 5, samples_cut 400
accuracy = 0.6877828054298643
criterions gini, min_split 5, samples_cut 600
accuracy = 0.6832579185520362
criterions gini, min_split 5, samples_cut 704
accuracy = 0.6742081447963801
criterions gini, min_split 10, samples_cut 50
accuracy = 0.751131221719457
criterions gini,

Question 6: Use the test data to compute the confusion matrix for the predictions of your model. Report the confusion matrix. (1 mark)

### Nearest Neighbor
Normalize Data: Normalize features such that for each feature the mean is 0 and the standard deviation is 1 in the train+validation data. Use the normalizing factors calculated on train+validation data to modify the values in train, validation and test data.

Question 7: Train k-nn model on train + validation data and report accuracy on test data. Use Euclidean distance and k=3. (1 mark)

Question 8: Train the model on the train data for the distance metrics l1, linf and l2. Report the accuracies on the validation data. Select the best metric and report the accuracy on the test data for the selected metric. Use k=3. (1 mark)

Question 9: Train the k-nn model on train data for k=1,3,5,7,9. Report and plot the accuracies on the validation data. Select the best ’k’ value and report the accuracy on the test data for the selected ’k’. Use Chebyshev distance. (2 marks)

Question 10: Instead of using full train data, train the model using the first 50, 100, 200, 400, 600 and 704 data samples from train data. Keep the validation set unchanged during this analysis. Report and plot the accuracies on the validation data. Use Chebyshev distance and k=3. Note: Don’t shuffle the data and use only the ’first n samples’, otherwise your answers may differ. (2 marks)

Question 11: Train k-nn models with k=3 and k=7 using the Chebyshev distance on the train and validation data combined. Plot the ROC curve for the predictions each model makes on the test data. Also report the accuracy, precision, recall and F1 score.
Please comment on the evaluation results and the ROC curves: which model is better? (4 marks)