# 1. Prerequisite

In [14]:
import pandas as pd
import numpy as np

In [24]:
# Load the raw lab dataset; the path is relative to the current working directory.
df = pd.read_csv('lab1.csv')

In [25]:
# Treat DemCluster as a nominal variable: cast it to str so that
# pd.get_dummies() below one-hot encodes it instead of leaving it numeric.
df['DemCluster'] = df['DemCluster'].astype(str)

# Encode DemHomeOwner as a binary 0/1 variable ('U' -> 0, 'H' -> 1).
dem_home_owner_map = {'U': 0, 'H': 1}
df['DemHomeOwner'] = df['DemHomeOwner'].map(dem_home_owner_map)

# Mark erroneous values in DemMedIncome (anything below 1) as missing
# so they are imputed together with the genuinely missing values.
mask = df['DemMedIncome'] < 1
df.loc[mask, 'DemMedIncome'] = np.nan

# Impute missing values in DemAge, DemMedIncome and GiftAvgCard36 with the
# column mean. The result is assigned back to the column rather than calling
# fillna(inplace=True) on df[col]: the in-place form acts on an intermediate
# Series, which raises a warning in recent pandas and leaves df unchanged
# once Copy-on-Write is enabled.
for col in ['DemAge', 'DemMedIncome', 'GiftAvgCard36']:
    df[col] = df[col].fillna(df[col].mean())

# Drop the ID column and the unused target variable.
df = df.drop(['ID', 'TargetD'], axis=1)

# One-hot encode the remaining non-numeric columns.
df = pd.get_dummies(df)

In [26]:
# Summarise dtypes and non-null counts to check the result of the preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9686 entries, 0 to 9685
Data columns (total 86 columns):
TargetB             9686 non-null int64
GiftCnt36           9686 non-null int64
GiftCntAll          9686 non-null int64
GiftCntCard36       9686 non-null int64
GiftCntCardAll      9686 non-null int64
GiftAvgLast         9686 non-null float64
GiftAvg36           9686 non-null float64
GiftAvgAll          9686 non-null float64
GiftAvgCard36       9686 non-null float64
GiftTimeLast        9686 non-null int64
GiftTimeFirst       9686 non-null int64
PromCnt12           9686 non-null int64
PromCnt36           9686 non-null int64
PromCntAll          9686 non-null int64
PromCntCard12       9686 non-null int64
PromCntCard36       9686 non-null int64
PromCntCardAll      9686 non-null int64
StatusCatStarAll    9686 non-null int64
DemAge              9686 non-null float64
DemHomeOwner        9686 non-null int64
DemMedHomeValue     9686 non-null int64
DemPctVeterans      9686 non-null int64
Dem

# 2. Data Partitioning

# a) Describe training, validation and test dataset.

# b) What is the purpose for each of these split?

The training dataset is the portion of the data that is used to train (fit) the model.

The validation dataset is used to provide an unbiased evaluation of a model fit on the training dataset while tuning the model's hyperparameters.

The testing dataset, in contrast, is used to provide an unbiased evaluation of the final model that was trained on the training dataset.

# c) What is k-fold cross validation? 

K-fold cross validation splits a given data set into k sections (folds), and each fold is used as the testing set exactly once. In the first iteration, the first fold is used to test the model and the remaining k-1 folds are used to train it. In the second iteration, the second fold is used as the testing set while the rest serve as the training set. This process is repeated until each of the k folds has been used as the testing set, and the k scores are then averaged to give the overall performance estimate.

# d) What is the advantage and disadvantage of k-fold CV compared to normal training/test/validation method?

Advantage: every observation is used for both training and validation, so the result does not depend on which observations happen to fall into a single random training/validation split, as it does in the normal training/test/validation method. This reduces the variability of the performance estimate.

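Disadvantage: k-fold is computationally more expensive because the model has to be trained k times, and it doesn't perform well with time series data because randomly assigned folds ignore the temporal order of the observations.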

# 3. Decision Tree

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [28]:
def important_features(model, feature_names, n_to_display=20):
    """Print the most important features of a fitted model, best first.

    Args:
        model: fitted estimator exposing a ``feature_importances_`` array.
        feature_names: sequence of names, indexable by feature position.
        n_to_display: maximum number of features to print (default 20).

    Each line is printed as ``<name> : <importance>``; nothing is returned.
    """
    scores = model.feature_importances_

    # argsort is ascending, so reverse it for a descending ranking and keep
    # only the first n_to_display positions
    ranking = np.argsort(scores)[::-1][:n_to_display]

    for position in ranking:
        print(feature_names[position], scores[position], sep=' : ')

In [29]:
def visualize_decision_tree(model, feature_names, save_name):
    """Render a fitted decision tree to a PNG file.

    Args:
        model: fitted tree estimator, passed to sklearn's ``export_graphviz``.
        feature_names: feature names forwarded to ``export_graphviz``; any
            iterable of names (list, pandas Index, ...) or None.
        save_name: path of the PNG file to write.

    NOTE(review): writing the PNG presumably needs the Graphviz executables
    on PATH -- confirm for the target environment.
    """
    import pydot
    from io import StringIO
    from sklearn.tree import export_graphviz

    # Some scikit-learn releases only accept a plain list (or None) here and
    # reject e.g. a pandas Index such as X.columns, so normalise the argument.
    if feature_names is not None:
        feature_names = list(feature_names)

    # Export the tree as DOT text into an in-memory buffer, then let pydot
    # parse it; graph_from_dot_data returns a list of graphs.
    dotfile = StringIO()
    export_graphviz(model, out_file=dotfile, feature_names=feature_names)
    graph = pydot.graph_from_dot_data(dotfile.getvalue())
    graph[0].write_png(save_name)  # saved in the given file

In [30]:
# Separate the target column from the input features.
y = df['TargetB']
X = df.drop(['TargetB'], axis=1)

# Convert the features to a plain float ndarray. np.asmatrix would produce an
# np.matrix, which NumPy discourages and recent scikit-learn versions refuse
# as estimator input.
X_mat = np.asarray(X, dtype=float)

# 70/30 split, stratified on the target so both parts keep the same class
# proportions; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_mat, y, test_size=0.3, stratify=y, random_state=0)

In [34]:
# Fit a decision tree with default hyperparameters; only random_state is set,
# so the result is reproducible.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [35]:
# Score of the fitted model on the training split.
model.score(X_train, y_train)

1.0


In [36]:
# Score of the fitted model on the held-out test split.
model.score(X_test, y_test)

0.5175498967653132

The model scores 1.0 on the training data but only about 0.52 on the testing data; this large gap indicates that the model is overfitting.

In [37]:
# Print the 20 features with the highest importance in the fitted tree.
important_features(model, X.columns, 20)

DemMedHomeValue : 0.08856216558156052
DemMedIncome : 0.07429455482500676
DemPctVeterans : 0.07291816310685263
DemAge : 0.0701716362678387
GiftAvgAll : 0.06046906677836176
GiftTimeFirst : 0.048850667094453015
GiftTimeLast : 0.04287600684851861
PromCnt36 : 0.03820770101221019
GiftAvgCard36 : 0.03374873291216412
GiftAvg36 : 0.03350964949337565
GiftCnt36 : 0.032095432655321235
PromCnt12 : 0.030508011841672542
PromCntAll : 0.029553356644503426
GiftAvgLast : 0.02951945886316395
GiftCntAll : 0.02918192123013657
PromCntCard36 : 0.028498712459709818
GiftCntCardAll : 0.027189622906557
PromCntCardAll : 0.02611013420524961
PromCntCard12 : 0.01267787275491708
GiftCntCard36 : 0.011898250866343303


The top 5 most important features are DemMedHomeValue, DemMedIncome, DemPctVeterans, DemAge and GiftAvgAll.

In [65]:
import pydot
import graphviz
from io import StringIO
from sklearn.tree import export_graphviz
import os
# Add the Graphviz directory to PATH. The directory can be overridden with
# the GRAPHVIZ_BIN_DIR environment variable; the default is the original
# machine-specific location. The guard keeps the cell idempotent, so
# re-running it does not append the same entry again.
graphviz_dir = os.environ.get('GRAPHVIZ_BIN_DIR', 'c:/users/tan chang jung/anaconda3/lib/site-packages/graphviz/')
if graphviz_dir not in os.environ.get("PATH", ""):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + graphviz_dir

In [66]:
# Export the fitted tree as DOT text into an in-memory buffer.
dotfile = StringIO()
# Pass the column names as a plain list: some scikit-learn releases reject a
# pandas Index for feature_names.
export_graphviz(model, out_file=dotfile, feature_names=list(X.columns))
# Parse the DOT text; graph_from_dot_data returns a list of graphs.
graph = pydot.graph_from_dot_data(dotfile.getvalue())

In [68]:
# Print the DOT source of the first (only) parsed graph. The tree is fully
# grown, so this output is very long.
print(graph[0])

digraph Tree {
node [shape=box];
0 [label="GiftCnt36 <= 2.5\ngini = 0.5\nsamples = 6780\nvalue = [3390, 3390]"];
1 [label="DemMedHomeValue <= 67550.0\ngini = 0.489\nsamples = 3118\nvalue = [1788, 1330]"];
0 -> 1  [headlabel="True", labelangle=45, labeldistance="2.5"];
2 [label="GiftAvgLast <= 14.5\ngini = 0.468\nsamples = 1211\nvalue = [758, 453]"];
1 -> 2;
3 [label="PromCntCard36 <= 6.5\ngini = 0.5\nsamples = 221\nvalue = [108, 113]"];
2 -> 3;
4 [label="DemMedHomeValue <= 55350.0\ngini = 0.472\nsamples = 76\nvalue = [47, 29]"];
3 -> 4;
5 [label="PromCntCardAll <= 10.5\ngini = 0.414\nsamples = 58\nvalue = [41, 17]"];
4 -> 5;
6 [label="DemAge <= 76.0\ngini = 0.278\nsamples = 6\nvalue = [1, 5]"];
5 -> 6;
7 [label="gini = 0.0\nsamples = 5\nvalue = [0, 5]"];
6 -> 7;
8 [label="gini = 0.0\nsamples = 1\nvalue = [1, 0]"];
6 -> 8;
9 [label="DemMedIncome <= 25340.0\ngini = 0.355\nsamples = 52\nvalue = [40, 12]"];
5 -> 9;
10 [label="gini = 0.0\nsamples = 2\nvalue = [0, 2]"];
9 -> 10;
11 [label="G