# Chapter 2 - Model Selection and Training
Guilherme de Oliveira <br>
8/30/2016

## Introduction

In Chapter 2 we will work on the classification model of the US Census data that was analyzed in Chapter 1. My biggest interest in modelling will be dealing with the class imbalance of the target variable. In particular, I am interested in the following aspects:
<ul>
<li> How best to assess the performance of the classifier. Plain accuracy is unlikely to suffice, because of the [accuracy paradox](https://en.wikipedia.org/wiki/Accuracy_paradox).</li>
<li> What approaches can we use to deal with the class imbalance? Examples include oversampling, undersampling, incorporating clustering algorithms, etc.</li>
</ul>
<br>
<br>
<br>
# This is a work in progress. Stay tuned for more...
<br>
<br>
<br>


In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

%matplotlib inline

## Preprocessing Data

In [2]:
# preprocessing function

def preprocessData(file_name):
    """Load a census CSV and return it as an all-numeric DataFrame.

    The file is read without a header row, using the column names listed in
    ``the_columns``. The function then

    1. drops the ``instance_weight`` column,
    2. removes duplicate rows (keeping the first occurrence),
    3. label-encodes every non-numeric column with integer codes,
    4. copies over the nominal columns that are already numeric,
    5. turns ``savings`` and ``sex`` into 0/1 indicators, and
    6. copies over the continuous columns.

    Progress information is printed along the way.

    Note: label codes are assigned in order of first appearance within the
    given file, so the codes produced for two different files are not
    guaranteed to agree.

    Parameters
    ----------
    file_name : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per unique record; same columns as the raw data minus
        ``instance_weight``.

    Raises
    ------
    AssertionError
        If the processed frame does not contain exactly the raw columns.
    """
    # Each entry is (column_name, continuous/nominal/target, prefix for dummy
    # encoding); continuous and target columns carry no prefix.
    the_columns = [
        ('age', 'continuous', 'age'),
        ('class_of_worker', 'nominal', 'class_of_worker'),
        ('detailed_industry_code', 'nominal', 'det_ind_code'),
        ('detailed_occupation_code', 'nominal', 'det_occ_code'),
        ('education', 'nominal', 'edu'),
        ('wage_per_hour', 'continuous'),
        ('enrolled_in_education_last_week', 'nominal', 'edu_last_week'),
        ('marital_status', 'nominal', 'marital_status'),
        ('major_industry_code', 'nominal', 'maj_ind_code'),
        ('major_occupation_code', 'nominal', 'maj_ocptn_code'),
        ('race', 'nominal', 'race'),
        ('hispanic_origin', 'nominal', 'hisp_orgn'),
        ('sex', 'nominal', 'sex'),
        ('member_of_labor_union', 'nominal', 'member_of_lbr_un'),
        ('reason_for_unemployment', 'nominal', 'reason_for_unmplymnt'),
        ('full_or_part_time_employment_stat', 'nominal', 'ft_or_pt_emplymnt_stat'),
        ('capital_gains', 'continuous'),
        ('capital_losses', 'continuous'),
        ('dividends', 'continuous'),
        ('tax_filer', 'nominal', 'tax_filer'),
        ('region_of_previous_residence', 'nominal', 'region_pa'),
        ('state_of_previous_residence', 'nominal', 'state_pa'),
        ('detailed_household_family_stat', 'nominal', 'det_hse_fam_state'),
        ('detailed_household_summary', 'nominal', 'det_hse_summary'),
        ('instance_weight', 'continuous'),
        ('migration_code_change_in_msa', 'nominal', 'migr_code_msa'),
        ('migration_code_change_in_reg', 'nominal', 'migr_code_reg'),
        ('migration_code_move_within_reg', 'nominal', 'migr_code_move'),
        ('live_in_this_house_1_yr_ago', 'nominal', 'live_in_house_1_yr_ago'),
        ('migration_prev_res_in_sunbelt', 'nominal', 'migr_prev_res_sunbelt'),
        ('num_persons_worked_for_employer', 'continuous'),
        ('family_members_under_18', 'nominal', 'family_under_18'),
        ('cob_father', 'nominal', 'cob_father'),
        ('cob_mother', 'nominal', 'cob_mother'),
        ('cob_self', 'nominal', 'cob_self'),
        ('citizenship', 'nominal', 'citizenship'),
        ('own_business_or_self_employed', 'nominal', 'owner_or_se'),
        ('fill_in_questionnaire_for_veterans_admin', 'nominal', 'veterans_admin'),
        ('veterans_benefits', 'nominal', 'veterans_benefits'),
        ('weeks_worked_in_year', 'nominal', 'weeks_worked_in_yr'),
        ('year', 'nominal', 'year'),
        ('savings', 'target'),
    ]
    raw_data = pd.read_csv(file_name, names=[c[0] for c in the_columns], index_col=False)
    original_shape = raw_data.shape

    # instance_weight is dropped before duplicate detection, so rows that
    # differ only in their weight count as duplicates.
    raw_data = raw_data.drop('instance_weight', axis=1)
    the_columns = [c for c in the_columns if c[0] != 'instance_weight']

    # Flag the duplicate rows once and reuse the mask to remove them
    # (the first occurrence of each row is kept).
    duplicate_rows = raw_data.duplicated(keep='first')
    print('number of duplicates = {:d}'.format(int(duplicate_rows.sum())))
    raw_data = raw_data[~duplicate_rows]
    new_shape = raw_data.shape
    print('number of duplicates removed = {:d}'.format(original_shape[0] - new_shape[0]))
    print('original shape = {:d}, {:d}'.format(original_shape[0], original_shape[1]))
    print('new shape = {:d}, {:d}'.format(new_shape[0], new_shape[1]))

    # Label-encode every non-numeric column. Selecting "not numeric" rather
    # than "object" also covers frames whose text columns use a dedicated
    # string dtype.
    data = raw_data.select_dtypes(exclude=['number', 'bool']).copy()
    for column in list(data.columns):
        codes = {value: code for code, value in enumerate(data[column].unique())}
        data[column] = data[column].map(codes)

    # Add the nominal columns that were already numeric in the raw file.
    nominal_integer_columns = [c[0] for c in the_columns
                               if c[1] == 'nominal' and c[0] not in data.columns]
    for name in nominal_integer_columns:
        data[name] = raw_data[name]

    # Convert 'sex' and 'savings' to binary indicators (overwriting the label
    # codes assigned above) and take 'year' straight from the raw data.
    data['savings'] = raw_data['savings'].map(lambda x: 1 if str(x).strip() == '50000+.' else 0)
    data['sex'] = raw_data['sex'].map(lambda x: 1 if str(x).strip() == 'Male' else 0)
    data['year'] = raw_data['year']

    # Add the continuous columns unchanged.
    continuous_columns = [c[0] for c in the_columns if c[1] == 'continuous']
    for name in continuous_columns:
        data[name] = raw_data[name]

    # Internal sanity check: no column may be lost or invented on the way.
    assert set(data.columns) == set(raw_data.columns)

    print('The final processed data has {:,d} rows and {:d} columns.\n'.format(data.shape[0], data.shape[1]))
    return data


In [3]:
# Run the preprocessing on census_income_learn.csv and keep the result as
# `data`, the frame the cells below work with.
data = preprocessData('us_census_full/census_income_learn.csv')

number of duplicates = 46627
number of duplicates removed = 46627
original shape = 199523, 42
new shape = 152896, 41
The final processed data has 152,896 rows and 41 columns.



In [4]:
# Peek at the first three processed rows.
data.head(3)

Unnamed: 0,class_of_worker,education,enrolled_in_education_last_week,marital_status,major_industry_code,major_occupation_code,race,hispanic_origin,sex,member_of_labor_union,...,own_business_or_self_employed,veterans_benefits,weeks_worked_in_year,year,age,wage_per_hour,capital_gains,capital_losses,dividends,num_persons_worked_for_employer
0,0,0,0,0,0,0,0,0,0,0,...,0,2,0,95,73,0,0,0,0,0
1,1,1,0,1,1,1,0,0,1,0,...,0,2,52,94,58,0,0,0,0,1
2,0,2,1,2,0,0,1,0,0,0,...,0,2,0,95,18,0,0,0,0,0


In [5]:
# Separate the features from the 'savings' target, then hold out 30% of the
# rows as a test set (fixed random_state so the split is repeatable).
X = data.loc[:, data.columns != 'savings']
y = data['savings']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
print('size of training data: {:7d}, {:3d}'.format(*X_train.shape))
print('size of test data:     {:7d}, {:3d}'.format(*X_test.shape))


size of training data:  107027,  40
size of test data:       45869,  40


## Baseline Random Forest Model

In [6]:
# Baseline: 100-tree random forest fitted on the training split and scored
# by plain accuracy on the held-out split.
rf_clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print('Random Forest accuracy with {:d} trees = {:.4f}'.format(
    rf_clf.n_estimators, accuracy_score(y_test, y_pred)))

Random Forest accuracy with 100 trees = 0.9394


In [7]:
# Same baseline with 200 trees.
rf_clf = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print('Random Forest accuracy = {:.4f}'.format(
    accuracy_score(y_test, y_pred)))

Random Forest accuracy = 0.9399


In [8]:
# Same baseline with 400 trees.
rf_clf = RandomForestClassifier(n_estimators=400).fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print('Random Forest accuracy = {:.4f}'.format(
    accuracy_score(y_test, y_pred)))

Random Forest accuracy = 0.9404


## Incorporate Some Feature Engineering
Start with the column "detailed_household_family_stat" and merge all of its classes that contain no records with savings greater than 50K into a single class.

In [9]:
# Compare the household/family classes seen among savings == 1 rows with all
# classes present, to find the classes that never occur with savings == 1.
dhfs = np.sort(data['detailed_household_family_stat'][data['savings'] == 1].unique())
print('unique values for savings = 1 {}'.format(dhfs))
dhfs_all = np.sort(data['detailed_household_family_stat'].unique())
print('unique values for all vals    {}'.format(dhfs_all))

diff = set(dhfs_all).difference(set(dhfs))
print(' the differences are........ {}'.format(diff))
# A set difference is never None; an empty set is the "nothing to merge"
# case and must be caught before calling max() on it.
if not diff:
    print('\n diff is empty')
else:
    print(' len(diff) {}'.format(len(diff)))
    # Use one past the largest code present in the column so the merged
    # class can never coincide with an existing class.
    val = dhfs_all.max() + 1
    print(' mapping values to: {}'.format(val))


unique values for savings = 1 [ 0  1  2  3  4  5  6  7 10 11 12 15 16 17 18 19 22 28 30]
unique values for all vals    [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37]
 the differences are........ set([32, 33, 34, 35, 36, 37, 8, 9, 13, 14, 20, 21, 23, 24, 25, 26, 27, 29, 31])
 len(diff) 19
 mapping values to: 38


In [10]:
def update_column(column, frame=None):
    """Merge the classes of *column* that never occur with savings == 1.

    Every value of ``frame[column]`` that does not appear in any row with
    ``savings == 1`` is replaced by one new code: the largest value present
    in the column plus one. Using the column-wide maximum guarantees the new
    code does not collide with a class that is kept. A summary of the
    values involved is printed.

    Parameters
    ----------
    column : str
        Name of an integer-coded column.
    frame : pandas.DataFrame, optional
        Frame holding *column* and a ``savings`` column. Defaults to the
        module-level ``data`` frame.

    Returns
    -------
    pandas.Series
        The remapped column, or ``frame[column]`` itself when there is
        nothing to merge. The frame is not modified.
    """
    if frame is None:
        frame = data

    dhfs = np.sort(frame[column][frame['savings'] == 1].unique())
    print('unique values for svngs = 1 {}'.format(dhfs))
    dhfs_all = np.sort(frame[column].unique())
    print('unique values for all vals  {}'.format(dhfs_all))

    diff = set(dhfs_all).difference(set(dhfs))
    print(' the differences are........ {}'.format(diff))
    # A set difference is never None; test for emptiness so that max() is
    # not called on an empty set.
    if not diff:
        print('\n diff is empty')
        return frame[column]

    print(' len(diff) {}'.format(len(diff)))

    # One past the largest code in the whole column, so the merged class
    # cannot coincide with a class that still has savings == 1 rows.
    val = dhfs_all.max() + 1
    print(' mapping values to: {}'.format(val))
    return frame[column].map(lambda x: val if x in diff else x)


In [11]:
# Overwrite the column with the merged-class version returned by update_column.
data['detailed_household_family_stat'] = update_column('detailed_household_family_stat')


unique values for svngs = 1 [ 0  1  2  3  4  5  6  7 10 11 12 15 16 17 18 19 22 28 30]
unique values for all vals  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37]
 the differences are........ set([32, 33, 34, 35, 36, 37, 8, 9, 13, 14, 20, 21, 23, 24, 25, 26, 27, 29, 31])
 len(diff) 19
 mapping values to: 38


In [12]:
# Call update_column a second time without assigning its result, so `data`
# is left as it is; only the printed summary of a second pass is of interest.
update_column('detailed_household_family_stat');


unique values for svngs = 1 [ 0  1  2  3  4  5  6  7 10 11 12 15 16 17 18 19 22 28 30]
unique values for all vals  [ 0  1  2  3  4  5  6  7 10 11 12 15 16 17 18 19 22 28 30 38]
 the differences are........ set([38])
 len(diff) 1
 mapping values to: 39


In [13]:
# Show the distinct codes left in the engineered column and in the target.
print(data['detailed_household_family_stat'].unique())
print(data['savings'].unique())

[ 0  1  2  3  4  5  6  7 38 10 11 12 15 16 17 18 19 22 28 30]
[0 1]


In [14]:
# Project X onto its first two principal components so that it can be drawn
# in a 2-D feature space, then print a few diagnostics of the fit.

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
data_vis = pca.fit_transform(X)

print('{} {}'.format('shape(data_vis):', data_vis.shape))
print(data_vis[:4])       # first four projected rows
print(data_vis[-4:])      # last four projected rows
print('{} {}'.format('pca.components_.shape:', pca.components_.shape))
print('{} {}'.format('pca.explained_variance_ratio_:', pca.explained_variance_ratio_))


shape(data_vis): (152896, 2)
[[-582.86880395  219.21581787]
 [-582.85729319  219.25677883]
 [-582.8777853   219.26440239]
 [-582.87925719  219.27225601]]
[[ -582.8665234    219.20378154]
 [ 5821.79914647   633.03285533]
 [ -572.51524791    62.59966318]
 [ -582.8543837    219.28715813]]
pca.components_.shape: (2, 40)
pca.explained_variance_ratio_: [ 0.8470303   0.14724605]


In [15]:
# Class balance of the target: count of each label and their total.
print('{} {}'.format('y==0 : ', (y == 0).sum()))
print('{} {}'.format('y==1 : ', (y == 1).sum()))
print('{} {}'.format('y==0 + y==1:', (y == 0).sum() + (y == 1).sum()))


y==0 :  140529
y==1 :  12367
y==0 + y==1: 152896


In [16]:
# Plot the original data
# Plot the two classes

def scatter_plot(X, y):
    """Scatter the two target classes in a 2-D PCA projection of X.

    Re-fits the module-level ``pca`` object on X, splits the projected
    points by label (0 and 1) and draws both groups on a new 9x9 figure.

    Parameters
    ----------
    X : feature matrix accepted by ``pca.fit_transform``.
    y : labels aligned with the rows of X; compared against 0 and 1.

    Returns
    -------
    The matplotlib axes holding the plot.
    """
    projected = pca.fit_transform(X)

    # Split the projected points by class label: index 0 -> y == 0, 1 -> y == 1.
    by_class = [projected[(y == label)] for label in (0, 1)]

    colours = sns.color_palette()
    outline = '#262626'  # near-black edge colour for the markers

    figure = plt.figure(figsize=(9, 9))
    axes = figure.gca()
    series = (
        (by_class[0], "Savings < 50K", colours[0]),
        (by_class[1], "Savings > 50K", colours[2]),
    )
    for points, legend_text, colour in series:
        axes.scatter(points[:, 0], points[:, 1], label=legend_text, alpha=0.3,
                     facecolor=colour, linewidth=0.15, edgecolor=outline)
    # Anchor the legend outside the plotting area, to the right of the axes.
    axes.legend(fontsize=16, loc='lower left', bbox_to_anchor=(1, 0.8))

    return axes


In [None]:
# Plot the data as-is (before any resampling) for reference.
ax = scatter_plot(X, y);

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import InstanceHardnessThreshold
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
#from imblearn.under_sampling import AllKNN


In [None]:
# Generate a new dataset with an under-sampling method and plot it.
# NOTE(review): `verbose` is not read by any of the visible cells; confirm
# whether it is still needed.
verbose = False

# Random under-sampling: resample (X, y) with RandomUnderSampler, then draw
# the resampled data with scatter_plot.
# NOTE(review): newer imbalanced-learn releases name this method
# `fit_resample`; confirm against the installed version.
US = RandomUnderSampler()
usx, usy = US.fit_sample(X, y)
ax = scatter_plot(usx, usy);


In [None]:
# Tomek links: resample (X, y) with TomekLinks and plot the result.
TL = TomekLinks()
tlx, tly = TL.fit_sample(X, y)
ax = scatter_plot(tlx, tly);



In [None]:
# Cluster centroids: resample (X, y) with ClusterCentroids and plot the result.
CC = ClusterCentroids()
ccx, ccy = CC.fit_sample(X, y)
ax = scatter_plot(ccx, ccy);


In [None]:
# NearMiss, version 1: resample (X, y) and plot the result.
NM1 = NearMiss(version=1)
nm1x, nm1y = NM1.fit_sample(X, y)
ax = scatter_plot(nm1x, nm1y);


In [None]:
# NearMiss, version 2: resample (X, y) and plot the result.
NM2 = NearMiss(version=2)
nm2x, nm2y = NM2.fit_sample(X, y)
ax = scatter_plot(nm2x, nm2y);



In [None]:
# NearMiss, version 3: resample (X, y) and plot the result.
NM3 = NearMiss(version=3)
nm3x, nm3y = NM3.fit_sample(X, y)
ax = scatter_plot(nm3x, nm3y);



In [None]:
# Condensed Nearest Neighbour: resample (X, y) and plot the result.
# NOTE(review): `size_ngh` is the keyword of early imbalanced-learn
# releases; later ones appear to call it `n_neighbors` - confirm against
# the installed version.
CNN = CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)
cnnx, cnny = CNN.fit_sample(X, y)
ax = scatter_plot(cnnx, cnny);



In [None]:
# One-Sided Selection: resample (X, y) and plot the result.
OSS = OneSidedSelection(size_ngh=51, n_seeds_S=51)
ossx, ossy = OSS.fit_sample(X, y)
ax = scatter_plot(ossx, ossy);



In [None]:
# Neighbourhood Cleaning Rule: resample (X, y) and plot the result.
NCR = NeighbourhoodCleaningRule(size_ngh=51)
ncrx, ncry = NCR.fit_sample(X, y) 
ax = scatter_plot(ncrx, ncry);



In [None]:
# Edited Nearest Neighbours: resample (X, y) and plot the result.
ENN = EditedNearestNeighbours(size_ngh=51)
ennx, enny = ENN.fit_sample(X, y)
ax = scatter_plot(ennx, enny);



In [None]:
# Instance Hardness Threshold: resample (X, y) and plot the result.
IHT = InstanceHardnessThreshold()
ihtx, ihty = IHT.fit_sample(X, y)
ax = scatter_plot(ihtx, ihty);



In [None]:
# Repeated Edited Nearest Neighbours: resample (X, y) and plot the result.
RENN = RepeatedEditedNearestNeighbours(size_ngh=51)
rennx, renny = RENN.fit_sample(X, y)
ax = scatter_plot(rennx, renny);



In [None]:
from sklearn.cluster import KMeans

In [None]:
# K-means estimator with the library's default settings (no arguments given).
kmeans = KMeans()

In [None]:
# Cluster the feature matrix X; `km` holds whatever fit() returns
# (presumably the fitted estimator itself - confirm in the scikit-learn docs).
km = kmeans.fit(X)

In [None]:
# Inspect the shape of the fitted centroid array.
km.cluster_centers_.shape