# Chapter 2 - Model Selection and Training
Guilherme de Oliveira <br>
8/30/2016

## Introduction

In Chapter 2 we will build a classification model for the US Census data that was analyzed in Chapter 1. My main interest in the modelling is how to deal with the class imbalance of the target variable. In particular, I am interested in the following aspects:
<ul>
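<li> How best to assess the performance of the classifier. Plain accuracy is unlikely to suffice because of the [accuracy paradox](https://en.wikipedia.org/wiki/Accuracy_paradox): with imbalanced classes, a model that always predicts the majority class can still achieve a high accuracy.</li>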
<li> What approaches can we use to deal with the class imbalance? Examples include oversampling, undersampling, incorporating clustering algorithms, etc.</li>
</ul>
<br>
<br>
<br>
# This is a work in progress. Stay tuned for more...
<br>
<br>
<br>


In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


## Preprocessing Data

In [2]:
# preprocessing function

def preprocessData(file_name):
    """Load the raw census CSV, drop ``instance_weight`` and remove duplicate rows.

    The file is read without a header row; the column names are taken from
    the ``columns`` table below. ``instance_weight`` is dropped *before*
    duplicates are detected, so rows that differ only in that column are
    treated as duplicates of each other. For each group of duplicates the
    first occurrence is kept.

    Three summary lines are printed: the number of duplicates found, the
    number of rows actually removed, and the shape of the resulting frame.

    Parameters
    ----------
    file_name : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated data without the ``instance_weight`` column. The
        original row labels of the surviving rows are preserved (the index
        is not reset).
    """
    # Each entry is (column_name, kind, prefix) where kind is one of
    # 'continuous' / 'nominal' / 'target' and prefix is the prefix intended
    # for dummy encoding (only given for nominal columns).
    columns = [
        ('age', 'continuous', 'age'),
        ('class_of_worker', 'nominal', 'class_of_worker'),
        ('detailed_industry_code', 'nominal', 'det_ind_code'),
        ('detailed_occupation_code', 'nominal', 'det_occ_code'),
        ('education', 'nominal', 'edu'),
        ('wage_per_hour', 'continuous'),
        ('enrolled_in_education_last_week', 'nominal', 'edu_last_week'),
        ('marital_status', 'nominal', 'marital_status'),
        ('major_industry_code', 'nominal', 'maj_ind_code'),
        ('major_occupation_code', 'nominal', 'maj_ocptn_code'),
        ('race', 'nominal', 'race'),
        ('hispanic_origin', 'nominal', 'hisp_orgn'),
        ('sex', 'nominal', 'sex'),
        ('member_of_labor_union', 'nominal', 'member_of_lbr_un'),
        ('reason_for_unemployment', 'nominal', 'reason_for_unmplymnt'),
        ('full_or_part_time_employment_stat', 'nominal', 'ft_or_pt_emplymnt_stat'),
        ('capital_gains', 'continuous'),
        ('capital_losses', 'continuous'),
        ('dividends', 'continuous'),
        ('tax_filer', 'nominal', 'tax_filer'),
        ('region_of_previous_residence', 'nominal', 'region_pa'),
        ('state_of_previous_residence', 'nominal', 'state_pa'),
        ('detailed_household_family_stat', 'nominal', 'det_hse_fam_state'),
        ('detailed_household_summary', 'nominal', 'det_hse_summary'),
        ('instance_weight', 'continuous'),
        ('migration_code_change_in_msa', 'nominal', 'migr_code_msa'),
        ('migration_code_change_in_reg', 'nominal', 'migr_code_reg'),
        ('migration_code_move_within_reg', 'nominal', 'migr_code_move'),
        ('live_in_this_house_1_yr_ago', 'nominal', 'live_in_house_1_yr_ago'),
        ('migration_prev_res_in_sunbelt', 'nominal', 'migr_prev_res_sunbelt'),
        ('num_persons_worked_for_employer', 'continuous'),
        ('family_members_under_18', 'nominal', 'family_under_18'),
        ('cob_father', 'nominal', 'cob_father'),
        ('cob_mother', 'nominal', 'cob_mother'),
        ('cob_self', 'nominal', 'cob_self'),
        ('citizenship', 'nominal', 'citizenship'),
        ('own_business_or_self_employed', 'nominal', 'owner_or_se'),
        ('fill_in_questionnaire_for_veterans_admin', 'nominal', 'veterans_admin'),
        ('veterans_benefits', 'nominal', 'veterans_benefits'),
        ('weeks_worked_in_year', 'nominal', 'weeks_worked_in_yr'),
        ('year', 'nominal', 'year'),
        ('savings', 'target'),
    ]
    raw_data = pd.read_csv(file_name, names=[c[0] for c in columns], index_col=False)
    original_shape = raw_data.shape

    raw_data = raw_data.drop('instance_weight', axis=1)
    # Keep the column metadata in step with the frame. NOTE(review): the
    # metadata is not used further inside this function yet.
    columns.remove(('instance_weight', 'continuous'))

    # Mark every repeated row except its first occurrence.
    duplicate_rows = raw_data.duplicated(keep='first')

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print('number of duplicates = {:d}'.format(int(duplicate_rows.sum())))

    # Reuse the mask instead of running duplicate detection a second time.
    raw_data = raw_data[~duplicate_rows]
    new_shape = raw_data.shape
    print('number of duplicates removed = {:d}'.format(original_shape[0] - new_shape[0]))
    print('new shape = {:d}, {:d}'.format(new_shape[0], new_shape[1]))

    return raw_data

