In [810]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

In [811]:
# Load the raw CSVs. The `orig_*` frames are kept untouched; all cleaning below
# operates on the copies.
orig_train_data = pd.read_csv("data/train_2008.csv")
# Fixed: this previously copied `orig_data`, a name that is never defined.
train_data = orig_train_data.copy()

orig_test_data = pd.read_csv("data/test_2008.csv")
test_data = orig_test_data.copy()

In [812]:
# Remember the original column layout (the target PES1 is the last column).
train_data_orig_cols = orig_train_data.columns.tolist()

# Pull the target out of the untouched frame and build a 0/1 version of it:
# every 2 becomes 0, every other value is left as it is.
target_var = orig_train_data['PES1']
target_var_bin = target_var.where(target_var != 2, 0)

# The working copy must not carry the target while features are engineered.
train_data = train_data.drop('PES1', axis=1)

In [813]:
# Build the list of columns to discard, one category at a time.

# Allocation flags: a fixed set plus every column whose name starts with 'PX'.
allocation_flag_feats = ['PRCITFLG', 'PRWERNAL', 'PRHERNAL', 'HXTENURE', 'HXHOUSUT',
                         'HXTELHHD', 'HXTELAVL', 'HXPHONEO']
allocation_flag_feats += [name for name in train_data.columns if name.startswith('PX')]

# Recoded features.
recode_feats = ["PRFAMNUM","PRFAMREL","PRFAMTYP","PRPERTYP","PEMLR","PRDISC",
                "PRJOBSEA","PRWNTJOB","PRCOW1","PRCOW2","PRDTCOW1","PRDTCOW2","PRDTIND1",
                "PRDTIND2","PRDTOCC1","PRDTOCC2","PREMP","PRMJIND1","PRMJIND2","PRMJOCC1",
                "PRMJOCC2","PRNAGPWS","PRNAGWS","PRERNHLY","PRERNWA",
                "PRIMIND1","PRIMIND2","PRTFAGE","PTHR","PTWK","PTOT",]

# Features with a single answer: a hand-picked list, extended with every
# training column that holds exactly one distinct value.
one_answer_feats = ['HRMONTH', 'HRYEAR4', 'HUBUSL3', 'HUBUSL4', 'PULAYAVR', 'PULKDK2',
                    'PULKDK3', 'PULKDK4', 'PULKDK5', 'PULKDK6', 'PULKPS2', 'PULKPS3',
                    'PULKPS4', 'PULKPS5', 'PULKPS6', 'PUDWCK3', 'PUJHDP1O', 'PUERNH1C',
                    'PTHR', 'PUERN2', 'PTOT',]
for feat in train_data.columns:
    n_distinct = len(train_data[feat].unique())
    if n_distinct == 1:
        one_answer_feats.append(feat)

# Miscellaneous columns marked as bad.
bad_feats = ['QSTNUM', 'PRERNWA', 'PRERNHLY', 'PEERNH1O', 'HRHHID2', 'GTCSA', 'GTCO',
             'HUINTTYP', 'HURESPLI', 'HRMIS', 'PRDTOCC1', 'PRFAMREL', 'PUSLFPRX', 'OCCURNUM',
             'PULINENO', 'PRMJOCC1', 'PRCHLD', 'GTCBSA', 'HRLONGLK', 'HUTYPB']

# Weight columns: every column whose name ends in 'WGT'.
weight_feats = [name for name in train_data.columns if name.endswith('WGT')]

# Merge the categories; the set removes names that appear in several lists.
feats_to_drop = list(set().union(allocation_flag_feats, recode_feats, one_answer_feats,
                                 bad_feats, weight_feats))

In [814]:
# Remove every collected column from both the training and the test frame.
train_data = train_data.drop(feats_to_drop, axis=1)
test_data = test_data.drop(feats_to_drop, axis=1)

In [815]:
# Dummify categorical vars
to_dummy = ['GEREG', 'HUBUS', 'PTDTRACE', 'PENATVTY', 'PUABSOT', 'PEIO1COW', 'HUFINAL', 'GESTCEN', 'GESTFIPS',
            'PEIO1ICD', 'PEIO1OCD', 'PEIO2ICD', 'PEIO2OCD', 'PRCITSHP', 'PUDIS',
            'PRABSREA', 'PRWKSTAT', 'HUPRSCNT', 'PERRP', 'GTCBSAST', 'PRMJOCGR', 'HRHTYPE', ]

# Build one block of indicator columns per categorical variable and
# concatenate them in a single call. Growing a DataFrame with pd.concat inside
# the loop re-copies all previously added columns on every iteration.
# Columns are named '<var>_<value>', in the order the variables are listed.
train_dummy_df = pd.concat(
    [pd.get_dummies(train_data[var], prefix=var) for var in to_dummy], axis=1)
test_dummy_df = pd.concat(
    [pd.get_dummies(test_data[var], prefix=var) for var in to_dummy], axis=1)

# Drop the original categorical variables
train_data.drop(to_dummy, axis=1, inplace=True)
test_data.drop(to_dummy, axis=1, inplace=True)

In [816]:
# Append the indicator columns to the right of the remaining features,
# for the training and the test frame alike.
train_data, test_data = (
    pd.concat(pair, axis=1)
    for pair in ((train_data, train_dummy_df), (test_data, test_dummy_df))
)

In [817]:
# Columns without an underscore in their name are treated as non-dummy
# (dummy columns are named '<var>_<value>').
non_dummy_cols = [name for name in train_data.columns if name.find('_') == -1]

In [818]:
### First pass - use mean for NaN. Later we'll do intelligent filling on important features

# Replace negative values (all forms of N/A) with NaN.
# Series.mask does this in one vectorised step per column. The test frame is
# only touched for columns it actually has (dummy columns can exist in train
# only); the explicit membership check replaces a bare `except:` that would
# also have hidden unrelated errors.
for feat in train_data.columns:
    train_data[feat] = train_data[feat].mask(train_data[feat] < 0)
    if feat in test_data.columns:
        test_data[feat] = test_data[feat].mask(test_data[feat] < 0)

# Replace NaN with the mean of the column.
# The result is assigned back instead of using fillna(..., inplace=True) on a
# column selection: that form is chained assignment and does not update the
# frame under pandas copy-on-write.
# NOTE(review): the test frame is filled with its own column means, as before;
# confirm whether the training means should be used instead.
for feat in train_data.columns:
    train_data[feat] = train_data[feat].fillna(train_data[feat].mean())
    if feat in test_data.columns:
        test_data[feat] = test_data[feat].fillna(test_data[feat].mean())

# Check for columns that are all NaN, and delete them.
# A column that was entirely NaN has a NaN mean, so it is still entirely NaN here.
nan_cols = [feat for feat in train_data.columns if train_data[feat].isnull().all()]
train_data.drop(nan_cols, axis=1, inplace=True)
# Only drop what the test frame really contains, so a train-only column cannot
# raise a KeyError here.
test_data.drop([feat for feat in nan_cols if feat in test_data.columns],
               axis=1, inplace=True)

In [819]:
# Recompute the non-dummy column list now that some columns have been removed
# (non-dummy columns are the ones without an underscore in their name).
non_dummy_cols = [name for name in train_data.columns if name.find('_') == -1]

In [820]:
# Last cleanups: drop the 'id' column from both frames, then append the
# target to the training frame as its final column.
train_data = train_data.drop('id', axis=1)
test_data = test_data.drop('id', axis=1)
train_data['PES1'] = target_var

In [821]:
# Training columns (other than the target) that the test frame lacks ...
feats_test_needs = [name for name in train_data.columns
                    if name != 'PES1' and name not in test_data.columns]
# ... and test columns that the training frame does not have.
feats_test_drop = [name for name in test_data.columns
                   if name not in train_data.columns]

In [822]:
# Create each column the test frame is missing, filled with zeros.
for feat in feats_test_needs:
    test_data.loc[:, feat] = 0

In [823]:
# Remove the test-only columns so the test frame matches the training features.
# Fixed: this referenced `feats_train_drop`, which is never defined; the list
# computed above is `feats_test_drop`.
test_data.drop(feats_test_drop, axis=1, inplace=True)

In [824]:
# The training frame should have exactly one more column (the target) than the test frame.
train_data.shape[1] == test_data.shape[1] + 1

True

In [825]:
# Number of columns in the training frame.
train_data.shape[1]

1626

In [826]:
# Number of columns in the test frame.
test_data.shape[1]

1625

In [744]:
# Stale cell, intentionally disabled. It used to drop `feats_test_needs` from
# test_data, i.e. exactly the zero-filled columns added earlier so that the
# test frame has every training feature. Running it in notebook order makes
# the column reordering that follows fail with a KeyError, so it is a no-op now.

In [827]:
# Give the test frame the same column order as the training features
# (every training column except the last one).
test_data = test_data.loc[:, train_data.columns[:-1]]

In [832]:
# Sanity checks before writing anything out:
# the last training column is 'PES1' ...
assert train_data.columns[-1] == 'PES1'
# ... the training frame has one more column than the test frame ...
assert train_data.shape[1] == test_data.shape[1] + 1
# ... the remaining columns match the test columns in name and order ...
assert (train_data.columns[:-1] == test_data.columns).all()
# ... and neither frame contains a missing value.
assert not train_data.isnull().values.any()
assert not test_data.isnull().values.any()
train_data.shape[1]

1626

In [831]:
# Persist the full (not feature-selected) frames, without the index column.
train_data.to_csv(path_or_buf="data/training_data_feat_eng_mean_fill_raw.csv", index=False)
test_data.to_csv(path_or_buf="data/test_data_feat_eng_mean_fill_raw.csv", index=False)

In [637]:
# null_cols = []
# for feat in train_data.columns:
#     if pd.isnull(train_data[feat]).values.any():
#         null_cols.append(feat)
# for feat in test_data.columns:
#     if pd.isnull(test_data[feat]).values.any():
#         null_cols.append(feat)

In [639]:
# for col in null_cols:
#     print(col)
#     train_data[col].fillna(get_fill_value(train_data[col]), inplace=True)
#     test_data[col].fillna(get_fill_value(test_data[col]), inplace=True)

PUBUSCK1
PUDIS2
PULKDK1
PUNLFCK2


In [640]:
# Fit a classifier for evaluating most important features
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# A fixed random_state makes the importances, and therefore the selected
# feature set, reproducible across runs; the forest is randomised otherwise.
clf = ExtraTreesClassifier(n_estimators=200, random_state=0)
# `.ix` has been removed from pandas; `.iloc` is the positional equivalent.
# All columns except the last one are used as features.
clf = clf.fit(train_data.iloc[:, :-1], target_var)

In [641]:
# Pair every feature name (all columns but the last) with the importance
# reported by the fitted classifier.
features = pd.DataFrame(
    {'feature': train_data.columns[:-1], 'importance': clf.feature_importances_},
    columns=['feature', 'importance'],
)

In [642]:
# Take a look at the most important features.
# DataFrame.sort was deprecated and later removed; sort_values is its replacement.
features.sort_values('importance', ascending=False).head(200)

  from ipykernel import kernelapp as app


Unnamed: 0,feature,importance
20,PEEDUCA,0.029726
14,PEAGE,0.023882
6,HUFAMINC,0.022802
1,HETENURE,0.013730
12,GTCBSASZ,0.013288
7,HRNUMHOU,0.013167
180,PECYC,0.011053
16,PEMARITL,0.009870
26,PRMARSTA,0.009623
177,PRNMCHLD,0.008654


In [651]:
# Pull out the most important features: keep those whose importance is at
# least the mean importance, using the already fitted classifier.
model = SelectFromModel(clf, prefit=True, threshold='1.0*mean')
# `.ix` has been removed from pandas; `.iloc` selects the same columns
# (everything except the last one) by position.
train_data_selected_feats = model.transform(train_data.iloc[:, :-1])

In [652]:
# Display the dimensions of train_data_selected_feats.
train_data_selected_feats.shape

(64667, 336)

In [653]:
# Kinda hacky, but we need some way of getting just the selected features in the test set (selecting features kills
# the column names).
# Re-apply the same rule by name: keep every feature whose importance is at
# least the mean importance. A single boolean mask replaces the per-feature
# filtering of the whole frame (quadratic in the number of features) and the
# float(Series) conversion, which pandas has deprecated.
imp_mean = features.importance.mean()
keep_feats = features.loc[features['importance'] >= 1.0 * imp_mean, 'feature'].tolist()

In [656]:
# Number of feature names currently held in keep_feats.
len(keep_feats)

336

In [655]:
# Keep only the selected features that also exist as columns of the test frame.
keep_feats = list(filter(lambda name: name in test_data.columns, keep_feats))

In [657]:
# Finally get the features we need.
# Explicit copies: a column selection may be a view of the parent frame, and
# adding a column to it raises SettingWithCopyWarning.
train_data_selected_feats = train_data[keep_feats].copy()
test_data_selected_feats = test_data[keep_feats].copy()

# Add the target back to the training data
train_data_selected_feats['targ'] = target_var

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [658]:
# Persist the feature-selected frames, without the index column.
train_data_selected_feats.to_csv(
    path_or_buf="data/training_data_feat_eng_smart_fill_selected_feats.csv", index=False)
test_data_selected_feats.to_csv(
    path_or_buf="data/test_data_feat_eng_smart_fill_selected_feats.csv", index=False)

In [635]:
# Number of columns in the training frame.
train_data.shape[1]

1649

In [613]:
######### Haven't used this stuff yet

# For numerics, define a find_min_entropy_split() function, and use that on
# negative valued responses. Negative valued responses indicate that the
# survey participant did not respond in some capacity.
def get_split_points(col):
    '''Return candidate split thresholds for a column.

    col is a pandas Series. Its distinct non-null values are sorted, and the
    midpoint between every `stride`-th pair of values is returned, where the
    stride is chosen so that there are ~10 splits in the feature.

    Returns a list of midpoints in ascending order; the list is empty when the
    column has fewer than two distinct non-null values.
    '''
    # Drop NaN *before* sorting: every comparison against NaN is False, so
    # sorting a sequence that still contains NaN can leave it out of order and
    # produce wrong midpoints.
    distinct = sorted(col.dropna().unique())
    if not distinct:
        return []
    stride = int(np.ceil(len(distinct) / 10.0))
    return [0.5 * (distinct[i] + distinct[i + stride])
            for i in range(0, len(distinct) - stride, stride)]

def get_entropy(arr):
    '''Size-weighted binary entropy of a 0/1 array.

    Returns len(arr) times the entropy (natural log) of the fraction of ones,
    and 0.0 for an empty array or an array holding a single class.
    '''
    count = len(arr)
    if count == 0:
        return 0.0
    p_one = sum(arr) / count
    # A pure array has zero entropy; returning early avoids evaluating log(0).
    if p_one in (0, 1):
        return 0.0
    p_zero = 1 - p_one
    return -count * (p_one * np.log(p_one) + p_zero * np.log(p_zero))

def find_min_entropy_split(col):
    '''Find the threshold in a column of data that a decision tree would choose using
    the entropy impurity measure.

    col is a pandas Series whose rows line up by position with the module-level
    `target_var_bin`. Every candidate from get_split_points(col) is scored by
    the sum of the size-weighted entropies (get_entropy) of the targets above
    and below the threshold, and the candidate with the lowest score wins.
    Rows where `col > split` is False, including NaN, count as "below".

    Returns (threshold, flag). flag=1 => the datapoints above the split have
    higher entropy, flag=-1 => the datapoints below (or a tie). When there are
    no candidate splits the result is (None, 0).
    '''
    # Work on positional numpy arrays: the previous per-row loops looked values
    # up by label (`bools[i]`, `target_var_bin[i]`), which is slow and only
    # correct when both Series carry a default 0..n-1 index.
    # The slice keeps a column shorter than the target working as before.
    targets = np.asarray(target_var_bin)[:len(col)]

    min_entropy = np.inf
    min_entropy_split = None
    flag = 0
    for split in get_split_points(col):
        is_above = np.asarray(col > split)
        entropy_above = get_entropy(targets[is_above])
        entropy_below = get_entropy(targets[~is_above])
        entropy_total = entropy_above + entropy_below
        if entropy_total < min_entropy:
            min_entropy = entropy_total
            min_entropy_split = split
            flag = 1 if entropy_above > entropy_below else -1
    return min_entropy_split, flag

def get_fill_value(col):
    '''Pick a fill value for a column from its minimum-entropy split.

    The split (i.e. the one a tree would choose) comes from
    find_min_entropy_split; the result is the mean of the values on the side
    of that split reported as having the higher entropy.
    '''
    threshold, side = find_min_entropy_split(col)
    # A zero flag means no split was chosen (seen for columns with a single
    # non-null value); fall back to 0 in that case.
    if side == 0:
        return 0
    chosen = col[col > threshold] if side == 1 else col[col <= threshold]
    return np.mean(chosen)

In [833]:
# Number of rows in the test frame.
test_data.shape[0]

16000