In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
orig_train_data = pd.read_csv("data/train_2008.csv")
train_data = orig_train_data.copy()

orig_test_data = pd.read_csv("data/test_2008.csv")
test_data = orig_test_data.copy()

In [3]:
train_data_orig_cols = list(orig_train_data.columns) # original columns, with target PES1 on the end
target_var = orig_train_data['PES1']
target_var_bin = target_var.apply(lambda x: 0 if x == 2 else x)
train_data.drop('PES1', axis=1, inplace=True)

In [4]:
# Drop allocation flag features
allocation_flag_feats = ['PRCITFLG', 'PRWERNAL', 'PRHERNAL', 'HXTENURE', 'HXHOUSUT', 'HXTELHHD', 'HXTELAVL', 'HXPHONEO']
allocation_flag_feats.extend([col for col in train_data.columns if col[:2] == 'PX'])
train_data.drop(allocation_flag_feats, axis=1, inplace=True)
test_data.drop(allocation_flag_feats, axis=1, inplace=True)

# Drop "weight" features
weight_feats = [col for col in train_data.columns if col[-3:] == 'WGT']
train_data.drop(weight_feats, axis=1, inplace=True)
test_data.drop(weight_feats, axis=1, inplace=True)

# Drop features that are 100% one answer
one_answer_feats = ['HRMONTH', 'HRYEAR4', 'HUBUSL3', 'HUBUSL4', 'PULAYAVR', 'PULKDK2', 'PULKDK3', 
                   'PULKDK4', 'PULKDK5', 'PULKDK6', 'PULKPS2', 'PULKPS3', 'PULKPS4', 'PULKPS5', 'PULKPS6',
                   'PUDWCK3', 'PUJHDP1O', 'PUERNH1C', 'PTHR', 'PUERN2', 'PTOT',]
for feat in train_data.columns:
    if len(train_data[feat].unique()) == 1:
        one_answer_feats.append(feat)
train_data.drop(one_answer_feats, axis=1, inplace=True)
test_data.drop(one_answer_feats, axis=1, inplace=True)

# Drop some other columns that are "bad"
bad_feats = ['QSTNUM', 'PRERNWA', 'PRERNHLY', 'PEERNH1O', 'HRHHID2', 'GTCSA', 'GTCO', 'HUINTTYP', 'HURESPLI', 'HRMIS',
            'PRDTOCC1', 'PRFAMREL', 'PUSLFPRX', 'OCCURNUM', 'PULINENO', 'PRMJOCC1', 'PRCHLD', 'GTCBSA', 'HRLONGLK',]
train_data.drop(bad_feats, axis=1, inplace=True)
test_data.drop(bad_feats, axis=1, inplace=True)

In [5]:
# Dummify categorical vars
to_dummy = ['GEREG', 'HUBUS', 'PTDTRACE', 'PENATVTY', 'PUABSOT', 'PEIO1COW', 'HUFINAL', 'GESTCEN', 'GESTFIPS',
            'PEIO1ICD', 'PEIO1OCD', 'PEIO2ICD', 'PEIO2OCD', 'PRCITSHP', 'PUDIS', 
           'PRABSREA', 'PRWKSTAT', 'HUPRSCNT', 'PERRP', 'GTCBSAST', 'PRMJOCGR', 'HRHTYPE', ]

train_dummy_df = pd.DataFrame()
test_dummy_df = pd.DataFrame()

for var in to_dummy:
    train_dummy_vars = pd.get_dummies(train_data[var], prefix=var)
    train_dummy_df = pd.concat([train_dummy_df, train_dummy_vars], axis=1)
    
    test_dummy_vars = pd.get_dummies(test_data[var], prefix=var)
    test_dummy_df = pd.concat([test_dummy_df, test_dummy_vars], axis=1)
    
# Drop the original categorical variables
train_data.drop(to_dummy, axis=1, inplace=True)
test_data.drop(to_dummy, axis=1, inplace=True)

In [6]:
# Add dummy vars to the data
train_data = pd.concat([train_data, train_dummy_df], axis=1)
test_data = pd.concat([test_data, test_dummy_df], axis=1)

In [7]:
# Keep track of non-dummy variables
non_dummy_cols = [col for col in train_data.columns if '_' not in col]

In [9]:
### THAT NEW NEW - use the mean of the higher entropy split for NaN. 

# Replace negative values (all forms of N/A) with NaN
for feat in train_data.columns:
    try:
        train_data[feat] = train_data[feat].apply(lambda x: np.NaN if x < 0 else x)
        test_data[feat] = test_data[feat].apply(lambda x: np.NaN if x < 0 else x)
    except:
        continue
    
# Check for columns that are all NaN, and delete them
all_nan_cols = []
for feat in non_dummy_cols:
    if np.all(np.isnan(train_data[feat])) or np.all(np.isnan(test_data[feat])):
        all_nan_cols.append(feat)
train_data.drop(all_nan_cols, axis=1, inplace=True)
test_data.drop(all_nan_cols, axis=1, inplace=True)

In [10]:
# Update non-dummy variables
non_dummy_cols = [col for col in train_data.columns if '_' not in col]

In [11]:
# Replace NaN with the mean of the higher entropy split

for feat in non_dummy_cols: # Only need to worry about nondummies
    train_data[feat].fillna(get_fill_value(train_data[feat]), inplace=True)
    test_data[feat].fillna(get_fill_value(test_data[feat]), inplace=True)

KeyboardInterrupt: 

In [351]:
# ### First pass - use mean for NaN. Later we'll do intelligent filling on important features

# # Replace negative values (all forms of N/A) with NaN
# for feat in train_data.columns:
#     train_data[feat] = train_data[feat].apply(lambda x: np.NaN if x < 0 else x)
#     test_data[feat] = test_data[feat].apply(lambda x: np.NaN if x < 0 else x)
    
# # Replace NaN with the mean of the column
# for feat in train_data.columns:
#     train_data[feat].fillna(train_data[feat].mean(), inplace=True)
#     test_data[feat].fillna(test_data[feat].mean(), inplace=True)
    
# # Check for columns that are all NaN, and delete them
# all_nan_cols = []
# for feat in train_data.columns:
#     if np.all(np.isnan(train_data[feat])):
#         all_nan_cols.append(feat)
# train_data.drop(all_nan_cols, axis=1, inplace=True)
# test_data.drop(all_nan_cols, axis=1, inplace=True)

In [563]:
# Few more cleanups
train_data.drop('id', axis=1, inplace=True)
test_data.drop('id', axis=1, inplace=True)
train_data['PES1'] = target_var

ValueError: labels ['id'] not contained in axis

In [543]:
# Rewrite out the non-feature-selected data sets
train_data.to_csv("data/training_data_feat_eng_smart_fill_raw.csv", index=False)
test_data.to_csv("data/test_data_feat_eng_smart_fill_raw.csv", index=False)

In [566]:
null_cols = []
for feat in train_data.columns:
    if pd.isnull(train_data[feat]).values.any():
        null_cols.append(feat)

In [590]:
for col in null_cols:
    print(col)
    train_data[col].fillna(get_fill_value(train_data[col]), inplace=True)
    test_data[col].fillna(get_fill_value(test_data[col]), inplace=True)

PUBUS1


ValueError: list.remove(x): x not in list

In [580]:
for col in null_cols:
    print((col, train_data[col].count()))

('PUBUS1', 2203)
('PUBUSCK3', 8582)
('PRWNTJOB', 20794)
('PREMP', 40379)
('PRNAGPWS', 30358)
('PRNAGWS', 40236)
('PUNLFCK1', 5930)


In [588]:
sorted(train_data['PUNLFCK1'].unique())

[1.0, nan, 2.0]

In [567]:
null_cols

['PUBUS1', 'PUBUSCK3', 'PRWNTJOB', 'PREMP', 'PRNAGPWS', 'PRNAGWS', 'PUNLFCK1']

In [565]:
# Fit a classifier for evaluating most important features
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
clf = ExtraTreesClassifier(n_estimators=200)
clf = clf.fit(train_data.ix[:, :-1], target_var)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [319]:
# Make a dataframe of the features and their importances
features = pd.DataFrame()
features['feature'] = train_data.columns[3:]
features['importance'] = clf.feature_importances_

In [320]:
# Take a look at the most important features
features.sort(['importance'],ascending=False).head(200)

  if __name__ == '__main__':


Unnamed: 0,feature,importance
25,PEEDUCA,0.025309
18,PEAGE,0.022976
6,HUFAMINC,0.022716
1,HETENURE,0.013238
16,GTCBSASZ,0.012475
9,HRNUMHOU,0.012330
203,PEDIPGED,0.012127
31,PRMARSTA,0.010167
205,PECYC,0.010114
20,PEMARITL,0.009260


In [325]:
# Pull out the most important features
model = SelectFromModel(clf, prefit=True)
train_data_selected_feats = model.transform(train_data.ix[:, 3:])

In [328]:
# Kinda hacky, but we need some way of getting just the selected features in the test set (selecting features kills
# the column names).
imp_mean = features.importance.mean()
keep_feats = []
for f in features['feature']:
    if float(features[features['feature'] == f]['importance']) >= imp_mean:
        keep_feats.append(f)

In [332]:
# Keep hacking....
keep_feats = [f for f in keep_feats if f in test_data.columns]

In [345]:
# Finally get the features we need
train_data_selected_feats = train_data[keep_feats]
test_data_selected_feats = test_data[keep_feats]

# Add the target back to the training data
train_data_selected_feats['targ'] = target_var

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [346]:
# Write out the data sets
train_data_selected_feats.to_csv("data/training_data_cleaned2.csv", index=False)
test_data_selected_feats.to_csv("data/test_data_cleaned2.csv", index=False)

In [8]:
######### Havent used this stuff yet

# For numerics, define a find_min_entropy_split() function, and use that on
# negative valued responses. Negative valued responses indicate that the
# survey participant did not respond in some capacity.
def get_split_points(col):
    '''Get the points between the unique values in col. col is a pandas series. Set the stride such that
    there are ~10 splits in the feature.'''
    unique = col.unique()
    sort = sorted(unique)
    splits = []
    stride = int(np.ceil(len(sort) / 10))
    i = 0
    while i < (len(sort) - stride):
        splits.append(0.5 * (sort[i] + sort[i + stride]))
        i += stride
    return splits

def get_entropy(arr):
    '''Get the entropy of an array. Assumes the values of the array are 0/1'''
    if len(arr) == 0:
        return 0.0
    frac_pos = sum(arr) / len(arr)
    size = len(arr)
    if frac_pos == 0 or frac_pos == 1: # Workaround for defining 0 * log(0) = 0
        return 0.0
    return -size * (frac_pos * np.log(frac_pos) + (1 - frac_pos) * np.log(1 - frac_pos))

def find_min_entropy_split(col):
    '''Find the threshold in a column of data that a decision tree would choose using
    the entropy impurity measure. Return that threshold, and flag indicating which side of
    the split has higher entropy. flag=1 => the datapoints above the split have higher entropy, and
    flag=-1 => the datapoints below the split have higher entropy.'''
    split_points = get_split_points(col)
    min_entropy = np.inf
    min_entropy_split = None
    flag = 0
    for split in split_points:
        bools = col > split
        above = [i for i in range(len(col)) if bools[i]]
        below = [i for i in range(len(col)) if not bools[i]]
        voter_status_above = [target_var_bin[i] for i in above]
        voter_status_below = [target_var_bin[i] for i in below]
        entropy_above = get_entropy(voter_status_above)
        entropy_below = get_entropy(voter_status_below)
        entropy_total = entropy_above + entropy_below
        if entropy_total < min_entropy:
            min_entropy = entropy_total
            min_entropy_split = split
            if entropy_above > entropy_below:
                flag = 1
            else:
                flag = -1
    return min_entropy_split, flag

def get_fill_value(col):
    '''First find the split that minimizes entropy (ie the split chosen by a tree), then return the mean
    of the half of the split that has higher entropy.'''
    split, flag = find_min_entropy_split(col)
    if flag == 0:
        raise ValueError# If the flag is zero, something went wrong
    if flag == 1:
        higher_entropy_side = col[col > split]
    else:
        higher_entropy_side = col[col <= split]
    return np.mean(higher_entropy_side)

In [342]:
numeric_cols = [col for col in train_data.columns if ('_' not in col and col not in to_dummy)]
numeric_cols = numeric_cols[3:]

In [343]:
# For each feature, apply find_max_entropy_split and fill the negative 
# values with the mean of the higher entropy split


In [382]:
get_split_points(train_data['PEEDUCA'])

[32.0, 34.0, 36.0, 38.0, 40.0, 42.0, 44.0]

In [385]:
find_min_entropy_split(train_data['PEAGE'])

(35.5, 1)

In [386]:
get_fill_value(train_data['PEAGE'])

56.130146964308665

In [388]:
import time
start = time.time()
get_fill_value(train_data['PEAGE'])
end = time.time()
print(end - start)

25.91235613822937


In [389]:
len(train_data.columns)

1672

In [491]:
train_data['HRINTSTA'].unique()

array([1])

In [492]:
test_data['HRINTSTA'].unique()

array([1])