In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read the raw 2008 test file once and keep that frame untouched.
orig_data = pd.read_csv(filepath_or_buffer="data/test_2008.csv")

# All cleaning below happens on an independent deep copy, so orig_data
# remains available for reference.
data = orig_data.copy(deep=True)

In [4]:
# Column names of the raw file, in their original order.
# NOTE(review): the original note says the target PES1 is the last column — confirm for this file.
data_orig_cols = orig_data.columns.tolist()

# Feature Engineering

In [5]:
# One-hot encode ("dummify") the categorical variables.
to_dummy = ['GEREG', 'PTDTRACE', 'PENATVTY', 'PEIO1COW', 'PEAFWHN1', 'GTCBSASZ']

# Encode every listed column in a single call instead of concatenating onto a
# growing frame inside a loop. With `columns=` given, each column is encoded
# regardless of dtype and the indicators are named "<column>_<value>", emitted
# in the order of `to_dummy` — the same names and order the per-column loop produced.
dummy_df = pd.get_dummies(data[to_dummy], columns=to_dummy)

# Swap the original categorical columns for their indicator columns, which are
# appended after the remaining features.
data = pd.concat([data.drop(columns=to_dummy), dummy_df], axis=1)

# Drop allocation flag features; handle missing values

In [6]:
# Remove every column whose name carries the 'WGT' suffix ("weight" features).
weight_feats = [name for name in data.columns if name.endswith('WGT')]
data.drop(columns=weight_feats, inplace=True)

# Remove a hand-picked set of additional columns considered "bad" as features.
bad_feats = [
    'QSTNUM', 'PRERNWA', 'PRERNHLY', 'PEERNH1O', 'HRHHID2',
    'GTCSA', 'GTCO', 'HUINTTYP', 'HURESPLI', 'HRMIS',
    'PRDTOCC1', 'PRFAMREL', 'PUSLFPRX', 'OCCURNUM', 'PULINENO',
    'PRMJOCC1', 'PRCHLD', 'GTCBSA', 'HRLONGLK',
]
data.drop(columns=bad_feats, inplace=True)

In [7]:
# Drop allocation flag features (columns whose name starts with 'PX').
allocation_flag_feats = [col for col in data.columns if col.startswith('PX')]
data = data.drop(columns=allocation_flag_feats)

# Replace negative values (all forms of N/A) with NaN.
# Vectorised mask instead of a per-element apply; it also avoids the `np.NaN`
# alias, which no longer exists in NumPy 2.0.
data = data.mask(data < 0)

# Delete columns that contain nothing but NaN. Doing this before the mean fill
# gives the same result as doing it after: the mean of an all-NaN column is
# itself NaN, so the fill could never populate such a column.
all_nan_cols = [feat for feat in data.columns if data[feat].isna().all()]
data = data.drop(columns=all_nan_cols)

# Replace the remaining NaN with the mean of their column. Filling on the frame
# and rebinding avoids `data[feat].fillna(..., inplace=True)`, a chained in-place
# call that does not update `data` when pandas Copy-on-Write is active.
data = data.fillna(data.mean(numeric_only=True))

# Remove misleading features (mostly codes)

Starting with data that has engineered features, negative values (N/A codes) replaced by the feature mean, and allocation flags removed.

In [9]:
# Columns to discard: two chosen by eyeballing, plus any "weight" column
# (name ending in 'WGT') that exists only for statistical tallying.
wgt_suffixed = [name for name in data.columns if name.endswith('WGT')]
bad_cols = ['GESTFIPS', 'GESTCEN'] + wgt_suffixed

# Remove them from the working frame.
data.drop(columns=bad_cols, inplace=True)

# Keep the columns we've selected and write out the cleaned data

In [15]:
import pickle as pkl

# Load the previously saved feature selection.
# NOTE(review): unpickling can execute arbitrary code — only load
# 'keep_feats.pkl' if it was produced by a trusted run of this project.
# The context manager closes the file handle, which a bare open() never did.
with open('keep_feats.pkl', 'rb') as keep_feats_file:
    keep_feats = pkl.load(keep_feats_file)

In [16]:
# Display how many entries the loaded feature selection contains.
len(keep_feats)

126

In [17]:
# Restrict the frame to the loaded selection.
# NOTE(review): assumes keep_feats is a list of column labels present in `data` — confirm.
data = data[keep_feats]

In [18]:
# Persist the cleaned frame as CSV; index=False leaves the row index out of the file.
data.to_csv(path_or_buf='data/test_2008_cleaned3.csv', index=False)