In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn_pandas import DataFrameMapper

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
features = pd.read_csv('background.csv', low_memory=False, index_col='challengeID')
# Fix date bug
features.cf4fint = ((pd.to_datetime(features.cf4fint) - pd.to_datetime('1960-01-01')) / np.timedelta64(1, 'D')).astype(int)

features.shape

(4242, 12944)

In [15]:
# replace negative numbers with NA
num = features._get_numeric_data()
num[num < 0] = np.nan

# delete columns that are at over 3/4 NA
features_1 = features.dropna(axis=1, how='any', thresh=features.shape[0]/4)
features_1.shape

nulls = pd.isnull(features_1).sum() > 0
cols_with_nan = nulls[nulls == True]
cols_with_nan.head()
print("# of columns with missing values: {}".format(len(cols_with_nan)))

(4242, 10662)

m1citywt    True
m1f7        True
m1h3        True
m1i2b       True
m1i11       True
dtype: bool

# of columns with missing values: 1160


In [16]:
features_nan = features_1[cols_with_nan.keys()]
features_nan.head()

Unnamed: 0_level_0,m1citywt,m1f7,m1h3,m1i2b,m1i11,m1j2a,m1j2b,m1j2d,cm1hhinc,cm1inpov,...,m3d9,f3d7,m4d6,m4d7,f4d6,f4d7,m5c6,m5d20,f5c6,k5f1
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
418,4.187361,7.0,2.0,1121.466228,,1.734794,,,5530.303798,0.0,...,,,,,,,10.258649,,10.592059,9.566678
2882,33.213688,11.0,1.0,0.0,,10.118418,,1.042914,70002.72444,5.260924,...,,10.137821,,,,,,,,
2060,14.125088,3.0,5.0,759.351277,,,,,33242.969722,1.411334,...,,10.926755,8.504683,10.117084,10.025368,9.080795,9.237308,,9.969594,9.814014
2533,19.746658,1.0,1.0,0.0,3.0,2.771169,,,0.0,0.0,...,,,,,,,,8.660517,,10.085962
2118,6.716506,7.0,2.0,1252.766703,,8.725409,,,11411.891004,0.855543,...,9.136485,,,,,,,8.921872,,9.566678


In [25]:
important_features = ['f1b20', 't5b4y', 'k5g2m', 'k5e2a', 'p5q3bw', 'n5d3c2_3', 'm5g13', 'm5f16'
, 'ffcc_cen', 'f1b9b2', 'm2b31e', 'm3b16p1', 'm5b22_10', 'k5g2c', 'k5g2h', 'k5g2d'
, 'p5q3bt', 'hv3m23', 'f3b3', 'n5f12', 'm4k24c', 'hv3m7', 'p5l18', 'p5q3cg'
, 'm5d18d', 'hv3m21', 't5b4r', 'ffcc_pof', 'hv4s1_ot', 'ffcc_fam', 'p5j4b'
, 'm2c37a3', 'm2h3', 'cf5povco', 'f5j6c', 'm3i3c', 'm5g19a', 'f5j6f', 'm1d2c'
, 'm3c34', 'f5i13', 'hv5_ppvt', 'cf1edu', 't5b1u', 'cm2povco', 'm1i1', 'm1i3'
, 'hv5_wj10'] # Found using naive imputation, feature selecting using lasso
len(important_features)

# features that have nan that can be more efficiently computed
important_imputable_features = list(set(important_features) & set(features_nan.columns.values))
important_imputable_features

48

['hv3m7', 'cm2povco', 'hv3m21', 'hv3m23', 'f5i13', 'cf5povco']

In [27]:
# output features_reduced
for f in important_features:
    if not np.any(features_1.columns[:] == f):
        important_features.remove(f)
len(important_features)
features_reduced = features_1[important_features]
features_reduced

37

Unnamed: 0_level_0,f1b20,t5b4y,k5g2m,k5e2a,p5q3bw,n5d3c2_3,m5g13,m5f16,f1b9b2,m3b16p1,...,m5g19a,f5j6f,m1d2c,m3c34,f5i13,cf1edu,t5b1u,cm2povco,m1i1,m1i3
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
418,-9,0,0,4,2,-7,-6,-6,-9,-9,...,-6,2,1,-9,0.000000,1,1,1.388536,3,3
2882,1,-9,-9,-9,-9,-7,-9,-9,-6,-6,...,-9,-6,1,2,72763.497873,3,-9,2.796978,6,6
2060,1,2,1,1,2,-7,-6,-6,-6,-6,...,-6,2,1,2,0.000000,1,3,0.555918,4,2
2533,-9,1,3,0,1,-7,1,1,-9,-6,...,-6,-9,2,1,,-3,2,,4,-2
2118,-9,1,0,0,2,-7,1,-6,-9,-6,...,-6,-9,2,1,,2,2,0.000000,5,4
2935,2,-9,0,0,2,-7,-6,-6,2,-6,...,-6,-6,1,2,5417.071174,2,-9,0.755137,3,4
1434,-9,1,3,1,1,-7,-6,-6,-9,-9,...,-6,-9,2,-9,,2,2,,6,4
3768,1,1,0,0,1,-7,-6,1,-6,-6,...,-2,1,2,2,0.000000,2,2,0.111805,6,4
1452,1,3,3,0,1,-7,-6,2,-6,-6,...,-6,-6,1,2,16661.864412,2,2,1.976687,6,4
329,2,-9,3,2,2,-7,-6,-6,2,-6,...,3,-6,1,-2,,2,-9,0.632874,6,5


In [8]:
features_nan.to_csv("features_nan.csv")

In [28]:
features_reduced.to_csv("features_reduced.csv")