In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



## Data load

In [2]:
# Read the raw training table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [3]:
# Group columns by type.
# Real lists are built on purpose: later cells index into them and call
# .remove() on them, which the lazy map/filter objects of Python 3 do not allow.
colnames = [str(c) for c in df.columns.values]
# The first column is taken as the row id and the second as the label.
id_col = colnames[0]
target_col = colnames[1]
cat_cols = [c for c in colnames if '_cat' in c]
bin_cols = [c for c in colnames if '_bin' in c]
# Everything that is neither categorical, binary, id nor label is numeric.
num_cols = [c for c in colnames
            if '_cat' not in c and '_bin' not in c and c not in (id_col, target_col)]

## Data exploration

In [4]:
# Cut bad fields, based on kernel discussions.
# Both steps are guarded so that re-running this cell is a no-op instead of
# raising ValueError (list.remove) or KeyError (DataFrame.drop).
bad_field = 'ps_ind_12_bin'
if bad_field in bin_cols:
    bin_cols.remove(bad_field)
df.drop(bad_field, axis=1, inplace=True, errors='ignore')

In [5]:
# Remove uninformative or noisy shadow features, plus every remaining 'calc' column.
shadows = ['ps_car_11_cat',
           'ps_calc_14',
           'ps_calc_11',
           'ps_calc_06',
           'ps_calc_16_bin',
           'ps_calc_19_bin',
           'ps_calc_20_bin',
           'ps_calc_15_bin',
           'ps_ind_11_bin',
           'ps_ind_10_bin']

# Only columns that are still present are dropped, so re-running the cell is
# harmless; on a fresh kernel this selects exactly the shadow and calc columns.
shadow_names = set(shadows)
doomed = [c for c in df.columns if c in shadow_names or 'calc' in c]

# Keep the column-type lists in sync with the frame. A name is removed from the
# first list that contains it (binary, then numeric, then categorical).
for name in list(shadows) + doomed:
    for group in (bin_cols, num_cols, cat_cols):
        if name in group:
            group.remove(name)
            break

# One drop for all columns instead of rebuilding the frame once per column.
df.drop(doomed, axis=1, inplace=True)

In [6]:
# Split train and test
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Shuffle the row *positions* and hold out 33% of them; the fixed seed keeps
# the split reproducible. len(df) counts every row, whereas the previous
# df[target_col].count() would silently skip rows with a missing label.
train, test = train_test_split(list(range(len(df))), test_size=0.33, random_state=0)

# iloc, because `train` / `test` hold positions rather than index labels.
train_df = df.iloc[train, :]
test_df = df.iloc[test, :]



In [7]:
# Add some extra features
# The column statistics must not see the -1 marker, so it is masked as NaN
# while they are computed and restored afterwards.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
train_df = train_df.replace(-1, np.nan)
d_median = train_df.median(axis=0)
d_mean = train_df.mean(axis=0)
train_df = train_df.fillna(-1)

def transform_df(indf):
    """Return a copy of ``indf`` extended with engineered feature columns.

    Added columns:
      * ``ps_car_13_x_ps_reg_03``: product of ``ps_car_13`` and ``ps_reg_03``.
      * ``negative_one_vals``: per row, how many of the original feature
        columns (everything except ``id`` and ``target``) equal -1.
      * ``<col>_median_range`` / ``<col>_mean_range``: 1 where the value is
        strictly greater than the module-level ``d_median[col]`` /
        ``d_mean[col]``, else 0. Built for every original feature column
        whose name does not contain ``_bin``.

    Parameters
    ----------
    indf : pandas.DataFrame (or anything ``pd.DataFrame`` accepts) that holds
        ``ps_car_13``, ``ps_reg_03`` and the columns indexed in ``d_median``
        and ``d_mean``.

    Returns
    -------
    pandas.DataFrame
        A new frame. The input is left untouched, so calling this on a slice
        of another frame neither mutates that frame nor triggers
        SettingWithCopyWarning.

    Raises
    ------
    KeyError
        If a required column is missing from ``indf``, or a non-binary
        feature column has no entry in ``d_median`` / ``d_mean``.
    """
    # pd.DataFrame(indf) alone would share data with the caller's object.
    indf = pd.DataFrame(indf).copy()
    # Snapshot the feature names first so the columns added below are neither
    # counted nor thresholded themselves.
    dcol = [c for c in indf.columns if c not in ['id','target']]
    indf['ps_car_13_x_ps_reg_03'] = indf['ps_car_13'] * indf['ps_reg_03']
    indf['negative_one_vals'] = np.sum((indf[dcol]==-1).values, axis=1)
    for c in dcol:
        if '_bin' not in c: #standard arithmetic
            # Builtin int replaces the np.int alias removed in NumPy 1.24.
            indf[c+str('_median_range')] = (indf[c].values > d_median[c]).astype(int)
            indf[c+str('_mean_range')] = (indf[c].values > d_mean[c]).astype(int)
    return indf

# Derive the extra features for both splits (train first, then test).
train_df, test_df = [transform_df(split) for split in (train_df, test_df)]

In [8]:
# Class prevalence vectorizer for categoricals: maps each category to the mean label observed for it.
# Used in processing some columns and in understanding field importance.

class prevalence_vectorizer:
    """Encode categorical values by the label rate observed for them in training.

    For every field in ``field_list`` the instance keeps a mapping
    ``category value -> sum(label) / number of rows`` that ``train`` fills in.
    ``parse`` then replaces each value of a vector by its stored rate.
    """

    def __init__(self,field_list):
        """Remember the fields to encode and give each one an empty mapping."""
        self.field_list= field_list
        self.vectorizer= dict((f, {}) for f in field_list)

    def train(self,df,labelcol):
        """Learn the per-category label rate of every configured field.

        Parameters
        ----------
        df : pandas.DataFrame
            Holds the fields to encode and the column ``labelcol``.
        labelcol : str
            Name of the numeric label column in ``df``.

        Fields of ``field_list`` that are absent from ``df`` are reported with
        a printed warning and keep whatever mapping they already had.
        Returns None.
        """
        # A concrete set: a lazy map() would be exhausted by the first
        # membership test on Python 3.
        data_fields= set(str(c) for c in df.columns)
        for f in self.field_list:
            if f in data_fields:
                by_level= df.groupby(f)[labelcol]
                # sum / row count per category; the float cast keeps the
                # division exact under Python 2 as well.
                rates= by_level.sum().astype(float) / by_level.size()
                self.vectorizer[f]= dict(zip(rates.index.values, rates.values))
            else:
                print('Warning: field '+f+' not in train data')
        return None

    def parse(self,vec,field_name):
        """Translate the values in ``vec`` into the rates learned for ``field_name``.

        Parameters
        ----------
        vec : iterable of category values.
        field_name : one of the names passed to the constructor.

        Returns
        -------
        list
            One rate per element of ``vec``, in order.

        Raises
        ------
        ValueError
            If ``field_name`` is not one of the configured fields.
        KeyError
            If ``vec`` holds a value that has no learned rate for that field.
        """
        if field_name not in self.field_list:
            # The previous message concatenated str + list and therefore
            # raised TypeError before anything was reported.
            available= ', '.join(str(name) for name in self.field_list)
            raise ValueError(str(field_name) + ' not in vectorizer. Available fields: ' + available)
        rates= self.vectorizer[field_name]
        # A list (not a lazy map) so the result can be assigned to an array column.
        return [rates[x] for x in vec]

## Data processing

Processing plan: 
- cut rows with missing ps_car_11 and ps_car_12
- Normalize all numerics, except: [ps_car_14 and ps_reg_03]
- For [ps_car_14 and ps_reg_03], bin and convert to class prevalence (future work: fit for the missing values?)
- Treat categoricals: dict vectorize, except for: 
     [ps_car_11_cat, ps_car_06_cat, ps_car_04_cat, ps_car_01_cat, ps_car_09_cat, ps_ind_05_cat], which we turn into class prevalence rates

TODO: better treatment of: [ps_car_14 and ps_reg_03]

In [9]:
# Categorical fields encoded as class prevalence; every other categorical is one-hot encoded.
toPrevalate = ['ps_car_06_cat', 'ps_car_01_cat']
# A real list: it is reused as a column indexer for both splits, which a lazy
# Python 3 filter object cannot do.
toOneHot = [c for c in cat_cols if c not in toPrevalate]

In [10]:
# one hot encoder
import sklearn.preprocessing
# NOTE(review): newer scikit-learn releases spell this option `sparse_output`;
# confirm against the installed version.
enc = sklearn.preprocessing.OneHotEncoder(sparse=False)
# +1.0 shifts every category code up by one, so the -1 code becomes 0 before encoding.
# .values replaces DataFrame.as_matrix(), which pandas 1.0 removed.
X_train_onehot = enc.fit_transform(train_df.loc[:, toOneHot].values + 1.0)

In [11]:
# Prevalence vectorizer: fit on the training split, then encode it column by column.
vectorizer = prevalence_vectorizer(toPrevalate)
prevalence_fit_cols = toPrevalate + [target_col]
vectorizer.train(train_df.loc[:, prevalence_fit_cols], target_col)

X_train_prevalence = np.zeros([len(train), len(toPrevalate)])
for col_idx, col_name in enumerate(toPrevalate):
    raw_values = train_df.loc[:, col_name].values
    X_train_prevalence[:, col_idx] = vectorizer.parse(raw_values, col_name)

In [12]:
# Per-column (mean, std) of the training split, computed over the
# non-missing entries only (values >= 0).
numerical_norms = {}
for col_name in num_cols:
    observed = train_df.loc[train_df[col_name] >= 0, col_name]
    numerical_norms[col_name] = (observed.mean(), observed.std())

In [13]:
# Standardise the numerical fields with the training statistics.
# TODO: better treatment of [ps_car_14 and ps_reg_03]
# NOTE(review): rows holding the -1 marker are scaled like any other value here.
X_train_numeric = np.zeros([len(train), len(num_cols)])
for col_idx, col_name in enumerate(num_cols):
    col_mean, col_std = numerical_norms[col_name]
    X_train_numeric[:, col_idx] = (train_df[col_name].values - col_mean) / col_std

In [14]:
# Final training matrix, column blocks in this order:
# numeric | prevalence-encoded | one-hot | binary.
# NOTE(review): the columns added by transform_df are in none of these blocks,
# so they never reach the matrix - confirm whether that is intended.
X_train = np.concatenate([X_train_numeric,
                          X_train_prevalence,
                          X_train_onehot,
                          # .values replaces DataFrame.as_matrix(), removed in pandas 1.0
                          train_df.loc[:, bin_cols].values
                         ],
                         axis=1)

In [15]:
# Persist the training matrix and labels.
# An open file object is passed on purpose: given a path string, np.save would
# append '.npy' to the name. The with-blocks close (and flush) the handles,
# which the bare open() calls never did.
with open('train_matrix.bin', 'wb') as matrix_file:
    np.save(matrix_file, X_train)
with open('train_labels.bin', 'wb') as labels_file:
    np.save(labels_file, train_df[target_col].values)

Now repeat the encodings for the test set, reusing the encoders and statistics fitted on the training set (no re-fitting).

In [16]:
# Encode the test split with the objects fitted on the training split:
# `enc`, `vectorizer` and `numerical_norms` are reused, never re-fitted.
# NOTE(review): a category that only occurs in the test split presumably makes
# enc.transform / vectorizer.parse raise - confirm how that should be handled.

# .values replaces DataFrame.as_matrix(), which pandas 1.0 removed.
X_test_onehot = enc.transform(test_df.loc[:, toOneHot].values + 1.0)

X_test_prevalence = np.zeros([len(test), len(toPrevalate)])
for col_idx, col_name in enumerate(toPrevalate):
    X_test_prevalence[:, col_idx] = vectorizer.parse(test_df.loc[:, col_name].values, col_name)

X_test_numeric = np.zeros([len(test), len(num_cols)])
for col_idx, col_name in enumerate(num_cols):
    col_mean, col_std = numerical_norms[col_name]
    X_test_numeric[:, col_idx] = (test_df[col_name].values - col_mean) / col_std

# Same block order as the training matrix: numeric | prevalence | one-hot | binary.
X_test = np.concatenate([X_test_numeric,
                         X_test_prevalence,
                         X_test_onehot,
                         test_df.loc[:, bin_cols].values
                        ],
                        axis=1)

# File objects keep the exact file names (np.save appends '.npy' to path
# strings); the with-blocks close the handles that were previously leaked.
with open('test_matrix.bin', 'wb') as matrix_file:
    np.save(matrix_file, X_test)
with open('test_labels.bin', 'wb') as labels_file:
    np.save(labels_file, test_df[target_col].values)