In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.feature_selection import chi2, f_classif
from sklearn.tree import DecisionTreeRegressor as DTR
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [2]:
# Load the target (the column at position 1 of the header-less train_y.csv)
# and the feature matrix (first CSV column used as the row index).
train_y = pd.read_csv('train_y.csv', header=None, usecols=[1])
train_x = pd.read_csv('train_x.csv', index_col=0)

In [3]:
# Split the feature columns into two groups:
#   cat_list - categorical columns, as listed in column '0' of train_cat_list.csv
#   cnt_list - every remaining column of train_x (treated as continuous)
cat_list = list(pd.read_csv('train_cat_list.csv')['0'])
cnt_list = [name for name in train_x.columns if name not in cat_list]

# Despite its name, cont_col holds ALL columns: continuous first, then categorical.
cont_col = np.concatenate((cnt_list, cat_list))


In [4]:
# Shortlist the features by first segmenting them into high- and low-VIF groups,
# using a cutoff on the VIF value.

In [5]:
def vif_calc(data, col_roll, cutoff = 7.5):
    """Compute the variance inflation factor (VIF) for a set of columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the candidate predictors.
    col_roll : list-like of column labels
        Columns of ``data`` to evaluate. The VIF of each column is computed
        against the matrix formed by this same selection.
    cutoff : float, default 7.5
        A VIF greater than or equal to this value is labelled 'High',
        anything else 'Low'.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated column with the columns 'VIF', 'var' (the
        column label) and 'VIF_type' ('High' / 'Low'). An empty selection
        yields an empty frame with the same three columns.
    """
    print('Calculating VIF...')
    subset = data[col_roll]
    # The design matrix is the same for every column, so extract it once.
    # `.values` works on old and new pandas (`.as_matrix()` was removed).
    design = subset.values
    names = list(subset.columns)
    scores = [vif(design, position) for position in range(len(names))]

    # Build the frame in one step instead of appending row by row, which is
    # quadratic and relies on the removed DataFrame.append. The explicit
    # column order ('VIF' before 'var') is presumably what the row-wise append
    # produced on the original pandas - verify if the CSV layout matters.
    vif_col = pd.DataFrame({'VIF': np.asarray(scores, dtype=float), 'var': names},
                           columns=['VIF', 'var'])

    vif_col['VIF_type'] = np.where(vif_col['VIF'] >= cutoff, 'High', 'Low')
    print('VIF check completed.')
    print('Dataframe with VIF values %s' % (vif_col.shape,))
    return vif_col


def RF_wrap(data, target, col_names, imp_cutoff = 0.005, n_trees = 500, verbose = False, oob_score = False, min_samples_leaf = 5):
    """Fit a random-forest regressor and report per-variable importance.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Response values passed to ``fit``.
    col_names : sequence of column labels
        Columns of ``data`` used as predictors (list or ndarray).
    imp_cutoff : float, default 0.005
        Importances greater than or equal to this are labelled 'High'.
    n_trees, verbose, oob_score, min_samples_leaf
        Forwarded to the random-forest constructor.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'Importance' and 'Imp_type' ('High' / 'Low'),
        one row per predictor.
    """
    # Use about 40% of the predictors per split, but never fewer than one:
    # with a single column the plain formula rounds to 0, which the estimator
    # rejects. len() also lets callers pass a plain list.
    max_features = max(1, int(round(0.4 * len(col_names))))
    rf = RF(n_estimators = n_trees, oob_score = oob_score, random_state = 1203,
            max_features = max_features, min_samples_leaf = min_samples_leaf,
            verbose = verbose)
    predictors = data[col_names]
    print('Fitting RF regressor..')
    rf.fit(predictors, target)
    print('Getting Variable Importance..')
    imp = pd.DataFrame({'Variable': list(predictors.columns),
                        'Importance': rf.feature_importances_},
                       columns=['Variable', 'Importance'])
    imp['Imp_type'] = np.where(imp['Importance'] >= imp_cutoff, 'High', 'Low')
    print('Done')
    print('Dataframe with RF feature selection %s' % (imp.shape,))
    return imp


def Tree_wrap(data, target, col_names, imp_cutoff = 0.005, min_samples_leaf = 5, min_samples_split = 10):
    """Fit a single decision-tree regressor and report per-variable importance.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Response values passed to ``fit``.
    col_names : sequence of column labels
        Columns of ``data`` used as predictors (list or ndarray).
    imp_cutoff : float, default 0.005
        Importances greater than or equal to this are labelled 'High'.
    min_samples_leaf, min_samples_split
        Forwarded to the decision-tree constructor.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'Importance' and 'Imp_type' ('High' / 'Low'),
        one row per predictor.
    """
    # Use about 40% of the predictors per split, but never fewer than one:
    # with a single column the plain formula rounds to 0, which the estimator
    # rejects. len() also lets callers pass a plain list.
    max_features = max(1, int(round(0.4 * len(col_names))))
    dtr = DTR(random_state = 1203, max_features = max_features,
              min_samples_leaf = min_samples_leaf,
              min_samples_split = min_samples_split)
    predictors = data[col_names]
    print('Fitting Decision Tree regressor..')
    dtr.fit(predictors, target)
    print('Getting Variable Importance..')
    imp = pd.DataFrame({'Variable': list(predictors.columns),
                        'Importance': dtr.feature_importances_},
                       columns=['Variable', 'Importance'])
    imp['Imp_type'] = np.where(imp['Importance'] >= imp_cutoff, 'High', 'Low')
    print('Done')
    print('Dataframe with Decision Tree feature selection %s' % (imp.shape,))
    return imp


def f_wrap(data, target, col_names, p_cutoff = 0.05):
    """Score columns with the ANOVA F-test and report the p-value of each.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Target values passed to ``f_classif``.
    col_names : list-like of column labels
        Columns of ``data`` to test.
    p_cutoff : float, default 0.05
        Columns with a p-value at or below this are labelled 'High'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'P_value_fclass' and 'Imp_type' ('High' / 'Low').
        Note that the p-value (not the F statistic) is what is reported.
    """
    predictors = data[col_names]
    # f_classif returns (F statistics, p-values); only the p-values are kept.
    p_values = f_classif(predictors, target)[1]
    fdat_df = pd.DataFrame({'Variable': list(predictors.columns),
                            'P_value_fclass': p_values},
                           columns=['Variable', 'P_value_fclass'])
    # Test for significance directly so that NaN p-values (which can arise,
    # e.g., for constant columns) compare False and are labelled 'Low'
    # instead of being reported as important.
    fdat_df['Imp_type'] = np.where(fdat_df['P_value_fclass'] <= p_cutoff, 'High', 'Low')
    print('Dataframe with Anova-F feature selection %s' % (fdat_df.shape,))
    return fdat_df


def chi2_wrap(data, target, col_names, p_cutoff = 0.05):
    """Score columns with the chi-squared test and report the p-value of each.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Target values passed to ``chi2``.
    col_names : list-like of column labels
        Columns of ``data`` to test.
    p_cutoff : float, default 0.05
        Columns with a p-value at or below this are labelled 'High'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'P_value_chi2' and 'Imp_type' ('High' / 'Low').
    """
    predictors = data[col_names]
    # chi2 returns (chi-squared statistics, p-values); only the p-values are kept.
    p_values = chi2(predictors, target)[1]
    chdat_df = pd.DataFrame({'Variable': list(predictors.columns),
                             'P_value_chi2': p_values},
                            columns=['Variable', 'P_value_chi2'])
    # Test for significance directly so that NaN p-values compare False and
    # are labelled 'Low' instead of being reported as important.
    chdat_df['Imp_type'] = np.where(chdat_df['P_value_chi2'] <= p_cutoff, 'High', 'Low')
    print('Dataframe with Chi2 feature selection %s' % (chdat_df.shape,))
    return chdat_df


def method_call(data, target, cols, method, pcut = 0.05, impcut = 0):
    """Dispatch to one of the feature-selection wrappers and normalise its output.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Target values.
    cols : sequence of column labels
        Columns of ``data`` to evaluate.
    method : {'rf', 'tree', 'anovaf', 'chi2'}
        Selection method to run.
    pcut : float, default 0.05
        P-value cutoff, used by 'anovaf' and 'chi2'.
    impcut : float, default 0
        Importance cutoff, used by 'rf' and 'tree'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'Importance', 'Imp_type' and 'Measure'. For the
        p-value based methods the 'Importance' column holds the p-value and
        'Measure' is "P-value"; otherwise 'Measure' is "Importance".

    Raises
    ------
    ValueError
        If ``method`` is not one of the four supported names. (Previously any
        unrecognised name silently ran the chi-squared test.)
    """
    if method not in ('rf', 'tree', 'anovaf', 'chi2'):
        raise ValueError("method must be one of 'rf', 'tree', 'anovaf', 'chi2'; got %r"
                         % (method,))

    if method in ('rf', 'tree'):
        # Tree-based wrappers size max_features from the column collection.
        wrapper = RF_wrap if method == 'rf' else Tree_wrap
        method_df = wrapper(data, target, np.asarray(cols), imp_cutoff = impcut)
        method_df['Measure'] = "Importance"
    else:
        wrapper = f_wrap if method == 'anovaf' else chi2_wrap
        method_df = wrapper(data, target, cols, p_cutoff = pcut)
        # Give every method the same column names so results can be stacked.
        method_df.columns = ['Variable', 'Importance', 'Imp_type']
        method_df['Measure'] = "P-value"

    print('Done')
    return method_df
        

In [6]:
def _select_group(data, target, cols, method, label, p_cutoff, imp_cutoff):
    """Run one selection method on one group of columns and tag the rows with ``label``."""
    if len(cols) == 0:
        # Nothing to fit: the estimators cannot be trained on zero columns.
        return pd.DataFrame(columns=['Variable', 'Importance', 'Imp_type', 'Measure', 'Var_data'])
    group_df = method_call(data, target, cols = cols, method = method,
                           pcut = p_cutoff, impcut = imp_cutoff)
    group_df['Var_data'] = label
    return group_df


def allvarsel2(data, target, itername, cont_col = cont_col, cont_method ='rf', categ_method ='rf', VIF_cutoff = 5,
               imp_cutoff = 0, p_cutoff = 0.05, cont_list = cnt_list, categ_list = cat_list,
               vif_out_dir = '/home/pgoyal/Projects/app/'):
    """Run VIF screening followed by feature selection on three variable groups.

    The columns in ``cont_col`` are split by VIF into a high and a low group.
    Feature selection is then run separately on (1) the high-VIF columns,
    (2) the low-VIF columns that also appear in ``cont_list`` and (3) the
    columns in ``categ_list``. The three result frames are stacked.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Target values.
    itername : object
        Used (via ``str``) as the base name of the VIF report CSV.
    cont_col : sequence of column labels
        Columns for which VIF is computed. The default is bound when the
        function is defined, so re-run this cell after changing the global.
    cont_method : {'rf', 'tree', 'anovaf'}
        Method applied to the high-VIF and low-VIF groups.
    categ_method : {'rf', 'tree', 'chi2'}
        Method applied to the categorical group.
    VIF_cutoff : float, default 5
        VIF at or above this is treated as high.
    imp_cutoff : float, default 0
        Importance cutoff for the tree-based methods.
    p_cutoff : float, default 0.05
        P-value cutoff for the statistical tests.
    cont_list, categ_list : sequence of column labels
        Continuous and categorical column names (defaults bound at definition).
    vif_out_dir : str
        Directory the VIF report ``<itername>.csv`` is written to. Defaults to
        the location that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Stacked results with columns 'Variable', 'Importance', 'Imp_type',
        'Measure' and 'Var_data' (the group the row came from).

    Notes
    -----
    NOTE(review): the high-VIF group is not filtered by ``cont_list``, so a
    column present in both ``cont_col`` and ``categ_list`` can appear twice
    in the result (once as high VIF, once as categorical) - confirm intent.
    """
    # Compute VIF for the requested columns and save the report.
    check = vif_calc(data, col_roll=cont_col, cutoff = VIF_cutoff)
    check.to_csv(os.path.join(vif_out_dir, str(itername) + '.csv'), index = False)
    highvif = check.loc[check['VIF_type'] == 'High', 'var'].values
    lowvif = check.loc[check['VIF_type'] == 'Low', 'var'].values

    # Feature selection for variables with high VIF.
    print('Fitting  %s  on variables with High VIF...' % (cont_method,))
    high_vif_df = _select_group(data, target, highvif, cont_method,
                                "High VIF - all datasets", p_cutoff, imp_cutoff)

    # Low-VIF variables restricted to the continuous list.
    continuous = set(cont_list)
    hist_rf_var = [val for val in lowvif if val in continuous]
    print('Fitting  %s  on Historical - latest variables...' % (cont_method,))
    hist_rf_df = _select_group(data, target, hist_rf_var, cont_method,
                               "Historical - latest", p_cutoff, imp_cutoff)

    # Categorical variables.
    print('Fitting  %s  on Categorical variables...' % (categ_method,))
    categ_var = _select_group(data, target, categ_list, categ_method,
                              "Categorical", p_cutoff, imp_cutoff)

    # Stack the per-group results, skipping groups that had no variables.
    results = [frame for frame in (high_vif_df, hist_rf_df, categ_var) if len(frame) > 0]
    if not results:
        return high_vif_df
    vardf = pd.concat(results, axis=0)

    return vardf


In [7]:
# Run the full selection pipeline on the training data, using random-forest
# importance for both the continuous and the categorical variables.
featsel1 = allvarsel2(
    train_x,
    np.ravel(train_y[1]),
    itername='train_data',
    cont_method='rf',
    categ_method='rf',
)

Calculating VIF...
VIF check completed.
Dataframe with VIF values (272, 3)
Fitting  rf  on variables with High VIF...
Fitting RF regressor..
Getting Variable Importance..
Done
Dataframe with RF feature selection (22, 3)
Done
Fitting  rf  on Historical - latest variables...
Fitting RF regressor..
Getting Variable Importance..
Done
Dataframe with RF feature selection (250, 3)
Done
Fitting  rf  on Categorical variables...
Fitting RF regressor..
Getting Variable Importance..
Done
Dataframe with RF feature selection (22, 3)
Done


In [8]:
# Persist the list of continuous variables to 'cnt_vars.csv'.
# Author's note: 107 continuous features were shortlisted by cross-checking the
# list of VIFs; see train_data_xls.xls, where the shortlisted variables are
# marked in yellow.
# NOTE(review): the code below writes every entry of cnt_list, not only a
# shortlisted subset - confirm this is intended.
cnts_vars_final = pd.DataFrame(cnt_list)
cnts_vars_final.to_csv('cnt_vars.csv')

In [9]:
# Selecting the top 107 variables from the list of variables on the basis of VIF
# cnts_vars_final = ['f_158','f_19','f_154','f_48','f_202','f_113','f_69','f_27','f_74','f_240','f_6','f_108','f_118',
#                   'f_112','f_227','f_177','f_110','f_53','f_4','f_231','f_246','f_152','f_201','f_0','f_225','f_184',
#                   'f_56','f_16','f_90','f_37','f_200','f_203','f_129','f_236','f_46','f_182','f_100','f_176','f_251',
#                   'f_247','f_17','f_206','f_238','f_23','f_24','f_126','f_127','f_64','f_41','f_111','f_65','f_22',
#                   'f_128','f_136','f_107','f_92','f_122','f_78','f_2','f_10','f_59','f_229','f_125','f_33','f_40',
#                   'f_170','f_159','f_5','f_174','f_194','f_241','f_77','f_199','f_140','f_198','f_178','f_216',
#                   'f_188','f_120','f_131','f_144','f_67','f_50','f_221','f_60','f_42','f_180','f_76','f_226',
#                   'f_149','f_1','f_73','f_18','f_155','f_173','f_85','f_135','f_248','f_68','f_191','f_252',
#                   'f_98','f_204','f_45','f_58','f_28','f_183']

In [10]:
# All categorical variables are kept for now: write the full cat_list to 'cat_vars.csv'.
pd.DataFrame(cat_list).to_csv('cat_vars.csv')