In [None]:
# * This Assignment has been done by Palash Goyal, with reference to the application at Auto1 Group. 
# ** Kindly request for approval before any kind of personal use.
# Palash Goyal - palashgoyal1@gmail.com

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2, f_classif
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.tree import DecisionTreeClassifier as DTC


In [2]:
# Read the target and feature tables; column 0 of each CSV holds the saved row index.
auto_y, auto_x = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('auto_y.csv', 'auto_x.csv')
)

In [3]:
# Preview the first five feature rows
auto_x.head(5)

Unnamed: 0,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,compression_ratio,...,engine_type_rotor,num_of_cylinders_five,num_of_cylinders_four,num_of_cylinders_others,num_of_cylinders_six,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_others
0,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,122.0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
# Continuous predictors, listed explicitly.
cnt_list = [
    'normalized_losses', 'wheel_base', 'length', 'width', 'height',
    'curb_weight', 'engine_size', 'bore', 'stroke', 'compression_ratio',
    'horsepower', 'peak_rpm', 'city_mpg', 'highway_mpg', 'price',
]
# Every other column of auto_x is treated as categorical.
cat_list = [x for x in auto_x.columns if not (x in cnt_list)]
# NOTE: despite its name, cont_col holds ALL columns (continuous first, then categorical).
cont_col = np.concatenate((cnt_list, cat_list))


In [5]:
# Shortlist the features in stages: split the variables into high- and low-VIF groups
# using a cutoff on the variance inflation factor (VIF), then run feature selection on each group.

In [6]:
#VIF function - returns VIF dataset for specified columns
def vif_calc(data, col_roll, cutoff = 7.5):
    """Compute the variance inflation factor (VIF) of each column in ``col_roll``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictor columns.
    col_roll : sequence of str
        Labels of the columns that form the VIF design matrix.
    cutoff : float, default 7.5
        A VIF greater than or equal to this value is labelled 'High',
        anything else 'Low'.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``col_roll`` with the fields 'VIF', 'var'
        (the column label) and 'VIF_type' ('High' / 'Low'). The frame is
        empty, but still has those three fields, when ``col_roll`` is empty.
    """
    print('Calculating VIF...')
    # Slice and convert once; the matrix is the same for every column.
    subset = data[col_roll]
    design = subset.values
    records = [{'var': name, 'VIF': vif(design, position)}
               for position, name in enumerate(subset.columns)]

    # Build the frame in a single step instead of appending row by row
    # (quadratic, and DataFrame.append no longer exists in pandas 2.x).
    # Naming the columns keeps the frame well-formed for an empty col_roll;
    # the alphabetical order is presumably what the append-based version
    # produced -- downstream code only accesses the fields by name.
    vif_col = pd.DataFrame(records, columns = ['VIF', 'var'])
    vif_col['VIF'] = vif_col['VIF'].astype(float)

    vif_col['VIF_type'] = np.where(vif_col['VIF'] >= cutoff, 'High', 'Low')
    print('VIF check completed.')
    print('Dataframe with VIF values %s' % (vif_col.shape,))
    return vif_col


# RF wrapper - returns variable importance 
def RF_wrap(data, target, col_names, imp_cutoff = 0.005, n_trees = 500, verbose = False, oob_score = False, min_samples_leaf = 5):
    """Fit a random-forest classifier and report per-feature importances.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Class labels passed to ``fit``.
    col_names : sequence of str
        Labels of the columns to fit on (list, Index or ndarray).
    imp_cutoff : float, default 0.005
        Importances greater than or equal to this are labelled 'High'.
    n_trees, verbose, oob_score, min_samples_leaf
        Forwarded to the classifier (``n_trees`` as ``n_estimators``).

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'Importance' and 'Imp_type'; empty (same
        columns) when ``col_names`` is empty.
    """
    col_names = np.asarray(col_names)
    n_cols = col_names.shape[0]
    if n_cols == 0:
        # Nothing to fit on: return an empty result instead of failing in fit().
        imp = pd.DataFrame(columns = ['Variable', 'Importance', 'Imp_type'])
        print('Dataframe with RF feature selection %s' % (imp.shape,))
        return imp

    # Use 40% of the features per split, but never fewer than one:
    # a single column would otherwise round down to max_features = 0,
    # which the classifier rejects.
    max_features = max(1, int(round(0.4 * n_cols)))
    rf = RF(n_estimators = n_trees, oob_score = oob_score, random_state = 1203, max_features = max_features,
            min_samples_leaf = min_samples_leaf, verbose = verbose)
    features = data[col_names]
    print('Fitting RF classifier..')
    rf.fit(features, target)
    print('Getting Variable Importance..')
    imp = pd.DataFrame({'Variable': list(features.columns), 'Importance': rf.feature_importances_},
                       columns = ['Variable', 'Importance'])
    imp['Imp_type'] = np.where(imp['Importance'] >= imp_cutoff, 'High', 'Low')
    print('Done')
    print('Dataframe with RF feature selection %s' % (imp.shape,))
    return imp


# DecisionTree wrapper - returns variable importance 
def Tree_wrap(data, target, col_names, imp_cutoff = 0.005, min_samples_leaf = 5, min_samples_split = 10):
    """Fit a decision-tree classifier and report per-feature importances.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Class labels passed to ``fit``.
    col_names : sequence of str
        Labels of the columns to fit on (list, Index or ndarray).
    imp_cutoff : float, default 0.005
        Importances greater than or equal to this are labelled 'High'.
    min_samples_leaf, min_samples_split
        Forwarded to the classifier.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'Importance' and 'Imp_type'; empty (same
        columns) when ``col_names`` is empty.
    """
    col_names = np.asarray(col_names)
    n_cols = col_names.shape[0]
    if n_cols == 0:
        # Nothing to fit on: return an empty result instead of failing in fit().
        imp = pd.DataFrame(columns = ['Variable', 'Importance', 'Imp_type'])
        print('Dataframe with Decision Tree feature selection %s' % (imp.shape,))
        return imp

    # 40% of the features per split, floored at one so that a single
    # column does not round down to the invalid max_features = 0.
    max_features = max(1, int(round(0.4 * n_cols)))
    dtc = DTC(random_state = 1203, max_features = max_features,
              min_samples_leaf = min_samples_leaf, min_samples_split = min_samples_split)
    features = data[col_names]
    print('Fitting Decision Tree classifier..')
    dtc.fit(features, target)
    print('Getting Variable Importance..')
    imp = pd.DataFrame({'Variable': list(features.columns), 'Importance': dtc.feature_importances_},
                       columns = ['Variable', 'Importance'])
    imp['Imp_type'] = np.where(imp['Importance'] >= imp_cutoff, 'High', 'Low')
    print('Done')
    print('Dataframe with Decision Tree feature selection %s' % (imp.shape,))
    return imp


#  Wrapper for selecting continuous features - returns ANOVA F-test P-value
def f_wrap(data, target, col_names, p_cutoff = 0.05):
    """Score features with the ANOVA F-test and label them by p-value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Class labels passed to ``f_classif``.
    col_names : sequence of str
        Labels of the columns to test.
    p_cutoff : float, default 0.05
        A p-value at or below this is labelled 'High'; a larger or
        undefined (NaN) p-value is labelled 'Low'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'P_value_fclass' and 'Imp_type'; empty (same
        columns) when ``col_names`` is empty.
    """
    out_cols = ['Variable', 'P_value_fclass', 'Imp_type']
    if len(col_names) == 0:
        # Nothing to test: return an empty result instead of failing in f_classif.
        fdat_df = pd.DataFrame(columns = out_cols)
        print('Dataframe with Anova-F feature selection %s' % (fdat_df.shape,))
        return fdat_df

    features = data[col_names]
    # f_classif returns (F statistics, p-values); only the p-values are kept.
    p_values = f_classif(features, target)[1]
    fdat_df = pd.DataFrame({'Variable': list(features.columns), 'P_value_fclass': p_values},
                           columns = out_cols[:2])
    # Written as "<= cutoff -> High" so that a NaN p-value falls through to
    # 'Low'; the previous "> cutoff -> Low" form marked NaN as 'High'.
    fdat_df['Imp_type'] = np.where(fdat_df['P_value_fclass'] <= p_cutoff, 'High', 'Low')
    print('Dataframe with Anova-F feature selection %s' % (fdat_df.shape,))
    return fdat_df


#  Wrapper for selecting categorical features - returns Chi2 P-value
def chi2_wrap(data, target, col_names, p_cutoff = 0.05):
    """Score features with the chi-squared test and label them by p-value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Class labels passed to ``chi2``.
    col_names : sequence of str
        Labels of the columns to test.
    p_cutoff : float, default 0.05
        A p-value at or below this is labelled 'High'; a larger or
        undefined (NaN) p-value is labelled 'Low'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'P_value_chi2' and 'Imp_type'; empty (same
        columns) when ``col_names`` is empty.
    """
    out_cols = ['Variable', 'P_value_chi2', 'Imp_type']
    if len(col_names) == 0:
        # Nothing to test: return an empty result instead of failing in chi2.
        chdat_df = pd.DataFrame(columns = out_cols)
        print('Dataframe with Chi2 feature selection %s' % (chdat_df.shape,))
        return chdat_df

    features = data[col_names]
    # chi2 returns (statistics, p-values); only the p-values are kept.
    p_values = chi2(features, target)[1]
    chdat_df = pd.DataFrame({'Variable': list(features.columns), 'P_value_chi2': p_values},
                            columns = out_cols[:2])
    # Written as "<= cutoff -> High" so that a NaN p-value falls through to
    # 'Low'; the previous "> cutoff -> Low" form marked NaN as 'High'.
    chdat_df['Imp_type'] = np.where(chdat_df['P_value_chi2'] <= p_cutoff, 'High', 'Low')
    print('Dataframe with Chi2 feature selection %s' % (chdat_df.shape,))
    return chdat_df


#  Wrapper for calling feature selection methods 
#  method parameter takes the values 'rf', 'tree', 'anovaf', 'chi2'
def method_call(data, target, cols, method, pcut = 0.05, impcut = 0):
    """Run one feature-selection method and return its result in a common layout.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Class labels.
    cols : sequence of str
        Labels of the columns to evaluate.
    method : {'rf', 'tree', 'anovaf', 'chi2'}
        Selector to run.
    pcut : float, default 0.05
        P-value cutoff, used by 'anovaf' and 'chi2'.
    impcut : float, default 0
        Importance cutoff, used by 'rf' and 'tree'.

    Returns
    -------
    pandas.DataFrame
        The selector's frame plus a 'Measure' field ('Importance' or
        'P-value'). For 'anovaf' and 'chi2' the three selector columns
        are renamed to 'Variable', 'Importance', 'Imp_type'.

    Raises
    ------
    ValueError
        If ``method`` is not one of the four supported names. (Previously
        any unrecognised value silently ran the chi-squared selector.)
    """
    if method not in ('rf', 'tree', 'anovaf', 'chi2'):
        raise ValueError("Unknown feature selection method %r; expected "
                         "'rf', 'tree', 'anovaf' or 'chi2'" % (method,))

    if method == 'rf':
        method_df = RF_wrap(data, target, np.asarray(cols), imp_cutoff = impcut)
        measure = "Importance"
    elif method == 'tree':
        method_df = Tree_wrap(data, target, np.asarray(cols), imp_cutoff = impcut)
        measure = "Importance"
    else:
        if method == 'anovaf':
            method_df = f_wrap(data, target, cols, p_cutoff = pcut)
        else:
            method_df = chi2_wrap(data, target, cols, p_cutoff = pcut)
        # Give the p-value frames the same column names as the importance frames
        # so that results of different methods can be stacked.
        method_df.columns = ['Variable', 'Importance', 'Imp_type']
        measure = "P-value"

    method_df['Measure'] = measure
    print('Done')
    return method_df
        

In [7]:
#Feature selection wrapper; returns a dataframe with variable name, variable type, respective P-value/Importances, etc.

#categ_method argument takes 'rf', 'tree', 'chi2'
#cont_method argument takes 'rf', 'tree', 'anovaf'

def allvarsel2(data, target, itername, cont_col = cont_col, cont_method ='rf', categ_method ='rf', VIF_cutoff = 5, 
               imp_cutoff = 0, p_cutoff = 0.05, cont_list = cnt_list, categ_list = cat_list,
               out_dir = '/home/pgoyal/Projects/'):
    """Run VIF screening followed by feature selection on three variable groups.

    Steps:
      1. Compute the VIF of every column in ``cont_col`` and write the table
         to ``<out_dir>/<itername>.csv``.
      2. Run ``cont_method`` on the columns labelled 'High' VIF.
      3. Run ``cont_method`` on the 'Low' VIF columns that appear in ``cont_list``.
      4. Run ``categ_method`` on ``categ_list``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the predictors.
    target : array-like
        Class labels.
    itername : object
        Used (via ``str``) as the stem of the VIF CSV file name.
    cont_col : sequence of str
        Columns included in the VIF computation.
    cont_method : {'rf', 'tree', 'anovaf'}
        Selector for steps 2 and 3.
    categ_method : {'rf', 'tree', 'chi2'}
        Selector for step 4.
    VIF_cutoff : float, default 5
        Threshold separating 'High' from 'Low' VIF.
    imp_cutoff, p_cutoff : float
        Cutoffs forwarded to the selectors.
    cont_list, categ_list : sequence of str
        Continuous and categorical column labels.
    out_dir : str
        Directory that receives the VIF CSV. The default keeps the location
        that used to be hard-coded; pass e.g. '.' to write next to the notebook.

    Returns
    -------
    pandas.DataFrame
        The three selector results stacked row-wise, each tagged in
        'Var_data' with the group it came from.
    """
    import os

    def _select(cols, method, label):
        # Run one selector on one group; an empty group yields an empty
        # frame in the common layout rather than an error inside the selector.
        if len(cols) == 0:
            group_df = pd.DataFrame(columns = ['Variable', 'Importance', 'Imp_type', 'Measure'])
        else:
            group_df = method_call(data, target, cols = cols, method = method,
                                   pcut = p_cutoff, impcut = imp_cutoff)
        group_df['Var_data'] = label
        return group_df

    ##Calculate VIF for the columns in cont_col
    check = vif_calc(data, col_roll=cont_col, cutoff = VIF_cutoff)
    check.to_csv(os.path.join(out_dir, str(itername) + '.csv'), index = False)
    highvif = check.loc[check['VIF_type'] == 'High', 'var'].values
    lowvif = check.loc[check['VIF_type'] == 'Low', 'var'].values

    #Feature selection for variables with high VIF
    print('Fitting  %s  on variables with High VIF...' % (cont_method,))
    high_vif_df = _select(highvif, cont_method, "High VIF - all datasets")

    #Low-VIF variables are restricted to the continuous list
    hist_rf_var = [val for val in lowvif if val in cont_list]
    print('Fitting  %s  on Historical - latest variables...' % (cont_method,))
    hist_rf_df = _select(hist_rf_var, cont_method, "Historical - latest")

    #Feature selection for categorical variables
    print('Fitting  %s  on Categorical variables...' % (categ_method,))
    categ_var = _select(categ_list, categ_method, "Categorical")

    #Stack the per-group results into one dataframe
    vardf = pd.concat([high_vif_df, hist_rf_df, categ_var], axis=0)

    return vardf


In [8]:

# Run the full selection pipeline with random forests for both variable families

featsel1 = allvarsel2(
    auto_x,
    np.ravel(auto_y),
    itername = 'auto_data',
    cont_method = 'rf',
    categ_method = 'rf',
)

Calculating VIF...
VIF check completed.
Dataframe with VIF values (55, 3)
Fitting  rf  on variables with High VIF...
Fitting RF classifier..
Getting Variable Importance..
Done
Dataframe with RF feature selection (53, 3)
Done
Fitting  rf  on Historical - latest variables...
Fitting RF classifier..
Getting Variable Importance..
Done
Dataframe with RF feature selection (2, 3)
Done
Fitting  rf  on Categorical variables...
Fitting RF classifier..
Getting Variable Importance..
Done
Dataframe with RF feature selection (40, 3)
Done


In [9]:
# Continuous features kept after cross-checking them against the VIF listing
cnts_vars_final = pd.DataFrame([
    'normalized_losses',
    'stroke',
    'peak_rpm',
    'height',
    'bore',
    'width',
    'wheel_base',
    'length',
    'price',
])
cnts_vars_final.to_csv('cnt_vars.csv')

In [10]:
# All categorical variables are kept for now; save their names
pd.DataFrame(data=cat_list).to_csv(path_or_buf='cat_vars.csv')