In [None]:
# Purpose: Perform LASSO feature selection, then generate predictions
# Inputs: train.csv as training data, mean-imputed data with homelessness indicators added
# Outputs: One file per outcome with the LASSO-selected variables (alpha chosen so the in-sample R^2 is closest to 0.4), plus a prediction file from a Random Forest fit on those variables
# Machine: 4-core PC w/ hyperthreading, ~6 hrs

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
%matplotlib inline


In [None]:
# Outcome table read from train.csv, indexed by challengeID.
outcomes = pd.read_csv('../data/train.csv', index_col='challengeID')

# Mean-imputed feature table (with the homelessness indicators added),
# indexed by the same challengeID key.
data_all = pd.read_csv(
    '../output/data_mean_imputed_Homeless_added.csv',
    index_col='challengeID',
)

In [None]:
# Display the outcome names that the loops below iterate over.
np.asarray(outcomes.columns)

## Lasso Feature Selection

In [None]:
# Candidate regularisation strengths, log-spaced over [1e-3, 1e2].
Alphas = np.logspace(-3,2,50).tolist()

# In-sample R^2 that the chosen LASSO fit should come closest to.
TARGET_R2 = 0.4

# Scan from the strongest to the weakest penalty using a *local* sorted copy.
# `Alphas` itself is never mutated, so every outcome sees the same grid in the
# same order; on ties in |R^2 - target| the larger alpha (sparser model) wins.
alphas_desc = sorted(Alphas, reverse=True)

for OUTCOME in outcomes.columns.values:
    # Keep only rows with an observed outcome, then align features and target
    # on their shared challengeID labels so that row i of x_all always belongs
    # to y.iloc[i] (sklearn matches rows by position, not by index label).
    y = outcomes[OUTCOME].dropna()
    common_ids = y.index.intersection(data_all.index)
    y = y.loc[common_ids]
    x_all = data_all.loc[common_ids,:]

    # Fit one LASSO per alpha. R^2 and coefficients are recorded from the very
    # same fitted model, so the selected features always correspond to the fit
    # whose score was compared against the target.
    r_2 = []
    coefs = []
    for L in alphas_desc:
        reg = linear_model.Lasso(alpha = L)
        reg.fit(x_all,y)
        r_2.append(reg.score(x_all,y))
        coefs.append(reg.coef_.copy())

    # Position of the fit whose in-sample R^2 is closest to the target.
    best = int(np.argmin(np.abs(np.asarray(r_2) - TARGET_R2)))
    Alpha_O = alphas_desc[best]  # chosen penalty, kept for interactive inspection

    # Features with a non-zero coefficient in the chosen fit.
    selected_columns = x_all.columns[coefs[best] != 0]

    # Write the selected columns for *all* rows of data_all (not only the rows
    # with an observed outcome) so the prediction step can reuse the file.
    x_lars = data_all.loc[:,selected_columns]
    x_lars.to_csv('../output/LASSO/Lasso_Selected_Vars_'+OUTCOME+'.csv')

## Final Prediction Loop

In [None]:
# One entry per output column; the per-outcome arrays are filled in below and
# are in the same row order as data_all (and therefore as 'challengeID').
predictions = {'challengeID':np.array(list(data_all.index)),
               'gpa':None,'grit':None,'materialHardship':None,'eviction':None,'layoff':None,'jobTraining':None} 

# Fixed seed so the forests, and hence the written predictions, are
# reproducible from one run to the next.
RF_SEED = 0

for OUTCOME in outcomes.columns.values:

    # Observed values of this outcome only.
    y_all = outcomes[OUTCOME].dropna()
    x_selected = pd.read_csv(str('../output/LASSO/Lasso_Selected_Vars_'+OUTCOME+'.csv'),index_col = 'challengeID')

    # Align features and target on their shared challengeID labels so that
    # row i of x_all always belongs to y_all.iloc[i] (sklearn matches rows by
    # position, not by index label).
    common_ids = y_all.index.intersection(x_selected.index)
    y_all = y_all.loc[common_ids]
    x_all = x_selected.loc[common_ids]

    n_features = x_all.shape[1]
    if n_features == 0:
        raise ValueError('No LASSO-selected variables found for outcome ' + str(OUTCOME))

    # Predict for every row of data_all using exactly the training columns in
    # the training order; a missing column raises instead of being dropped.
    x_test_all = data_all.loc[:,x_all.columns]

    # Forest settings: these three outcomes get shallow trees (depth 4) with
    # 25 candidate features per split; the remaining outcomes get unrestricted
    # depth, at least 5 samples per leaf and 50 candidate features per split.
    if OUTCOME in ['eviction','layoff','jobTraining']:
        rf_params = {'n_estimators': 300, 'max_features': 25, 'max_depth': 4}
    else:
        rf_params = {'n_estimators': 300, 'max_features': 50, 'max_depth': None,
                     'min_samples_leaf': 5}

    # An integer max_features larger than the number of available columns is
    # rejected by the forest, so cap it at what LASSO actually selected.
    rf_params['max_features'] = min(rf_params['max_features'], n_features)

    mod = RandomForestRegressor(random_state = RF_SEED, **rf_params)
    mod = mod.fit(x_all,y_all)
    pred_y = mod.predict(x_test_all)

    predictions[OUTCOME] = pred_y
    

In [None]:
# Assemble the prediction columns into one table and write it to disk
# without the positional row index.
prediction_table = pd.DataFrame(predictions)
prediction_table.to_csv('../output/final_pred/lassoRF_prediction.csv', index = False)