In [None]:
# Purpose: Perform Feature Selection using Mutual Information for several K-values, save the output
# Inputs: Imputed Dataset w/added Homelessness Indicators
# Outputs: Several Files named after the K-cutoff used for MI on each outcome of interest
# Machine: Laptop, Runtime 8 hrs  


In [None]:
import glob
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sp
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import Imputer
from statsmodels.stats.outliers_influence import variance_inflation_factor

warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Background features (imputed, with the homelessness indicators added) and the
# training outcomes; both tables are indexed by challengeID.
dfx = pd.read_csv(
    '../output/data_mean_imputed_Homeless_added.csv',
    index_col='challengeID',
)
dfy = pd.read_csv(
    '../data/train.csv',
    index_col='challengeID',
)

# Every column of the training file is an outcome; keep their names for later filtering.
outcomes = dfy.columns.tolist()

In [None]:
# (rows, columns) of the background feature table.
dfx.shape

In [None]:
# Display the outcome column names taken from train.csv.
outcomes

In [None]:
# Outer-join features and outcomes on their shared challengeID index, so a row that
# appears in only one of the two tables is kept (with NaN on the missing side).
full = dfx.join(dfy, how='outer') # connect the background data to the outcomes

In [None]:
# Keep only rows with at least one observed outcome: how='all' drops a row only when
# every outcome column is NaN. Rows missing some outcomes are filtered per outcome later.
training = full.dropna(subset=outcomes, how='all') # drop observations that have none of the outcomes

In [None]:
# NOTE(review): this list is not read or appended to by any cell visible in this
# section; confirm whether a later cell uses it before removing it.
full_features = []

In [None]:
# Mutual-information feature selection: for every K cutoff, take the K highest-scoring
# features for each outcome, union them across outcomes, and save that subset of `full`.
#
# The MI scores do not depend on K, so each outcome is scored ONCE and the ranking is
# sliced per K, instead of re-fitting SelectKBest (and re-estimating MI) for every K.
# The selected column names come straight from the ranking, rather than being recovered
# by comparing column values, which is slow and picks the wrong name when two columns
# hold identical values.

K_VALUES = [5,15,50,100,200,300,500,700,1000,1500,2000,3000,4000]
OUTPUT_DIR = '../output/MI'
MI_SEED = 0  # fixed seed: the MI estimator is randomized; this makes runs reproducible

# Continuous outcomes use the regression estimator, binary outcomes the classification one.
SCORE_FUNC_BY_OUTCOME = [
    ('gpa', mutual_info_regression),
    ('grit', mutual_info_regression),
    ('materialHardship', mutual_info_regression),
    ('eviction', mutual_info_classif),
    ('layoff', mutual_info_classif),
    ('jobTraining', mutual_info_classif),
]


def rank_features_by_mi(data, outcome, outcome_names, score_func, random_state=None):
    """Rank the feature columns of `data` by mutual information with one outcome.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus all outcome columns.
    outcome : str
        Name of the outcome column to score against; rows where it is NaN are dropped.
    outcome_names : list of str
        All outcome columns, removed from the feature matrix so no outcome leaks in.
    score_func : callable
        `mutual_info_regression` or `mutual_info_classif`.
    random_state : int or None
        Seed forwarded to `score_func`.

    Returns
    -------
    list of str
        Feature names ordered from highest to lowest MI score.
    """
    labelled = data.dropna(subset=[outcome], how='all')
    target = labelled[outcome]
    features = labelled.drop(outcome_names, axis=1)

    scores = score_func(features, target, random_state=random_state)

    # Stable ascending sort, then reversed: the first K entries are the same set that
    # SelectKBest would keep for these scores (including how it breaks ties).
    best_first = np.argsort(scores, kind='mergesort')[::-1]
    return features.columns[best_first].tolist()


# Score every outcome once (this is the expensive part).
ranked_by_outcome = {}
for outcome, score_func in SCORE_FUNC_BY_OUTCOME:
    ranked_by_outcome[outcome] = rank_features_by_mi(
        training, outcome, outcomes, score_func, random_state=MI_SEED
    )

# Make sure the destination exists so a multi-hour run cannot fail at the save step.
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

for k in K_VALUES:
    # Union of the top-K features of every outcome. Slicing caps K at the number of
    # available features instead of raising when K is larger.
    chosen = set()
    for outcome, _ in SCORE_FUNC_BY_OUTCOME:
        chosen.update(ranked_by_outcome[outcome][:k])

    # Keep the original column order so the saved files are deterministic.
    final_features = [col for col in full.columns if col in chosen]
    selected_df = full[final_features]

    # Save CSV
    selected_df.to_csv(OUTPUT_DIR + '/data_univariate_feature_selection_' + str(k) + '.csv')