# Mutual information (MI) for SALARY, including Experience features. MI threshold: 0.1; total features: 81

In [None]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
import pydotplus
from sklearn.feature_selection import mutual_info_classif

In [None]:
import warnings

# Install a blanket "ignore" filter so that warnings raised by the cells
# below are not printed.
warnings.simplefilter("ignore")

# Feature Selection

In [5]:
# Load the raw combined dataset and the pipe-delimited feature metadata.
df_raw = pd.read_csv('raw_combined.csv')
df_feat_info = pd.read_csv('feature_info.csv', sep='|')

# Candidate predictors: every feature id whose metadata category is
# Demographic, Education or Employment.
selected_feature = df_feat_info[
    df_feat_info['category'].isin(['Demographic', 'Education', 'Employment'])
]['id'].tolist()
# NOTE(review): the column listing recorded further down in this notebook
# contains 'SALARY' (and 'EARN'), so the target itself appears to be among
# the candidate features. Confirm that this is intended before relying on
# the MI ranking.

# Uncomment the feature that you are trying to predict
pred_feat = 'SALARY'
# pred_feat = 'NOCPR'

# Salary bin edges 0, 10000, ..., 350000; each bin is labelled with its lower
# edge (0, 10000, ..., 340000).
bins = list(range(0, 350001, 10000))
labels = bins[:-1]

# Only the SALARY target is binned. Previously pred_feat was overwritten with
# 'SALARY_BIN' unconditionally, so selecting NOCPR above had no effect.
if pred_feat == 'SALARY':
    # right=False makes every bin half-open: [lower, upper).
    # NOTE(review): values outside [0, 350000) become NaN here and are later
    # filled with -1 together with genuinely missing salaries - verify that
    # this is acceptable for the data.
    df_raw['SALARY_BIN'] = pd.cut(df_raw['SALARY'], bins=bins, labels=labels, right=False)
    # Convert the categorical labels to plain floats (NaN stays NaN).
    df_raw['SALARY_BIN'] = df_raw['SALARY_BIN'].astype(float)
    pred_feat = 'SALARY_BIN'

# Split the raw frame into the predictor matrix and the target column.
df_features = df_raw[selected_feature]
df_pred = df_raw[pred_feat]

In [14]:
# Replace missing values with the sentinel -1 in both predictors and target.
df_features = df_features.fillna(-1)
df_pred = df_pred.fillna(-1)

# This step is very cpu intensive and slow, it can take time to run.
# random_state is pinned because the estimator is randomised; without a seed
# the scores - and therefore the set of columns passing the threshold below -
# can differ from run to run.
mi = mutual_info_classif(df_features, df_pred, n_jobs=-1, random_state=0)

# One row per candidate feature with its MI score against the target.
df_mi = pd.DataFrame({'Col_name': df_features.columns.tolist(), 'MI': mi.tolist()})

# Setting the MI threshold: keep only features scoring strictly above it.
mi_thres = 0.10
filtered_mi_col = df_mi[df_mi['MI'] > mi_thres]['Col_name'].tolist()

df_features = df_features[filtered_mi_col]

In [7]:
# Display the current contents of df_features as the cell output.
# NOTE(review): the recorded output below carries an earlier execution count
# than the MI cell and lists columns (e.g. BIRYR) that are absent from the
# later column listing - it may predate the MI filter; re-run to refresh.
df_features

Unnamed: 0,AGE,BIRYR,CH1218IN,CH19IN,BAAYR5,BA03Y5,NBAMED,HDAY5,HD03Y5,NDGMED,...,ICOLLAB,NWFAM,NWILL,NWLAY,NWNOND,NWOCNA,NWOT,NWOTP,NWRET,NWSTU
0,33,1960,-1.0,-1.0,1980.0,-1.0,226395,1985.0,-1.0,799995,...,-1.0,98,98.0,98,98,98,98.0,-1.0,98.0,98
1,38,1955,-1.0,-1.0,1980.0,-1.0,587995,1980.0,-1.0,587995,...,-1.0,0,0.0,1,0,0,0.0,-1.0,0.0,0
2,48,1945,-1.0,-1.0,1965.0,-1.0,799995,1980.0,-1.0,719995,...,-1.0,98,98.0,98,98,98,98.0,-1.0,98.0,98
3,48,1945,-1.0,-1.0,1970.0,-1.0,226395,1975.0,-1.0,799995,...,-1.0,98,98.0,98,98,98,98.0,-1.0,98.0,98
4,28,1965,-1.0,-1.0,1985.0,-1.0,719995,1985.0,-1.0,719995,...,-1.0,98,98.0,98,98,98,98.0,-1.0,98.0,98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546708,56,1955,98.0,98.0,-1.0,1981.0,567350,-1.0,1981.0,567350,...,-1.0,98,-1.0,98,98,98,-1.0,98.0,-1.0,98
546709,65,1945,98.0,98.0,-1.0,1966.0,459395,-1.0,1966.0,459395,...,-1.0,98,-1.0,98,98,98,-1.0,98.0,-1.0,98
546710,50,1960,98.0,98.0,-1.0,1981.0,449995,-1.0,1981.0,449995,...,-1.0,0,-1.0,0,0,0,-1.0,1.0,-1.0,0
546711,27,1985,98.0,98.0,-1.0,2006.0,587995,-1.0,2006.0,587995,...,-1.0,98,-1.0,98,98,98,-1.0,98.0,-1.0,98


In [15]:
# Display the column labels currently held in df_features.
df_features.columns

Index(['AGE', 'LFSTAT', 'WRKG', 'FPTIND', 'WRKGP', 'EMSMI', 'HRSWK', 'HRSWKGR',
       'WKSWKGR', 'JOBINS', 'JOBPENS', 'JOBPROFT', 'JOBVAC', 'LOOKWK',
       'FTPRET', 'PTWTFT', 'OCEDRLP', 'NOCPR', 'NOCPRMG', 'OCSI', 'EMSEC',
       'EMSIZE', 'EMBUS', 'NEWBUS', 'EMUS', 'GOVSUP', 'WAPRI', 'WAPRSM',
       'WASEC', 'WASCSM', 'PRIAREA', 'ACTCAP', 'ACTDED', 'ACTMGT', 'ACTRD',
       'ACTRDT', 'ACTRES', 'ACTTCH', 'WADEV', 'WADSN', 'WAEMRL', 'WAMGMT',
       'WAOT', 'WAPROD', 'WAQM', 'WASALE', 'WASVC', 'WATEA', 'WAACC',
       'WAAPRSH', 'WABRSH', 'WACOM', 'SUPWK', 'SALARY', 'EARN', 'JOBSATIS',
       'SATADV', 'SATBEN', 'SATCHAL', 'SATIND', 'SATLOC', 'SATRESP', 'SATSAL',
       'SATSEC', 'SATSOC', 'WTREASN', 'OCLIC', 'MGRNAT', 'MGROTH', 'MGRSOC',
       'NOCMLST', 'OCLSTPB', 'NWFAM', 'NWILL', 'NWLAY', 'NWNOND', 'NWOCNA',
       'NWOT', 'NWOTP', 'NWRET', 'NWSTU'],
      dtype='object')

In [16]:
# Display the (rows, columns) dimensions of df_features.
df_features.shape

(546713, 81)

In [22]:
# Write df_features to a CSV file, leaving the row index out of the output.
new_filename = 'df_salary.csv'
df_features.to_csv(path_or_buf=new_filename, index=False)