In [53]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from operator import itemgetter


from sklearn.model_selection import train_test_split
from sklearn import tree, metrics
from sklearn.tree import _tree


from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


import warnings
warnings.filterwarnings("ignore")

In [54]:
# Load the full CPS extract, then keep only the rows from the most recent year.
df = pd.read_csv('final_cps_data.csv')
actual_df = df.loc[df['year'].eq(df['year'].max())]

In [55]:
# Earliest year present in the full (unfiltered) dataset.
df['year'].min()

1981

In [56]:
# Preview the first rows of the latest-year subset.
actual_df.head()

Unnamed: 0,year,state_fip,state,metro,relation_to_head_of_house,age,gender,race,marital_status,birthplace,...,protective,foodcare,building,sales,office_admin,farmer,construction,production,transport.2,transport.3
20,2013,50,VT,1.0,101,62,1,1,6,9900.0,...,0,0,0,0,0,0,0,0,0,0
32,2013,39,OH,3.0,201,59,1,1,1,9900.0,...,0,0,0,0,0,0,1,0,0,0
34,2013,44,RI,2.0,101,44,1,3,1,20000.0,...,0,0,0,0,0,0,0,0,0,0
36,2013,12,FL,3.0,101,41,1,1,4,9900.0,...,0,0,0,0,0,0,0,0,0,0
37,2013,33,NH,1.0,201,35,1,1,1,9900.0,...,0,0,0,0,0,0,0,0,0,1


In [57]:
# List the column names available in the latest-year subset.
actual_df.columns

Index(['year', 'state_fip', 'state', 'metro', 'relation_to_head_of_house',
       'age', 'gender', 'race', 'marital_status', 'birthplace',
       'year_of_immigration', 'citizen', 'mother_birthplace',
       'father_birthplace', 'foreign_birthplace', 'grade_level_achieved',
       'employment_status', 'worker_class', 'worker_class_last_year',
       'weeks_worked_last_year', 'hours_per_week', 'union_member_status',
       'income', 'longest_job_earnings', 'family_type', 'white', 'black',
       'hisp', 'other_race', 'high_school_or_less', 'bachelors_degree',
       'advanced_degree', 'potential_experience', 'annual_work_hours',
       'full_time_job', 'not_inputed_income', 'hourly_wage',
       'personal_consumption_expenditure', 'wage_inflation_factor',
       'real_hourly_wage', 'north_east', 'north_central', 'south', 'west',
       'occupation_category', 'agriculture', 'mining_construction', 'durables',
       'nondurables', 'transport', 'transport.1', 'utilities',
       'communica

In [58]:
# Classification target: column the gender classifiers are fitted against.
TARGET_G = 'gender'

# Regression target: column the wage regressor is fitted against.
TARGET_W = 'real_hourly_wage'

In [72]:
# Columns removed from the modelling frame before encoding (order preserved).
dropcols = [
    'year_of_immigration',
    'citizen',
    'mother_birthplace',
    'father_birthplace',
    'foreign_birthplace',
    'white',
    'black',
    'hisp',
    'other_race',
    'birthplace',
    'metro',
    'not_inputed_income',
    'hourly_wage',
    'state',
    'income',
    'longest_job_earnings',
    'state_fip',
]

In [73]:
# New frame without the excluded columns; actual_df itself is left untouched.
df_drop = actual_df.drop(columns=dropcols)

In [61]:
# One-hot encode any object/categorical columns left in df_drop (pandas default
# behaviour of get_dummies); numeric columns pass through unchanged.
dummy_df = pd.get_dummies(df_drop)

In [62]:
# Split into features (X) and the two targets (Y).
X = dummy_df.drop([TARGET_G, TARGET_W], axis=1)
Y = dummy_df[[TARGET_G, TARGET_W]]

# 80/20 split. random_state=111 pins the shuffle so the split (and every metric
# computed from it below) is reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=111,
)

print("Training = ", X_train.shape)
print("Testing = ", X_test.shape)

Training =  (43032, 132)
Testing =  (10758, 132)


In [63]:
def getCoefLogit(MODEL, TRAIN_DATA):
    """Map each training column name to its fitted coefficient.

    Parameters
    ----------
    MODEL : fitted estimator exposing ``intercept_`` and ``coef_``; only the
        first entry of ``intercept_`` and the first row of ``coef_`` are read.
    TRAIN_DATA : DataFrame whose ``columns`` supply the feature names, paired
        positionally with the coefficients.

    Returns
    -------
    dict
        ``"INTERCEPT"`` first, followed by one ``feature -> coefficient``
        entry per column (pairing stops at the shorter of the two sequences).
    """
    feature_names = list(TRAIN_DATA.columns.values)
    coefficients = {"INTERCEPT": MODEL.intercept_[0]}
    coefficients.update(zip(feature_names, MODEL.coef_[0]))
    return coefficients

In [64]:
# NOTE(review): Y_Pred_Test is only assigned in the logistic-regression cell that
# follows, so on a fresh kernel (Restart & Run All) this preview raises NameError.
# Move this cell below the model cell or remove it.
Y_Pred_Test

array([1, 1, 2, ..., 1, 1, 1], dtype=int64)

In [65]:
# Predict gender using logistic regression.
# The target column is referenced through TARGET_G (as the random-forest cell
# does) so both classifiers always train and score against the same column.
model = LogisticRegression(solver='newton-cg', max_iter=1000)
model = model.fit(X_train, Y_train[TARGET_G])

Y_Pred_Train = model.predict(X_train)
Y_Pred_Test = model.predict(X_test)

train_acc = metrics.accuracy_score(Y_train[TARGET_G], Y_Pred_Train)
test_acc = metrics.accuracy_score(Y_test[TARGET_G], Y_Pred_Test)

print('Training Accuracy = ', train_acc)
print('Testing Accuracy = ', test_acc)

# Intercept plus one raw coefficient per feature column. The features are not
# standardised in this notebook, so coefficient magnitudes are not directly
# comparable across columns.
important_variables = getCoefLogit(model, X_train)
print('Important Variables \n',important_variables)

Training Accuracy =  0.7529745305818926
Testing Accuracy =  0.7465142219743447
Important Variables 
 {'INTERCEPT': -3.494549766010671e-07, 'year': -0.0006103912859739081, 'state_fip': -0.0009149027229610422, 'relation_to_head_of_house': 0.0003543529749481804, 'age': -0.021379599981820836, 'race': 0.00849559235681185, 'marital_status': 0.06931468737176735, 'grade_level_achieved': 0.0007786162586722986, 'employment_status': 0.15941468034049525, 'worker_class': 0.010910880561404426, 'worker_class_last_year': 0.00028035850365493056, 'weeks_worked_last_year': 0.021040160315006454, 'hours_per_week': -0.005931687567543779, 'union_member_status': -0.03395946427081192, 'income': -1.4403142523149074e-05, 'longest_job_earnings': 6.583612415025646e-06, 'family_type': -0.199308324071904, 'high_school_or_less': -0.013233833567064049, 'bachelors_degree': -0.02417438096863432, 'advanced_degree': 0.03740791131098804, 'potential_experience': 0.03198617134078176, 'annual_work_hours': -0.00046224878673097

In [66]:
# Predict gender with a random forest: 100 trees, depth capped at 5, fixed seed.
fm01_RF = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=111,
).fit(X_train, Y_train[TARGET_G])

rfY_Pred_train = fm01_RF.predict(X_train)
rfY_Pred_test = fm01_RF.predict(X_test)

# Report classification accuracy on the training and held-out splits.
print("Accuracy Train:", metrics.accuracy_score(Y_train[TARGET_G], rfY_Pred_train))
print("Accuracy Test:", metrics.accuracy_score(Y_test[TARGET_G], rfY_Pred_test))

Accuracy Train: 0.7360568878973787
Accuracy Test: 0.7291318088864102


In [67]:
def getEnsembleTreeVars(ENSTREE, varNames):
    """List the above-average features of a fitted tree ensemble.

    Parameters
    ----------
    ENSTREE : fitted estimator exposing ``feature_importances_``.
    varNames : sequence of feature names, positionally aligned with
        ``feature_importances_``.

    Returns
    -------
    list of (name, score) tuples
        Only features whose importance is strictly greater than the mean
        importance are kept. ``score`` is the importance expressed as an
        integer percentage of the largest importance (truncated, so the top
        feature scores 100). The list is ordered by *raw* importance,
        descending, so features that share the same truncated score still
        appear most-important first. Returns ``[]`` when there are no
        features or none exceeds the mean.
    """
    importance = np.asarray(ENSTREE.feature_importances_, dtype=float)
    if importance.size == 0:
        return []

    # Loop invariants: computed once instead of on every iteration.
    threshold = importance.mean()
    peak = importance.max()

    # Stable descending order: exactly equal importances keep column order.
    ranked = np.argsort(-importance, kind="stable")

    theList = []
    for i in ranked:
        imp_val = importance[i]
        if not imp_val > threshold:
            # Everything after this point is no larger, so it cannot qualify.
            break
        theList.append((varNames[i], int(imp_val / peak * 100)))
    return theList

In [68]:
# Feature names in the same order as the columns of X, for labelling importances.
feature_cols = X.columns.tolist()

In [71]:
# Predict real_hourly_wage with gradient boosting: 100 stages, depth-3 trees, fixed seed.
amt_m01_GB = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=3,
    random_state=511,
).fit(X_train, Y_train[TARGET_W])

a_Y_Pred_train = amt_m01_GB.predict(X_train)
a_Y_Pred_test = amt_m01_GB.predict(X_test)

# Root-mean-squared error on each split.
RMSE_Train = math.sqrt(metrics.mean_squared_error(Y_train[TARGET_W], a_Y_Pred_train))
RMSE_Test = math.sqrt(metrics.mean_squared_error(Y_test[TARGET_W], a_Y_Pred_test))
RMSE_GB = RMSE_Test  # keep the held-out score under a model-specific name

print("GB RMSE Train:", RMSE_Train)
print("GB RMSE Test:", RMSE_Test)

# Features whose importance is above the ensemble's mean importance.
vars_GB_amt = getEnsembleTreeVars(amt_m01_GB, feature_cols)
print(vars_GB_amt)

GB RMSE Train: 2.2247818142422404
GB RMSE Test: 7.61205893291046
[('longest_job_earnings', 100), ('income', 64), ('annual_work_hours', 27), ('wholesale_trade', 16), ('state_IL', 11)]
