In [51]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import tree

# Load the training partition into a DataFrame.
# Forward slashes are accepted by Python's file APIs on Windows as well as on
# POSIX systems, whereas a backslash-separated relative path only resolves on
# Windows.
census = pd.read_csv('../partition/train.csv')

In [52]:
# Split the label off the feature frame: keep `over_50k` as its own Series
# and remove that column from `census` in place.
target = census['over_50k']
del census['over_50k']
target.head()

0    0
1    1
2    1
3    0
4    1
Name: over_50k, dtype: int64

In [53]:
# Preview the first five rows of the remaining predictor columns.
census.head(n=5)

Unnamed: 0,id,age,workclass_id,education_level_id,education_num,marital_status_id,occupation_id,race_id,sex_id,capital_gain,capital_loss,hours_week,country_id,workclass,education_level,marital_status,occupation,race,sex,country
0,11289,22,5,16,10,5,2,3,2,0,0,30,7,Private,Some-college,Never-married,Adm-clerical,Black,Male,Dominican-Republic
1,34529,43,8,11,16,3,11,5,1,0,0,40,40,State-gov,Doctorate,Married-civ-spouse,Prof-specialty,White,Female,United-States
2,28681,45,5,10,13,3,5,5,1,0,0,40,40,Private,Bachelors,Married-civ-spouse,Exec-managerial,White,Female,United-States
3,21496,41,5,12,9,3,4,5,2,0,0,45,40,Private,HS-grad,Married-civ-spouse,Craft-repair,White,Male,United-States
4,27291,56,5,12,9,3,5,5,2,0,0,40,40,Private,HS-grad,Married-civ-spouse,Exec-managerial,White,Male,United-States


In [54]:
# First pass at a model: restrict the inputs to the continuous columns only.
continuous_cols = ["age", "education_num", "capital_gain", "capital_loss", "hours_week"]
cont_only = census[continuous_cols]

In [55]:
# Preview the first five rows of the continuous-only frame.
cont_only.head(n=5)

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_week
0,22,10,0,0,30
1,43,16,0,0,40
2,45,13,0,0,40
3,41,9,0,0,45
4,56,9,0,0,40


In [56]:
# Logistic regression on the continuous predictors only.
# C is scikit-learn's inverse regularization strength, so the very large
# C=1e20 makes the penalty term negligible; tol and max_iter are the
# optimizer's stopping tolerance and iteration cap.
logreg1 = LogisticRegression(C=1e20, tol=.00000001, max_iter=50)
# Other solvers too (e.g. sag, newton-cg, lbfgs, liblinear)

# Fit the model on the continuous-only training frame
logreg1.fit(cont_only, target)

# Show the fitted coefficients, followed by the column names they belong to.
# A single-argument print(...) call produces the same output on Python 2 and
# Python 3, unlike the Python 2-only print statement.
print(logreg1.coef_)
list(cont_only)

[[  4.12735661e-02   3.24655630e-01   3.24227858e-04   6.88100436e-04
    3.97932204e-02]]


['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_week']

In [57]:
# Tabulate the fitted coefficients of logreg1 against their predictor names.
coeff = logreg1.coef_          # 2-D array; the single row holds the coefficients
pred_vars = list(cont_only)    # predictor names, in the column order used for fitting

# Pair names and values through an explicit index instead of an intermediate
# dict, then sort by name so the row order does not depend on dict ordering
# or on how the installed pandas version orders dict keys.
coefficients = pd.Series(coeff[0], index=pred_vars).sort_index()
coefficients1 = coefficients.to_frame('coeff')
coefficients1

Unnamed: 0,coeff
age,0.041274
capital_gain,0.000324
capital_loss,0.000688
education_num,0.324656
hours_week,0.039793


In [58]:
# Predicted class probabilities for the rows of cont_only (one column per class)
probs1 = logreg1.predict_proba(cont_only)

# Keep only the second column (index 1): the probability for over_50k
prob1 = probs1[:, 1]

# Calculate and print AUC.
# NOTE: this scores the same rows the model was fitted on, so it is an
# in-sample figure rather than a held-out estimate.
# The %-formatted single-argument print(...) works on both Python 2 and 3.
print("AUC: %s" % metrics.roc_auc_score(target, prob1))

AUC: 0.830691071172


In [59]:
# Create indicator (dummy) columns for marital_status and occupation and join
# them onto census. The new columns are named after the raw category values.
m_dummies = pd.get_dummies(census['marital_status'])
o_dummies = pd.get_dummies(census['occupation'])

# Make the cell safe to re-run: if a previous execution already attached these
# indicator columns, drop them first so the join is not attempted against
# overlapping column names.
# NOTE(review): this assumes no category value collides with the name of an
# original census column - confirm if the category set changes.
stale_cols = [col for col in list(m_dummies) + list(o_dummies)
              if col in census.columns]
census = census.drop(stale_cols, axis=1).join([m_dummies, o_dummies])

In [60]:
# List every column name now present in census, including the new indicators.
census.columns.tolist()

['id',
 'age',
 'workclass_id',
 'education_level_id',
 'education_num',
 'marital_status_id',
 'occupation_id',
 'race_id',
 'sex_id',
 'capital_gain',
 'capital_loss',
 'hours_week',
 'country_id',
 'workclass',
 'education_level',
 'marital_status',
 'occupation',
 'race',
 'sex',
 'country',
 'Divorced',
 'Married-AF-spouse',
 'Married-civ-spouse',
 'Married-spouse-absent',
 'Never-married',
 'Separated',
 'Widowed',
 '?',
 'Adm-clerical',
 'Armed-Forces',
 'Craft-repair',
 'Exec-managerial',
 'Farming-fishing',
 'Handlers-cleaners',
 'Machine-op-inspct',
 'Other-service',
 'Priv-house-serv',
 'Prof-specialty',
 'Protective-serv',
 'Sales',
 'Tech-support',
 'Transport-moving']

In [61]:
# Build the second model's input frame: the continuous predictors followed by
# the marital-status indicators and then the occupation indicators.
numeric_part = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_week']
marital_part = ['Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
                'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed']
occupation_part = ['?', 'Adm-clerical', 'Armed-Forces', 'Craft-repair',
                   'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners',
                   'Machine-op-inspct', 'Other-service', 'Priv-house-serv',
                   'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support',
                   'Transport-moving']
cont_o_m = census[numeric_part + marital_part + occupation_part]

In [62]:
# Preview the first five rows of the combined continuous + indicator frame.
cont_o_m.head(n=5)

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_week,Divorced,Married-AF-spouse,Married-civ-spouse,Married-spouse-absent,Never-married,...,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving
0,22,10,0,0,30,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,43,16,0,0,40,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,45,13,0,0,40,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,41,9,0,0,45,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,56,9,0,0,40,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# Second logistic model: continuous predictors plus the marital-status and
# occupation indicator columns, with the same settings as the first model
# (very large C, i.e. a negligible regularization penalty).
logreg2 = LogisticRegression(C=1e20, tol=.00000001, max_iter=50)

# Fit the model, then show the coefficients and the column names they map to.
# A single-argument print(...) call produces the same output on Python 2 and
# Python 3, unlike the Python 2-only print statement.
logreg2.fit(cont_o_m, target)
print(logreg2.coef_)
list(cont_o_m)

[[  2.35100493e-02   2.80319906e-01   3.26048995e-04   6.57884968e-04
    2.92480180e-02  -1.56124688e+00   9.64584501e-01   7.62384450e-01
   -1.63545931e+00  -2.00825269e+00  -1.42969871e+00  -1.48565678e+00
   -9.58708991e-01  -1.69957568e-01   8.11078642e-01  -1.80634349e-01
    5.87197165e-01  -1.58943520e+00  -1.05350832e+00  -4.95650123e-01
   -1.17512611e+00  -2.88149099e+00   3.59381227e-01   2.30559381e-01
    9.45330399e-02   3.02187403e-01  -2.73770614e-01]]


['age',
 'education_num',
 'capital_gain',
 'capital_loss',
 'hours_week',
 'Divorced',
 'Married-AF-spouse',
 'Married-civ-spouse',
 'Married-spouse-absent',
 'Never-married',
 'Separated',
 'Widowed',
 '?',
 'Adm-clerical',
 'Armed-Forces',
 'Craft-repair',
 'Exec-managerial',
 'Farming-fishing',
 'Handlers-cleaners',
 'Machine-op-inspct',
 'Other-service',
 'Priv-house-serv',
 'Prof-specialty',
 'Protective-serv',
 'Sales',
 'Tech-support',
 'Transport-moving']

In [49]:
# Tabulate the fitted coefficients of logreg2 against their predictor names.
coeff = logreg2.coef_         # 2-D array; the single row holds the coefficients
pred_vars = list(cont_o_m)    # predictor names, in the column order used for fitting

# Pair names and values through an explicit index instead of an intermediate
# dict, then sort by name so the row order does not depend on dict ordering
# or on how the installed pandas version orders dict keys.
coefficients = pd.Series(coeff[0], index=pred_vars).sort_index()
coefficients2 = coefficients.to_frame('coefficients')
coefficients2

Unnamed: 0,coefficients
?,-0.958709
Adm-clerical,-0.169958
Armed-Forces,0.811079
Craft-repair,-0.180634
Divorced,-1.561247
Exec-managerial,0.587197
Farming-fishing,-1.589435
Handlers-cleaners,-1.053508
Machine-op-inspct,-0.49565
Married-AF-spouse,0.964585


In [64]:
# Predicted class probabilities for the rows of cont_o_m (one column per class)
probs2 = logreg2.predict_proba(cont_o_m)

# Keep only the second column (index 1): the probability for over_50k
prob2 = probs2[:, 1]

# Calculate and print AUC.
# NOTE: this scores the same rows the model was fitted on, so it is an
# in-sample figure rather than a held-out estimate.
# The %-formatted single-argument print(...) works on both Python 2 and 3.
print("AUC: %s" % metrics.roc_auc_score(target, prob2))

AUC: 0.905765785929


In [None]:
# Take the first six column names of `df` as the feature list and display it.
# NOTE(review): `df` is not assigned in any cell visible in this notebook (the
# data loaded above is named `census`), so this cell raises NameError on a
# fresh kernel - confirm which frame was intended.
features = list(df.columns[:6])
features

In [None]:
# Fit a decision tree classifier that predicts the "Hired" column of `df`
# from the columns named in `features`.
# NOTE(review): neither `df` nor a "Hired" column appears anywhere else in the
# visible notebook (the census label is `over_50k`, held in `target`) - this
# looks like it was carried over from a different dataset; confirm the inputs.
y = df["Hired"]
X = df[features]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X,y)

In [None]:
from IPython.display import Image
import pydot

# sklearn.externals.six is not available in newer scikit-learn releases; fall
# back to the standard-library StringIO when the import fails.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO

# Write the fitted tree `clf` as Graphviz DOT text into an in-memory buffer,
# labelling the nodes with the names in `features`.
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data,
                     feature_names=features)

# Parse the DOT text. Newer pydot versions return a list of graphs while older
# ones return a single graph object, so unwrap the first graph when needed.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
if isinstance(graph, (list, tuple)):
    graph = graph[0]

# Render the graph to PNG bytes and display the image inline.
Image(graph.create_png())