### Data Preprocessing and Vanilla Logistic Regression with L2-Regularizer (or elastic net?)

In [135]:
% matplotlib inline

import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error

from matplotlib import pyplot as plt

In [136]:
# Reading data
data = pd.read_csv('censusDataset.csv',
    names=[
        "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"],
        sep=r'\s*,\s*',
        engine='python',
        na_values="?")

In [137]:
#Dropping irrelevant columns

data = data.drop(["fnlwgt", "Education", "Martial Status"], axis = 1)
#print(data.head(15).values.tolist())

In [138]:
# Checking is there are nans in target
check_target_nan_df = data['Target']
print(len(check_target_nan_df) - check_target_nan_df.count())
# OK :)

#print(len(check_target_nan_df))
#print(encoded_data.count())

0


In [139]:
# Encoding Categorical Data

# uncomment if not dropping columns
# encoded_data = pd.get_dummies(data, columns=["Workclass", "Race", "Martial Status", "Occupation", "Relationship", "Sex", "Country", "Target", "Education"], prefix=["workclass", "race", "martial_status", "occupation", "relationship", "sex", "country", "target", "education"])

# if dropping columns
encoded_data = pd.get_dummies(data, columns=["Workclass", "Race", "Occupation", "Relationship", "Sex", "Country", "Target"], prefix=["workclass", "race", "occupation", "relationship", "sex", "country", "target"])


#print(encoded_data)
#print(encoded_data.columns.values.tolist())

#print(len(check_target_nan_df))
#print(encoded_data.count())


# remove one column of each categorical class for linear independence, remove the most represented class to minimize colinearity
# Note that this also takes care of the NaNs --> as if we replaced NaNs with most frequent class
encoded_data = encoded_data.drop(['workclass_Private'], axis = 1)
encoded_data = encoded_data.drop(['race_White'], axis = 1)
#encoded_data = encoded_data.drop(['martial_status_Married-civ-spouse'], axis = 1)
encoded_data = encoded_data.drop(['occupation_Prof-specialty'], axis = 1)
encoded_data = encoded_data.drop(['relationship_Husband'], axis = 1)
encoded_data = encoded_data.drop(['sex_Male'], axis = 1)
encoded_data = encoded_data.drop(['country_United-States'], axis = 1)
#encoded_data = encoded_data.drop(['education_HS-grad'], axis = 1)
encoded_data = encoded_data.drop(['target_<=50K'], axis = 1)

#print(encoded_data)
print(encoded_data.columns.values.tolist())

['Age', 'Education-Num', 'Capital Gain', 'Capital Loss', 'Hours per week', 'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Never-worked', 'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc', 'workclass_State-gov', 'workclass_Without-pay', 'race_Amer-Indian-Eskimo', 'race_Asian-Pac-Islander', 'race_Black', 'race_Other', 'occupation_Adm-clerical', 'occupation_Armed-Forces', 'occupation_Craft-repair', 'occupation_Exec-managerial', 'occupation_Farming-fishing', 'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct', 'occupation_Other-service', 'occupation_Priv-house-serv', 'occupation_Protective-serv', 'occupation_Sales', 'occupation_Tech-support', 'occupation_Transport-moving', 'relationship_Not-in-family', 'relationship_Other-relative', 'relationship_Own-child', 'relationship_Unmarried', 'relationship_Wife', 'sex_Female', 'country_Cambodia', 'country_Canada', 'country_China', 'country_Columbia', 'country_Cuba', 'country_Dominican-Republic', 'country_Ecuador', 'count

In [140]:
# Centering and normalizing continuous columns entries
encoded_data["Age"] = (encoded_data["Age"] - np.mean(encoded_data["Age"], axis=0))/np.std(encoded_data["Age"], axis=0)
#encoded_data["fnlwgt"] = (encoded_data["fnlwgt"] - np.mean(encoded_data["fnlwgt"], axis=0))/np.std(encoded_data["fnlwgt"], axis=0)
encoded_data["Education-Num"] = (encoded_data["Education-Num"] - np.mean(encoded_data["Education-Num"], axis=0))/np.std(encoded_data["Education-Num"], axis=0)
encoded_data["Capital Gain"] = (encoded_data["Capital Gain"] - np.mean(encoded_data["Capital Gain"], axis=0))/np.std(encoded_data["Capital Gain"], axis=0)
encoded_data["Hours per week"] = (encoded_data["Hours per week"] - np.mean(encoded_data["Hours per week"], axis=0))/np.std(encoded_data["Hours per week"], axis=0)


In [141]:
# Splitting test/train set; useless since Logistic Regression CV?

from sklearn.model_selection import train_test_split

X = encoded_data.drop('target_>50K', axis = 1);
y = encoded_data['target_>50K'];

#print(X.size)
#print(X)

percentage_train = 0.8;
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-percentage_train, random_state=42)

#print(X_train.size)
#print(X_test)


In [142]:
# Run first plain vanilla logistic regression, l2 regularization

from sklearn.linear_model import LogisticRegressionCV

vanillaLogisticRegression = LogisticRegressionCV(
        Cs=list(np.power(10.0, np.arange(-10, 10)))
        ,penalty='l2'
        ,scoring='roc_auc'
        ,cv=10
        ,random_state=777
        ,solver = 'liblinear'
    )

vanillaLogisticRegression.fit(X, y)
    

print ('Max auc_roc:', vanillaLogisticRegression.scores_[1].max())
print ('Class coeffs:', vanillaLogisticRegression.coef_)

Max auc_roc: 0.911977928307
Class coeffs: [[  3.58613138e-01   7.74437606e-01   2.18171021e+00   6.35595861e-04
    3.93015575e-01   5.25450145e-01  -3.80395874e-02  -2.72605480e-02
    2.21170382e-01  -4.02407194e-01  -1.50704032e-01  -1.89736685e-01
   -3.98706072e-01  -1.36916891e-01  -2.03669460e-01  -3.33523173e-01
   -2.10813788e-01  -7.34437484e-02  -9.40093014e-02   5.35439921e-01
   -1.10136130e+00  -7.45879739e-01  -4.07538031e-01  -9.50511195e-01
   -4.53335722e-01   2.70484372e-01   6.13822672e-02   3.64728019e-01
   -2.60529806e-01  -1.77506911e+00  -1.55630414e+00  -2.59951854e+00
   -1.68007660e+00   1.28855646e+00  -8.37361543e-01   2.75183616e-01
    7.13915875e-02  -2.36222568e-01  -4.45115618e-01   1.27548171e-01
   -3.38497579e-01  -7.21903231e-02  -2.33517386e-01   8.33969367e-02
    1.20725930e-01   1.79915036e-01  -2.91753311e-01  -6.15736284e-02
   -4.15811949e-02  -8.10170721e-03  -4.12419247e-02  -1.75429413e-04
   -4.58240865e-02  -1.07208092e-01  -3.33911575

In [143]:
# Run now plain vanilla logistic regression, lasso regularization

from sklearn.linear_model import LogisticRegressionCV

vanillaLogisticRegression2 = LogisticRegressionCV(
        Cs=list(np.power(10.0, np.arange(-10, 10)))
        ,penalty='l1'
        ,scoring='roc_auc'
        ,cv=10
        ,random_state=777
        ,solver = 'liblinear'
    )

vanillaLogisticRegression2.fit(X, y)
    

print ('Max auc_roc:', vanillaLogisticRegression2.scores_[1].max())
print ('Class coeffs:', vanillaLogisticRegression2.coef_)

Max auc_roc: 0.912030558583
Class coeffs: [[  3.45006061e-01   7.68231202e-01   2.31700132e+00   6.43612195e-04
    3.97814733e-01   5.60740780e-01  -4.81200782e-02   0.00000000e+00
    2.26809127e-01  -4.01646879e-01  -1.64756660e-01  -1.00110380e+00
   -5.49881797e-01  -8.34903449e-02  -1.92859519e-01  -4.07671636e-01
   -2.56754410e-01   0.00000000e+00  -1.54742414e-01   5.04133662e-01
   -1.30046129e+00  -8.90491928e-01  -4.87063678e-01  -1.06640180e+00
   -2.16659038e+00   2.43404571e-01   1.56835583e-02   3.47428753e-01
   -3.29431387e-01  -1.86748370e+00  -2.03135625e+00  -3.03669934e+00
   -1.84371387e+00   1.26678232e+00  -8.00234501e-01   9.58815491e-01
    4.04252559e-02  -4.58043800e-01  -1.59863625e+00   1.59314364e-01
   -1.28764302e+00   0.00000000e+00  -4.99184119e-01   5.93875815e-02
    1.67401077e-01   2.26761240e-01  -8.54170721e-01   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00  -1.71316428e-01   0.00000000