In [1]:
# normal imports
import pandas as pd
import numpy as np

# ANN imports
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.utils import np_utils
from keras.optimizers import SGD, RMSprop

# sklearn imports
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Random seed shared across the notebook: the helper functions below pass it
# as random_state to train_test_split and RandomForestClassifier.
rs = 25

Using Theano backend.


In [2]:
# code to return train/test values using categorical data
def trainTestCategorical(X, y):
    """Split features/target 80/20 and encode the target in two formats.

    Parameters
    ----------
    X : pandas DataFrame or array-like
        Feature matrix, one row per sample.
    y : pandas Series or array-like
        Class label for each row of X.

    Returns
    -------
    X_train, X_test : numpy arrays
        Feature rows of the training and test splits.
    yc_train, yc_test : numpy float arrays
        One-hot targets for the ANN, one column per entry of ``le.classes_``.
        Both splits always have the same columns in the same order.
    ye_train, ye_test : numpy integer arrays
        Class codes produced by the label encoder, for the other models.
    le : LabelEncoder
        The fitted encoder; ``le.classes_[code]`` recovers the original label.

    The split uses the module-level seed ``rs`` as random_state.
    """
    # np.asarray accepts DataFrames/Series as well as plain arrays, and does
    # not rely on the deprecated DataFrame.as_matrix().
    features = np.asarray(X)
    labels = np.asarray(y)
    X_train, X_test, y_train, y_test = train_test_split(features, labels,
        test_size=0.2, random_state=rs)

    # Learn the class list from the complete target so that a class absent
    # from one split neither makes transform() fail nor changes the width of
    # the one-hot matrices.
    le = LabelEncoder()
    le.fit(labels)
    ye_train = le.transform(y_train)
    ye_test = le.transform(y_test)

    # set categorical targets formatted for ANN: one column per class.
    # Row i of the identity matrix is the one-hot vector for class code i.
    one_hot_rows = np.eye(len(le.classes_))
    yc_train = one_hot_rows[ye_train]
    yc_test = one_hot_rows[ye_test]
    return X_train, X_test, yc_train, yc_test, ye_train, ye_test, le

In [3]:
# function specifying layers of ANN model
def doSequentialModel(dim_input, dim_output, hidden_units=32, dropout_rate=0.25,
                      learning_rate=.05, loss='mse'):
    """Build and compile the feed-forward classifier used in this notebook.

    Architecture: Dense(tanh) -> Dropout -> Dense(relu) -> Dropout ->
    Dense(softmax), compiled with plain SGD and an accuracy metric.

    Parameters
    ----------
    dim_input : int
        Number of input features.
    dim_output : int
        Number of output units (one per class).
    hidden_units : int, optional
        Width of each of the two hidden layers (default 32).
    dropout_rate : float, optional
        Dropout fraction applied after each hidden layer (default 0.25).
    learning_rate : float, optional
        Learning rate passed to SGD (default 0.05).
    loss : str, optional
        Loss name passed to ``model.compile`` (default 'mse').

    Returns
    -------
    The compiled Sequential model.

    NOTE(review): 'mse' stays the default so existing results do not change,
    but with a softmax output and one-hot targets 'categorical_crossentropy'
    is the usual choice -- worth comparing.
    """
    model = Sequential()

    # first hidden layer also declares the input width
    model.add(Dense(hidden_units, input_dim=dim_input, activation='tanh'))
    model.add(Dropout(dropout_rate))

    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dropout(dropout_rate))

    # one softmax unit per class
    model.add(Dense(dim_output, activation='softmax'))

    sgd = SGD(lr=learning_rate)
    model.compile(optimizer=sgd, loss=loss, metrics=['accuracy'])

    return model

In [4]:
# function to create and fit ANN
def createFitANN(X_train, y_train, nb_epoch=20, batch_size=100):
    """Build the ANN sized to the data and fit it on the training split.

    Parameters
    ----------
    X_train : 2-D numpy array
        Training features; its column count sets the network's input width.
    y_train : 2-D numpy array
        One-hot training targets; its column count sets the output width.
    nb_epoch : int, optional
        Number of training epochs (default 20).
    batch_size : int, optional
        Mini-batch size (default 100).

    Returns
    -------
    The fitted model.
    """
    # Reset NumPy's global generator to the notebook seed so that repeated
    # runs start from the same random state.
    # NOTE(review): this assumes the Keras backend in use draws its weight
    # initialisation and batch shuffling from NumPy's global RNG -- confirm
    # for the installed version.
    np.random.seed(rs)
    n_features = X_train.shape[1]
    n_classes = y_train.shape[1]
    model = doSequentialModel(n_features, n_classes)
    model.fit(X_train, y_train, nb_epoch=nb_epoch, batch_size=batch_size)
    return model

In [5]:
def evalClassifier(y_true, y_pred):
    """Print the confusion matrix, classification report and accuracy score.

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned with y_true.

    Each heading is followed by a newline and then the metric. Every print
    call receives a single pre-formatted string, so the same text is produced
    whether ``print`` is the Python 2 statement or the Python 3 function.
    """
    # the value is wrapped in a 1-tuple so %-formatting always treats it as a
    # single argument, even when it is an array
    print('confusion matrix \n%s' % (confusion_matrix(y_true, y_pred),))
    print('classification report \n%s' % (classification_report(y_true, y_pred),))
    print('accuracy score \n%s' % (accuracy_score(y_true, y_pred),))

In [6]:
def fitEvalRFClassifier(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and print its test metrics.

    The forest is created with n_jobs=-1 and the notebook seed ``rs`` as
    random_state. Its predictions for X_test are passed, together with
    y_test, to evalClassifier, which prints the results. Nothing is returned.
    """
    forest = RandomForestClassifier(n_jobs=-1, random_state=rs).fit(X_train, y_train)
    evalClassifier(y_test, forest.predict(X_test))

In [7]:
# import data and format it
data = pd.read_csv('../Data/gss_subset_cleaned.csv')
# Build the modelling frame in one chained expression instead of mutating a
# filtered frame in place:
#   1. keep rows whose 'year' value is greater than 2005
#   2. drop the columns not used below ('year' itself goes once the filter has used it)
#   3. for initial model, just drop all na
data = (
    data[data['year'] > 2005]
    .drop(['paeduc', 'maeduc', 'speduc', 'income', 'satjob', 'goodlife', 'health', 'year'], axis=1)
    .dropna()
)

data.columns

Index([u'marital', u'divorce', u'sibs', u'childs', u'age', u'educ', u'sex',
       u'hompop', u'babies', u'preteen', u'teens', u'adults', u'earnrs',
       u'polviews', u'happy', u'weekswrk', u'satfin', u'dwelling', u'dwelown',
       u'hhrace'],
      dtype='object')

# ANN and RF for predicting happy vs unhappy

In [8]:
# Target column; exclude_cols lists every column withheld from the features.
target = 'happy'
exclude_cols = [target]

# y is a boolean target: True where the 'happy' code is greater than 1.
# X holds the remaining columns, dummy-encoded with the first level dropped.
y = data[target] > 1
X = pd.get_dummies(data.drop(exclude_cols, axis=1), drop_first=True)

# Share of each raw target value, as a threshold for beating guessing.
# NOTE(review): these proportions are for the raw 'happy' codes, not for the
# binarised y the models are trained on; the majority share of y is the more
# relevant guessing baseline.
data[target].value_counts().div(data[target].count()).sort_index()

1.0    0.142291
2.0    0.559948
3.0    0.297761
Name: happy, dtype: float64

In [9]:
# Split once; yc_* are the one-hot targets for the ANN, ye_* the encoded
# labels for the random forest, and le is the fitted label encoder.
(X_train, X_test,
 yc_train, yc_test,
 ye_train, ye_test,
 le) = trainTestCategorical(X, y)

In [14]:
# Train the network on the one-hot targets, then score it on the held-out
# split; the cell displays the [loss, accuracy] list from model.evaluate.
model = createFitANN(X_train=X_train, y_train=yc_train)
print('testing set evaluation')
model.evaluate(X_test, yc_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing set evaluation

[0.12277045975366753, 0.85444495604917892]

In [12]:
# Random forest on the same split, using the label-encoded targets.
fitEvalRFClassifier(X_train=X_train, X_test=X_test, y_train=ye_train, y_test=ye_test)

confusion matrix 
[[  58  258]
 [  96 1759]]
classification report 
             precision    recall  f1-score   support

          0       0.38      0.18      0.25       316
          1       0.87      0.95      0.91      1855

avg / total       0.80      0.84      0.81      2171

accuracy score 
0.836941501612


# ANN and RF for predicting marital status

In [15]:
# Target column; exclude_cols lists every column withheld from the features.
# 'divorce' is withheld as well: including divorce status would be cheating.
target = 'marital'
exclude_cols = ['divorce', target]

# y keeps the original category labels; X holds the remaining columns,
# dummy-encoded with the first level dropped.
y = data[target]
X = pd.get_dummies(data.drop(exclude_cols, axis=1), drop_first=True)

# Share of each target value, as a threshold for beating guessing.
data[target].value_counts().div(data[target].count()).sort_index()

divorced         0.161183
married          0.460787
never_married    0.260621
separated        0.032532
widowed          0.084877
Name: marital, dtype: float64

In [16]:
# Split once; yc_* are the one-hot targets for the ANN, ye_* the encoded
# labels for the random forest, and le is the fitted label encoder.
(X_train, X_test,
 yc_train, yc_test,
 ye_train, ye_test,
 le) = trainTestCategorical(X, y)

In [17]:
# Train the network on the one-hot targets, then score it on the held-out
# split; the cell displays the [loss, accuracy] list from model.evaluate.
model = createFitANN(X_train=X_train, y_train=yc_train)
print('testing set evaluation')
model.evaluate(X_test, yc_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing set evaluation

[0.11649146525061443, 0.54813450036758327]

In [18]:
# Random forest on the same split, using the label-encoded targets.
fitEvalRFClassifier(X_train=X_train, X_test=X_test, y_train=ye_train, y_test=ye_test)

confusion matrix 
[[153  97  75   2  49]
 [ 30 886  41   1   4]
 [ 55  96 401   0   8]
 [ 25  24  28   0   7]
 [ 47  31  22   0  89]]
classification report 
             precision    recall  f1-score   support

          0       0.49      0.41      0.45       376
          1       0.78      0.92      0.85       962
          2       0.71      0.72      0.71       560
          3       0.00      0.00      0.00        84
          4       0.57      0.47      0.51       189

avg / total       0.66      0.70      0.68      2171

accuracy score 
0.704283740212


# ANN and RF for predicting gender

In [19]:
# Target column; exclude_cols lists every column withheld from the features.
target = 'sex'
exclude_cols = [target]

# y keeps the original category labels; X holds the remaining columns,
# dummy-encoded with the first level dropped.
y = data[target]
X = pd.get_dummies(data.drop(exclude_cols, axis=1), drop_first=True)

# Share of each target value, as a threshold for beating guessing.
data[target].value_counts().div(data[target].count()).sort_index()

Female    0.553405
Male      0.446595
Name: sex, dtype: float64

In [20]:
# Split once; yc_* are the one-hot targets for the ANN, ye_* the encoded
# labels for the random forest, and le is the fitted label encoder.
(X_train, X_test,
 yc_train, yc_test,
 ye_train, ye_test,
 le) = trainTestCategorical(X, y)

# Train the network on the one-hot targets, then score it on the held-out
# split; the cell displays the [loss, accuracy] list from model.evaluate.
model = createFitANN(X_train=X_train, y_train=yc_train)
print('testing set evaluation')
model.evaluate(X_test, yc_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing set evaluation

[0.2439257891935338, 0.54813450028521848]

In [21]:
# Random forest on the same split, using the label-encoded targets.
fitEvalRFClassifier(X_train=X_train, X_test=X_test, y_train=ye_train, y_test=ye_test)

confusion matrix 
[[877 291]
 [566 437]]
classification report 
             precision    recall  f1-score   support

          0       0.61      0.75      0.67      1168
          1       0.60      0.44      0.50      1003

avg / total       0.60      0.61      0.59      2171

accuracy score 
0.605251036389


# ANN and RF for predicting home ownership/rental/other status

In [23]:
# Target column; exclude_cols lists every column withheld from the features.
target = 'dwelown'
exclude_cols = [target]

# y keeps the original category labels; X holds the remaining columns,
# dummy-encoded with the first level dropped.
y = data[target]
X = pd.get_dummies(data.drop(exclude_cols, axis=1), drop_first=True)

# Share of each target value, as a threshold for beating guessing.
data[target].value_counts().div(data[target].count()).sort_index()

other    0.343931
owns     0.427057
rents    0.229011
Name: dwelown, dtype: float64

In [24]:
# Split once; yc_* are the one-hot targets for the ANN, ye_* the encoded
# labels for the random forest, and le is the fitted label encoder.
(X_train, X_test,
 yc_train, yc_test,
 ye_train, ye_test,
 le) = trainTestCategorical(X, y)

# Train the network on the one-hot targets, then score it on the held-out
# split; the cell displays the [loss, accuracy] list from model.evaluate.
model = createFitANN(X_train=X_train, y_train=yc_train)
print('testing set evaluation')
model.evaluate(X_test, yc_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing set evaluation

[0.21305623510530719, 0.42238599708529456]

In [25]:
# Random forest on the same split, using the label-encoded targets.
fitEvalRFClassifier(X_train=X_train, X_test=X_test, y_train=ye_train, y_test=ye_test)

confusion matrix 
[[276 330 159]
 [288 522  71]
 [184 112 229]]
classification report 
             precision    recall  f1-score   support

          0       0.37      0.36      0.36       765
          1       0.54      0.59      0.57       881
          2       0.50      0.44      0.47       525

avg / total       0.47      0.47      0.47      2171

accuracy score 
0.473053892216


# ANN and RF for predicting whether or not a person has children

In [27]:
# Target column; exclude_cols lists every column withheld from the features
# (here four household-composition columns plus the target itself).
target = 'childs'
exclude_cols = ['hompop', 'babies', 'preteen', 'teens', target]

# y is a boolean target: True where the 'childs' value is greater than 0.
# X holds the remaining columns, dummy-encoded with the first level dropped.
y = data[target] > 0
X = pd.get_dummies(data.drop(exclude_cols, axis=1), drop_first=True)

# Share of each raw target value, as a threshold for beating guessing.
# NOTE(review): these proportions are for the raw 'childs' counts, not for
# the binarised y the models are trained on; the majority share of y is the
# more relevant guessing baseline.
data[target].value_counts().div(data[target].count()).sort_index()

0.0    0.272233
1.0    0.154917
2.0    0.263662
3.0    0.160630
4.0    0.080546
5.0    0.033545
6.0    0.016865
7.0    0.007926
8.0    0.009677
Name: childs, dtype: float64

In [28]:
# Split once; yc_* are the one-hot targets for the ANN, ye_* the encoded
# labels for the random forest, and le is the fitted label encoder.
(X_train, X_test,
 yc_train, yc_test,
 ye_train, ye_test,
 le) = trainTestCategorical(X, y)

# Train the network on the one-hot targets, then score it on the held-out
# split; the cell displays the [loss, accuracy] list from model.evaluate.
model = createFitANN(X_train=X_train, y_train=yc_train)
print('testing set evaluation')
model.evaluate(X_test, yc_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
testing set evaluation

[0.1567740733312715, 0.78857669246630535]

In [29]:
# Random forest on the same split, using the label-encoded targets.
fitEvalRFClassifier(X_train=X_train, X_test=X_test, y_train=ye_train, y_test=ye_test)

confusion matrix 
[[ 355  236]
 [ 148 1432]]
classification report 
             precision    recall  f1-score   support

          0       0.71      0.60      0.65       591
          1       0.86      0.91      0.88      1580

avg / total       0.82      0.82      0.82      2171

accuracy score 
0.8231229848
