In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [2]:
traindat = pd.read_csv('../dat/clean.data')
#traindat.head()

In [3]:
testdat = pd.read_csv('../dat/clean.test')
#testdat.head()

How many ppl are female in the training and test sets?

In [4]:
print('percentage female in training set:', np.count_nonzero(traindat['sex_ Female'])/traindat.shape[0])

percentage female in training set: 0.3243153637026722


In [5]:
print('percentage female in test set:', np.count_nonzero(testdat['sex_ Female'])/testdat.shape[0])

percentage female in test set: 0.3262284196547145


What percentages of males and females are creditworthy in the train and the test sets?

In [6]:
female_train = traindat.loc[traindat[traindat['sex_ Female'] == 1].index]
male_train = traindat.loc[traindat[traindat['sex_ Male'] == 1].index]
female_test = testdat.loc[testdat[testdat['sex_ Female'] == 1].index]
male_test = testdat.loc[testdat[testdat['sex_ Male'] ==1].index]

female_test_idx = testdat[testdat['sex_ Female'] == 1].index
male_test_idx = testdat[testdat['sex_ Male'] == 1].index

print('percentage of approved females in train set', np.count_nonzero(female_train['>50k $/yr'])/female_train.shape[0])
print('percentage of approved males in train set', np.count_nonzero(male_train['>50k $/yr'])/male_train.shape[0])
print('percentage of approved females in test set', np.count_nonzero(female_test['>50k $/yr'])/female_test.shape[0])
print('percentage of approved males in test set', np.count_nonzero(male_test['>50k $/yr'])/male_test.shape[0])

percentage of approved females in train set 0.11367818442036394
percentage of approved males in train set 0.3138370951913641
percentage of approved females in test set 0.11337268471402402
percentage of approved males in test set 0.30974672316940965


In [7]:
# for rebalancing, later
female_approved_train = traindat[(traindat['sex_ Female'] ==1) & (traindat['>50k $/yr'] == 1)]

## model time
Build a simple model to predict credit worthiness:

In [8]:
trainlbl = traindat['>50k $/yr']
traindat.drop('>50k $/yr', axis=1, inplace=True)

testlbl = testdat['>50k $/yr']
testdat.drop('>50k $/yr', axis=1, inplace=True)
testdat.insert(loc=61, column='native-country_ Holand-Netherlands', value=0)

In [9]:
rfc = RandomForestClassifier(n_estimators=50, n_jobs=-1)

rfc.fit(traindat, trainlbl)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
biased_output = rfc.predict(testdat)
biased_accuracy = 1-np.count_nonzero(testlbl-biased_output)/testlbl.shape[0]
print(biased_accuracy)

0.8404382470119522


How many females did the model approve?

In [11]:
modeled_females = biased_output[female_test_idx]
print('biased model approved', np.count_nonzero(modeled_females)/ modeled_females.shape[0], 'percent of females')

biased model approved 0.08731935680846734 percent of females


How many males did the model approve?

In [12]:
modeled_males = biased_output[male_test_idx]
print('biased model approved', np.count_nonzero(modeled_males)/ modeled_males.shape[0], 'percent of males')

biased model approved 0.2783088597615059 percent of males


## rebalance data by minority oversampling
one could perform a more sophisticated analysis by using SMOTE or similar.

In [13]:
rebalanced = pd.read_csv('../dat/clean.data')
rebalanced.shape

(30162, 88)

In [14]:
female_approved_train.shape

(1112, 88)

We'll oversample 3x, since there are 3x less approved females than approved males in the train set.

In [15]:
for i in range(300):
    rebalanced = rebalanced.append(female_approved_train)

rebalanced.shape

(363762, 88)

shuffle the new rebalanced dataset, so we don't learn anything about order

In [16]:
rebalanced = rebalanced.sample(frac=1).reset_index(drop=True)

sep labels and data

In [17]:
reballbl = rebalanced['>50k $/yr']
rebalanced.drop('>50k $/yr', axis=1, inplace=True)

remodel

In [18]:
rfc.fit(rebalanced, reballbl)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
unbiased_output = rfc.predict(testdat)
unbiased_accuracy = 1-np.count_nonzero(testlbl-unbiased_output)/testlbl.shape[0]
print(unbiased_accuracy)

0.8346613545816733


How many females did the model approve?

In [20]:
modeled_females = unbiased_output[female_test_idx]
print('unbiased model approved', np.count_nonzero(modeled_females)/ modeled_females.shape[0], 'percent of females')

unbiased model approved 0.11052310197435375 percent of females


How many males did the model approve?

In [21]:
modeled_males = unbiased_output[male_test_idx]
print('unbiased model approved', np.count_nonzero(modeled_males)/ modeled_males.shape[0], 'percent of males')

unbiased model approved 0.27683059032226276 percent of males
