# Data Acquisition and Processing

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Get and process input data

# Fixed-width record layout: 1-based start column -> (variable name, field width).
var = {
    1: ('WHITE', 1), 2: ('ALCHY', 1), 3: ('JUNKY', 1), 4: ('SUPER', 1),
    5: ('MARRIED', 1), 6: ('FELON', 1), 7: ('WORKREL', 1), 8: ('PROPTY', 1),
    9: ('PERSON', 1), 10: ('MALE', 1), 11: ('PRIORS', 2), 13: ('SCHOOL', 2),
    15: ('RULE', 2), 17: ('AGE', 3), 20: ('TSERVD', 3),
    23: ('FOLLOW', 2), 25: ('RECID', 1), 26: ('TIME', 2), 28: ('FILE', 1),
}


def cleanData(data):
    """Parse fixed-width records into a DataFrame of integer variables.

    Parameters
    ----------
    data : iterable of str or bytes
        One record per element, laid out as described by the module-level
        ``var`` mapping. Surrounding whitespace (including the newline) is
        stripped before slicing; elements that are empty after stripping are
        skipped.

    Returns
    -------
    pandas.DataFrame
        One row per retained record and one column per variable in ``var``,
        in column-position order, minus the ``TIME``, ``FILE`` and ``FOLLOW``
        columns. Rows with ``FILE == 3`` (incomplete data points) are removed.
        The index is not reset after filtering.

    Raises
    ------
    ValueError
        If a field cannot be converted with ``int``.
    """
    # Walk the layout in ascending start column so that values and column
    # names line up without relying on dict iteration order.
    layout = [(start - 1, start - 1 + width, name)
              for start, (name, width) in sorted(var.items())]
    cols = [name for _, _, name in layout]

    res = []
    for line in data:
        line = line.strip()
        if not line:
            # A blank line would otherwise become an all-NaN row.
            continue
        # Fields that start past the end of a short record are left out,
        # exactly as the original position-by-position scan did.
        res.append([int(line[lo:hi]) for lo, hi, _ in layout if lo < len(line)])

    ret = pd.DataFrame(data=res, columns=cols)
    ret = ret[ret.FILE != 3]  # Remove incomplete data points

    # Remove some irrelevant columns
    return ret.drop(['TIME', 'FILE', 'FOLLOW'], axis=1)
    

# Read the two raw cohort files. The context managers close the file handles
# as soon as the lines have been read instead of leaving them open.
with open('data/1978.txt', 'rb') as fh:
    raw_1978 = fh.readlines()
with open('data/1980.txt', 'rb') as fh:
    raw_1980 = fh.readlines()

# Parse each cohort into a cleaned DataFrame.
d1978 = cleanData(raw_1978)
d1980 = cleanData(raw_1980)

# Baseline Classifier

In [3]:
from __future__ import division

# Constant-prediction baselines on the 1978 cohort: predicting the same label
# for everyone scores exactly that label's share of the rows. The mean of a
# boolean mask is that share, and it does not depend on true-division being
# switched on.
score_baseline_1 = float((d1978.RECID == 1).mean())
score_baseline_0 = float((d1978.RECID == 0).mean())
# Single-argument print calls behave the same on Python 2 and Python 3.
print("baseline classifier everyone to 0 %s" % (score_baseline_0,))
print("baseline classifier everyone to 1 %s" % (score_baseline_1,))

baseline classifier everyone to 0 0.627327847553
baseline classifier everyone to 1 0.372672152447


# Random forest

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

# Hold out 10% of the 1978 cohort for testing. The split is seeded so that
# re-running the notebook reproduces the same partition (and hence comparable
# scores across runs).
d78train, d78test = train_test_split(d1978, test_size=0.1, random_state=0)

# Separate the RECID target from the remaining feature columns.
X78train = d78train.drop('RECID', axis=1).values
y78train = d78train.RECID.values
X78test = d78test.drop('RECID', axis=1).values
y78test = d78test.RECID.values

In [5]:
def score_random_forest(Xtrain, ytrain, Xtest, ytest, n_estimators=10, criterion='gini',
                        max_features='auto', random_state=None):
    """Fit a random forest and report its score on the train and test sets.

    Parameters
    ----------
    Xtrain, ytrain : training features and labels, passed to ``fit``.
    Xtest, ytest : held-out features and labels, used only for scoring.
    n_estimators, criterion, max_features :
        Forwarded unchanged to ``RandomForestClassifier``.
    random_state : optional
        Forwarded to ``RandomForestClassifier``; pass a fixed value to make
        the fitted forest (and therefore both scores) reproducible. The
        default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(score_train, score_test)`` as returned by ``clf.score``.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion,
                                 max_features=max_features, random_state=random_state)
    clf.fit(Xtrain, ytrain)
    return clf.score(Xtrain, ytrain), clf.score(Xtest, ytest)

In [6]:
# Reference point: a forest fitted with the helper's default settings.
# a is the training score, b the held-out test score.
a, b = score_random_forest(X78train, y78train, X78test, y78test)
# Single-argument print calls behave the same on Python 2 and Python 3.
print("Train score with default parameters: %s" % (a,))
print("Test score with default parameters: %s" % (b,))

Train score with default parameters: 0.976660250241
Test score with default parameters: 0.627705627706


In [7]:
# Hyper-parameter grid for the random-forest search.
nb_features = np.arange(1, 11)    # candidate max_features values: 1..10
nb_trees = np.arange(1, 100)      # candidate forest sizes: 1..99
criterions = ['gini', 'entropy']  # candidate split-quality criteria

In [8]:
def best_parameters(Xtrain, ytrain, Xtest, ytest, criterions, nb_trees, nb_features):
    """Score a random forest for every combination of the given settings.

    Despite its name, this function does not pick a winner: it returns the
    full table of results and leaves the selection to the caller.

    Parameters
    ----------
    Xtrain, ytrain, Xtest, ytest :
        Forwarded unchanged to ``score_random_forest``.
    criterions : iterable
        Values tried for the ``criterion`` argument.
    nb_trees : iterable
        Values tried for the ``n_estimators`` argument.
    nb_features : iterable
        Values tried for the ``max_features`` argument.

    Returns
    -------
    pandas.DataFrame
        Columns ``loss``, ``nb_trees``, ``nb_features`` and ``test_score``,
        one row per combination, indexed 0..N-1 in iteration order
        (criterion outermost, max_features innermost).
    """
    # Collect plain records and build the frame once at the end; growing a
    # DataFrame row by row re-allocates on every insert and yields object
    # dtypes for the numeric columns.
    records = []
    for loss in criterions:
        for n_estimators in nb_trees:
            for max_features in nb_features:
                # Only the test score is recorded; the train score is unused.
                _, score_test = score_random_forest(
                    Xtrain, ytrain, Xtest, ytest,
                    n_estimators=n_estimators, criterion=loss, max_features=max_features)
                records.append((loss, n_estimators, max_features, score_test))

    return pd.DataFrame(records, columns=['loss', 'nb_trees', 'nb_features', 'test_score'])

In [10]:
%%time
# Exhaustive sweep over every (criterion, n_trees, max_features) combination
# of the grid; one forest is fitted per combination, so this cell is slow.
scores = best_parameters(X78train, y78train, X78test, y78test, criterions, nb_trees, nb_features)
# Single-argument print call: same output on Python 2, valid on Python 3.
print(scores.head(5))

   loss  nb_trees  nb_features  test_score
0  gini         1            1    0.625541
1  gini         1            2    0.606061
2  gini         1            3    0.586580
3  gini         1            4    0.569264
4  gini         1            5    0.582251
Wall time: 17min 32s


In [19]:
# Persist the grid-search results under ./data/ so the sweep does not have to
# be repeated; the row index is not written to the file.
file_path = "./data/random_forest_scores.csv"
scores.to_csv(file_path, index=False)

In [None]:
# recover scores from the ./data/ folder instead of re-running the grid search
#file_path = "./data/random_forest_scores.csv"
#scores = pd.read_csv(file_path)
#scores.head(5)

In [20]:
# Shortlist every configuration whose test score exceeds 98.5% of the best
# test score in the table, then display the shortlist.
max_score = np.max(scores.test_score.values)
winner = scores.loc[scores.test_score > max_score * 0.985]
winner

Unnamed: 0,loss,nb_trees,nb_features,test_score
421,gini,43,2,0.690476
873,gini,88,4,0.681818
1473,entropy,49,4,0.681818
1651,entropy,67,2,0.683983
1903,entropy,92,4,0.681818
1941,entropy,96,2,0.681818
