In [139]:
# Get and process input data
import pandas as pd
import numpy as np

# Fixed-width record layout used by cleanData:
#   key   -> 1-based character position where the field starts
#   value -> (column name, field width in characters)
var = {
    1: ('WHITE', 1),
    2: ('ALCHY', 1),
    3: ('JUNKY', 1),
    4: ('SUPER', 1),
    5: ('MARRIED', 1),
    6: ('FELON', 1),
    7: ('WORKREL', 1),
    8: ('PROPTY', 1),
    9: ('PERSON', 1),
    10: ('MALE', 1),
    11: ('PRIORS', 2),
    13: ('SCHOOL', 2),
    15: ('RULE', 2),
    17: ('AGE', 3),
    20: ('TSERVD', 3),
    23: ('FOLLOW', 2),
    25: ('RECID', 1),
    26: ('TIME', 2),
    28: ('FILE', 1),
}

def cleanData(data):
    """Parse fixed-width text records into a DataFrame of integer fields.

    Each record is sliced according to the module-level ``var`` layout, which
    maps a 1-based start position to ``(column name, width)``; every slice is
    converted with ``int``.

    Parameters
    ----------
    data : iterable of str
        Raw record lines. Surrounding whitespace is stripped from each line
        and lines that are empty after stripping are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per record whose FILE value is not 3, keeping the original
        row index, with the TIME, FILE and FOLLOW columns removed.

    Raises
    ------
    ValueError
        If a field slice cannot be converted to ``int``.
    """
    # Order the layout by start position so that the column names are
    # guaranteed to line up with the order in which values are extracted,
    # regardless of the dict's iteration order.
    layout = sorted(var.items())
    cols = [spec[0] for _, spec in layout]
    # Convert the 1-based start positions into 0-based [lo, hi) slice bounds.
    spans = [(start - 1, start - 1 + spec[1]) for start, spec in layout]

    res = []
    for line in data:
        line = line.strip()
        if not line:
            # A blank line carries no record; keeping it would add an
            # all-missing row that survives the FILE filter below.
            continue
        # Only fields that start inside the line are extracted.
        res.append([int(line[lo:hi]) for lo, hi in spans if lo < len(line)])

    ret = pd.DataFrame(data=res, columns=cols)
    ret = ret[ret.FILE != 3] # Remove incomplete data points

    # Remove some irrelevant columns
    return ret.drop(['TIME', 'FILE', 'FOLLOW'], axis=1)
    

def _readRawLines(path):
    """Return every line of the file at ``path`` (opened in binary mode).

    The file handle is closed before returning, including on error.
    """
    with open(path, 'rb') as handle:
        return handle.readlines()


raw_1978 = _readRawLines('data/1978.txt')
raw_1980 = _readRawLines('data/1980.txt')


d1978 = cleanData(raw_1978)
d1980 = cleanData(raw_1980)

# Quick sanity check: first rows and number of usable records.
print(d1978.head())
print(len(d1978))


    WHITE  ALCHY  JUNKY  SUPER  MARRIED  FELON  WORKREL  PROPTY  PERSON  MALE  \
2       1      1      0      1        1      0        1       0       0     1   
3       1      0      0      1        1      0        0       0       0     1   
6       1      0      0      1        0      0        1       0       0     1   
10      1      0      0      0        0      0        1       0       0     0   
11      0      0      0      0        0      1        0       0       0     1   

    PRIORS  SCHOOL  RULE  AGE  TSERVD  RECID  
2        0       7     2  441      30      0  
3        0      11     0  303       4      0  
6        1       9     1  276      43      1  
10       0      14     0  329       9      0  
11       0      10     0  277       8      0  
4618


In [None]:
# Neural network for CS 281
# Reference: http://pybrain.org/docs/tutorial

import pybrain
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure import SoftmaxLayer, TanhLayer
from pybrain.datasets import ClassificationDataSet
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.utilities import percentError

# Seed NumPy's global generator so the shuffle (and therefore the
# train/test split) is the same on every run of this cell.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Load the dataframe data into a float array; rows are shuffled in place.
np_d1978 = (d1978.values).astype(float)
np.random.shuffle(np_d1978)
cutoff = int(np_d1978.shape[0] * 0.3) # < cutoff is the test set, otherwise training set

# Optional: change to -1, 1 classification
# np_d1978[ (np_d1978[:, -1] == 0), -1 ] = -1

print(np_d1978[:5])

# Normalize every column except the last one (the label).
# The mean and standard deviation are computed on the training rows only and
# then applied to both the training and the test rows.
features = np_d1978[:, :-1]  # a view: in-place updates modify np_d1978
train_mean = features[cutoff:].mean(axis=0)
train_sd = features[cutoff:].std(axis=0)
# A column that is constant over the training rows has zero spread; dividing
# by it would produce NaN/inf, so such columns are only centred.
train_sd[train_sd == 0] = 1.0
features -= train_mean
features /= train_sd

# Build the pybrain datasets: every column but the last is an input feature,
# the last column is the class label.
n_inputs = np_d1978.shape[1] - 1
testD = ClassificationDataSet(n_inputs, nb_classes=2, class_labels=['No', 'Yes'])
trainD = ClassificationDataSet(n_inputs, nb_classes=2, class_labels=['No', 'Yes'])

# Rows before `cutoff` go to the test set, the remaining rows to the training set.
for row_number, sample in enumerate(np_d1978):
    destination = testD if row_number < cutoff else trainD
    destination.appendLinked(sample[:-1], sample[-1])

# Re-encode the targets with one output per class.
trainD._convertToOneOfMany(bounds=[0,1])
testD._convertToOneOfMany(bounds=[0,1])

print(np_d1978[:5])
print(trainD['target'])

print(trainD.calculateStatistics())
print(testD.calculateStatistics())


def _safeRatio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN if the denominator is 0."""
    if denominator == 0:
        return float('nan')
    return float(numerator) / float(denominator)


# One (best test error, hidden size, best round index) tuple per hidden size.
resErr = []

# NOTE(review): with a step of 100 this range yields only hid = 7. Use a step
# of 1 to sweep 7..16, at roughly ten times the training cost.
for hid in range(7, 17, 100):
    net = buildNetwork(trainD.indim, hid, trainD.outdim, bias=True)
    t = BackpropTrainer(net, dataset=trainD)

    bestEpoch = None
    bestErr = 101.  # sentinel larger than any percentage error

    # 40 rounds of 50 training epochs each, evaluating after every round.
    for ep in range(40):
        t.trainEpochs(50)
        testRes = t.testOnClassData(dataset=testD)
        trnresult = percentError( t.testOnClassData(),
                                  trainD['class'] )
        tstresult = percentError( testRes, testD['class'] )

        # Track the round (index into this loop) with the lowest test error.
        if tstresult < bestErr:
            bestErr = tstresult
            bestEpoch = ep

        # Per-class accuracy on the test set: label 0 is one class, every
        # other label is counted as class 1.
        predicted = np.asarray(testRes)
        actual = np.asarray(testD['class']).ravel()
        is_class0 = (actual == 0)
        N0 = int(is_class0.sum())
        N1 = len(actual) - N0
        c_0 = int((predicted[is_class0] == 0).sum())
        c_1 = int((predicted[~is_class0] == 1).sum())

        # Report the hidden size being evaluated (the round is reported via
        # the total epoch count below).
        print('Hidden units: %s' % hid)
        print('Prop. of 1s: %s' % _safeRatio(sum(testRes), len(testRes)))
        print('%% of class 0 correct: %s' % _safeRatio(c_0, N0))
        print('%% of class 1 correct: %s' % _safeRatio(c_1, N1))

        print("epoch: %4d   train acc: %5.2f%%   test acc: %5.2f%%"
              % (t.totalepochs, 100-trnresult, 100-tstresult))

    resErr.append( (bestErr, hid, bestEpoch) )

print(sorted(resErr))
