# Statistics on data

#### Data acquisition

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Get and process input data

var = dict([ (1, ('WHITE',1)),(2, ('ALCHY',1)),(3, ('JUNKY',1)),(4, ('SUPER',1)),
                (5, ('MARRIED',1)),(6, ('FELON',1)),(7, ('WORKREL',1)),(8, ('PROPTY',1)),
                (9, ('PERSON',1)),(10, ('MALE',1)),(11, ('PRIORS',2)),(13, ('SCHOOL',2)),
                (15, ('RULE',2)),(17, ('AGE',3)),(20, ('TSERVD',3)),
                (23, ('FOLLOW',2)),(25, ('RECID',1)),(26, ('TIME',2)),(28, ('FILE',1)) ] )

def cleanData(data):
    res = []
    cols = [x[1][0] for x in var.items()] # Get the column names
    for line in data:
        line = line.strip()
        
        curLine = []
        for i in xrange(len(line)):
            if i+1 not in var:
                continue
            name, sz = var[i+1]            
            curLine.append(int(line[i:i+sz]))
        
        res.append(curLine)
    
    ret = pd.DataFrame(data=res, columns=cols)
    ret = ret[ret.FILE != 3] # Remove incomplete data points
    
    # Remove some irrelevant columns
    del ret['TIME']
    del ret['FILE']
    del ret['FOLLOW']
    return ret
    

raw_1978 = open('data/1978.txt','rb').readlines()
raw_1980 = open('data/1980.txt','rb').readlines()

d1978 = cleanData(raw_1978)
d1980 = cleanData(raw_1980)

### Statistics on data

#### Variables pooled

In [3]:
d1978.head(5)

Unnamed: 0,WHITE,ALCHY,JUNKY,SUPER,MARRIED,FELON,WORKREL,PROPTY,PERSON,MALE,PRIORS,SCHOOL,RULE,AGE,TSERVD,RECID
2,1,1,0,1,1,0,1,0,0,1,0,7,2,441,30,0
3,1,0,0,1,1,0,0,0,0,1,0,11,0,303,4,0
6,1,0,0,1,0,0,1,0,0,1,1,9,1,276,43,1
10,1,0,0,0,0,0,1,0,0,0,0,14,0,329,9,0
11,0,0,0,0,0,1,0,0,0,1,0,10,0,277,8,0


In [4]:
def partition_age(data, age_range):
    for i in xrange(len(data)):
        data.iloc[i].AGE /= age_range
    return

def partition_time_served(data, time_served_range):
    for i in xrange(len(data)):
        data.iloc[i].TSERVD /= time_served_range  
    return

def partition_school(data, school_range):
    for i in xrange(len(data)):
        data.iloc[i].SCHOOL /= time_served_range  
    return

In [5]:
age_range = 36
time_served_range = 5
school_range = 4
d1978 = cleanData(raw_1978)

In [6]:
partition_age(d1978, age_range)
partition_time_served(d1978, time_served_range)
partition_school(d1978, school_range)
print d1978.head(5)

    WHITE  ALCHY  JUNKY  SUPER  MARRIED  FELON  WORKREL  PROPTY  PERSON  MALE  \
2       1      1      0      1        1      0        1       0       0     1   
3       1      0      0      1        1      0        0       0       0     1   
6       1      0      0      1        0      0        1       0       0     1   
10      1      0      0      0        0      0        1       0       0     0   
11      0      0      0      0        0      1        0       0       0     1   

    PRIORS  SCHOOL  RULE  AGE  TSERVD  RECID  
2        0       1     2   44       7      0  
3        0       2     0   30       1      0  
6        1       2     1   27      10      1  
10       0       3     0   32       2      0  
11       0       2     0   27       2      0  


#### find unqiue elements in array of arrays

In [8]:
from random import randint

def foo():
    res = []
    for x in xrange(4):
        res.append(randint(1,4))
    return res

def bar():
    res = []
    for x in xrange(2000):
        res.append(foo())
    return res

In [None]:
u = bar()
v = find_unique(u)
print "before find_unique, nb of elements: ", len(u)
print "after find_unique, nb of elements: ", len(v)

#### similarity

In [43]:
def find_unique(array):
    res = []
    for ar in array:
        check = True
        for br in res:
            check = check & (~(np.array(ar == br).all()))
        if check:
            res.append(ar)
    return res

In [54]:
def similarity(data, age_range, time_served_range):
    partition_age(data, age_range)
    partition_time_served(data, time_served_range)
    partition_school(data, school_range)
    
    tab = data.as_matrix()
    sim_data = find_unique(tab)
    n_data = len(tab)
    nb_sim = len(tab) - len(sim_data)
    
    print "number of data points: ", n_data
    print "number of similar data points: ", nb_sim
    print "portion of similar data points: ", (float(nb_sim) / n_data * 100), '%'
    
    return

In [65]:
%%time
age_range = 60
time_served_range = 5
school_range = 4
d1978 = cleanData(raw_1978)
similarity(d1978.drop('RECID', axis = 1), age_range, time_served_range)

number of data points:  4618
number of similar data points:  734
portion of similar data points:  15.8943265483 %
Wall time: 1min 39s


In [66]:
%%time
age_range = 120
time_served_range = 5
school_range = 4
d1978 = cleanData(raw_1978)
similarity(d1978.drop('RECID', axis = 1), age_range, time_served_range)

number of data points:  4618
number of similar data points:  905
portion of similar data points:  19.5972282373 %
Wall time: 1min 43s
