## Jess's DIGBlood IPython notebook

In [219]:
# Import stuff
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import scipy
from scipy.stats import pearsonr
from __future__ import division
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess

In [220]:
import os

# The project root is taken to be the parent of the current working directory.
bloodPath = os.path.abspath(os.path.join(os.getcwd(), '..'))
trainPath = os.path.join(bloodPath, 'data', 'raw', 'blood_train.csv')
# NOTE(review): testPath points at the *training* file, so `dt` is a second
# copy of the training data. Later cells score `dt`-based estimates against
# the training labels, so this may be intentional -- confirm before changing.
testPath = os.path.join(bloodPath, 'data', 'raw', 'blood_train.csv')


def _loadBloodCsv(csvPath):
    """Read one CSV and replace spaces in its column names with underscores.

    NOTE: ``DataFrame.from_csv`` is deprecated in newer pandas; the documented
    replacement is ``pd.read_csv(csvPath, index_col=0, parse_dates=True)``.
    """
    frame = pd.DataFrame.from_csv(csvPath)
    frame.columns = [name.replace(' ', '_') for name in frame.columns]
    return frame


df = _loadBloodCsv(trainPath)
dt = _loadBloodCsv(testPath)

In [221]:
# Preview the first five rows of the training frame
df.head(5)

Unnamed: 0,Months_since_Last_Donation,Number_of_Donations,Total_Volume_Donated_(c.c.),Months_since_First_Donation,Made_Donation_in_March_2007
619,2,50,12500,98,1
664,0,13,3250,28,1
441,1,16,4000,35,1
160,2,20,5000,45,1
358,1,24,6000,77,0


In [222]:
# Preview the first five rows of the frame loaded from testPath
dt.head(5)

Unnamed: 0,Months_since_Last_Donation,Number_of_Donations,Total_Volume_Donated_(c.c.),Months_since_First_Donation,Made_Donation_in_March_2007
619,2,50,12500,98,1
664,0,13,3250,28,1
441,1,16,4000,35,1
160,2,20,5000,45,1
358,1,24,6000,77,0


In [223]:
# Redundancy check: Pearson correlation (and p-value) between the number of
# donations and the total volume donated. A coefficient of 1.0 means the two
# columns are perfectly linearly related, i.e. they carry the same information.
pearsonr(df.loc[:, 'Number_of_Donations'],
         df.loc[:, 'Total_Volume_Donated_(c.c.)'])

(1.0, 0.0)

In [224]:
# MLDCount = number of training rows that share this row's
# Months_since_Last_Donation value (non-null labels counted per group).
df['MLDCount'] = (
    df.groupby('Months_since_Last_Donation')['Made_Donation_in_March_2007']
      .transform('count')
)
# Mean of the March-2007 label for each MLDCount value.
df.groupby('MLDCount')[['Made_Donation_in_March_2007']].mean()

Unnamed: 0_level_0,Made_Donation_in_March_2007
MLDCount,Unnamed: 1_level_1
1,0.4
2,0.25
3,0.0
4,0.333333
5,0.133333
9,0.222222
10,0.3
17,0.176471
36,0.027778
37,0.081081


In [225]:
predLabels = ['Months_since_Last_Donation','Months_since_First_Donation','Number_of_Donations']


def getCrossTabExact(predLabels):
    """Exact-match cross-tab estimate for every row of the global ``dt``.

    The global training frame ``df`` is grouped by ``predLabels``. Each row of
    ``dt`` is then left-joined to the group that has identical values in those
    columns.

    Parameters
    ----------
    predLabels : list of str
        Column names (present in both ``df`` and ``dt``) to match on.

    Returns
    -------
    pandas.DataFrame
        Indexed by the original index of ``dt`` (index name ``'Index'``), in
        the row order of ``dt``, with two columns:

        * ``Estimate`` -- mean of ``Made_Donation_in_March_2007`` over the
          matching ``df`` rows.
        * ``Weight`` -- number of matching ``df`` rows.

        Rows of ``dt`` with no matching group in ``df`` get NaN in both.
    """
    target = 'Made_Donation_in_March_2007'

    # Work on a copy: the helper 'Index' column is only needed to restore the
    # original index after the merge and must not leak into the shared `dt`.
    dtP = dt.copy()
    dtP['Index'] = dtP.index

    grouped = df.groupby(predLabels)
    dfP = grouped.mean().reset_index().rename(columns={target: 'Estimate'})
    # mean() and count() come from the same groupby, so rows line up by position.
    dfP['Weight'] = grouped[target].count().values

    dtP = pd.merge(dtP, dfP, how='left', on=predLabels).set_index('Index')
    return dtP[['Estimate', 'Weight']]

# Exact-match estimates for every combination of the three predictors:
# the full triple, each pair, and each predictor on its own.
(dtTri,
 dtPair1,
 dtPair2,
 dtPair3,
 dtMSLD,
 dtMSFD,
 dtND) = (
    getCrossTabExact(labels) for labels in (
        ['Months_since_Last_Donation', 'Months_since_First_Donation', 'Number_of_Donations'],
        ['Months_since_Last_Donation', 'Months_since_First_Donation'],
        ['Months_since_Last_Donation', 'Number_of_Donations'],
        ['Months_since_First_Donation', 'Number_of_Donations'],
        ['Months_since_Last_Donation'],
        ['Months_since_First_Donation'],
        ['Number_of_Donations'],
    )
)

In [227]:
def meanRevertBins(predictorLabel,actualLabel,meanRevCount):
    """Per-bin mean of ``actualLabel``, shrunk toward the average bin mean.

    The global ``df`` is grouped by ``predictorLabel``. For each bin the result
    is ``(mean * count + prior * meanRevCount) / (count + meanRevCount)``,
    where ``prior`` is ``np.mean`` of the per-bin means (each bin counts once,
    regardless of its size).

    Parameters
    ----------
    predictorLabel : str
        Column of ``df`` whose distinct values define the bins.
    actualLabel : str
        Column of ``df`` that is averaged within each bin.
    meanRevCount : number
        Pseudo-count given to the prior; larger values shrink harder.

    Returns
    -------
    (x, y)
        ``x`` is the index of bin values; ``y`` is the ``.values`` array of the
        shrunk one-column frame (one row per bin).
    """
    binned = df[[predictorLabel, actualLabel]].groupby([predictorLabel])
    binMean = binned.mean()
    binCount = binned.count()
    prior = np.mean(binMean)

    shrunk = (binMean * binCount + prior * meanRevCount) / (binCount + meanRevCount)
    return shrunk.index, shrunk.values

In [228]:
# From http://stackoverflow.com/questions/18517722/weighted-moving-average-in-python 
def uniPredict(x,y,testx,fillval=0):
    """Look up a prediction for each value in ``testx`` from the table (x, y).

    Parameters
    ----------
    x : array-like supporting ``in`` and elementwise ``==``
        Known predictor values (e.g. a numpy array or pandas Index).
    y : numpy.ndarray
        Predictions aligned with ``x`` along the first axis.
    testx : iterable
        Predictor values to look up.
    fillval : number, optional
        Value used when a test value does not occur in ``x`` (default 0).

    Returns
    -------
    numpy.ndarray
        Flat array holding, for each test value in order, the entries of ``y``
        where ``x`` equals it, or ``fillval`` if there is none. Empty
        ``testx`` gives an empty float array.
    """
    # Collect the pieces and concatenate once; calling np.append inside the
    # loop re-copies the whole result on every iteration. The empty float seed
    # keeps the result dtype the same as when appending to np.array([]).
    pieces = [np.array([])]
    for test in testx:
        if test in x:
            pieces.append(np.ravel(y[test == x]))
        else:
            pieces.append(np.ravel(fillval))
    return np.concatenate(pieces)


In [229]:
actualLabel = 'Made_Donation_in_March_2007'
meanRevCount = 3

# Fallback for predictor values that have no bin: the overall mean of the label.
fillval = np.mean(df[actualLabel])

# One mean-reverted, single-predictor model per column, each evaluated on the
# training rows themselves. The loop variables (predictorLabel, x, y, testx)
# are deliberately left at module level, holding the values for the last
# predictor once the loop finishes.
univariatePreds = []
for predictorLabel in ['Months_since_Last_Donation',
                       'Months_since_First_Donation',
                       'Number_of_Donations',
                       'MLDCount']:
    x,y = meanRevertBins(predictorLabel,actualLabel,meanRevCount)
    testx = df[predictorLabel]
    univariatePreds.append(uniPredict(x,y,testx,fillval=fillval))

pred1, pred2, pred3, pred4 = univariatePreds

# Weighted blend: Months_since_First_Donation gets weight 3, the others 1
# (weights sum to 6).
multipred = (pred1+pred2*3+pred3+pred4)/6

In [235]:
# Training evaluation: log-loss of each single-predictor model, then of the
# weighted blend, all scored against the training labels.
# NOTE(review): the saved output of this cell shows six scores but the code
# prints five -- the output looks stale; re-run the notebook top to bottom.
from sklearn.metrics import log_loss
actual = df[actualLabel]
for trainPred in (pred1, pred2, pred3, pred4, multipred):
    print('Training log-loss score ' + str(log_loss(actual, trainPred)))

Training log-loss score 0.477751103919
Training log-loss score 0.48873511625
Training log-loss score 0.484721168487
Training log-loss score 0.507067396522
Training log-loss score 0.494623769427
Training log-loss score 0.476482877656


In [236]:
# Log-loss of the three-predictor exact-match estimate against the training
# labels. This lines up row-for-row only because `dt` was loaded from the
# training file.
print('Training log-loss score ' + str(log_loss(actual, dtTri['Estimate'].values)))

Training log-loss score 0.12445527956
