# Random Acts of Pizza Baseline

### Divyang Prateek and Cory Kind

###### Importing and structuring data

Start by importing relevant libraries for storing and analyzing data.

In [24]:
import pandas as pd
import json as js
import random
import numpy as np
import datetime
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import *
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

Read in JSON file of training data.

In [2]:
# Load the training data: read train.json as one string, then parse it.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until the object is garbage collected.
with open("train.json") as train_file:
    data2 = train_file.read()
#Converts JSON string to a List of Dictionaries
jsondata2 = js.loads(data2)

The RAOP data contains a variety of predictors of different formats. This step puts variables into separate categories for text and numeric, and creates an array for the outcome we are trying to predict ("requester_received_pizza"). We decided it was easier to work with text and numeric variables separately at this stage.


NOTE that the following variables are not currently imported because they require extra processing. They will be addressed at a later point, but are not required for the baseline.

1) requester_subreddits_at_request (returns an array)

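2) unix_timestamp_of_request (date format). Note that the UTC variant, unix_timestamp_of_request_utc, is imported below as a numeric variable.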

In [3]:
#numeric variables
numeric_variables = ['number_of_downvotes_of_request_at_retrieval',
    'number_of_upvotes_of_request_at_retrieval',
    'post_was_edited',
    'request_number_of_comments_at_retrieval',
    'requester_account_age_in_days_at_request',
    'requester_account_age_in_days_at_request',
    'requester_account_age_in_days_at_retrieval',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_days_since_first_post_on_raop_at_retrieval',
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_at_retrieval',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_comments_in_raop_at_retrieval',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_at_retrieval',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_posts_on_raop_at_retrieval',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_minus_downvotes_at_retrieval',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_retrieval',
    'unix_timestamp_of_request_utc']

#text variables
text_variables = ['giver_username_if_known',
    'request_id',
    'request_text',
    'request_text_edit_aware',
    'request_title',
    'requester_user_flair',
    'requester_username']

#Creating empty data frames to store the training data
numeric_elements = pd.DataFrame(np.nan, index = range(len(jsondata2)), columns = numeric_variables)
text_elements = pd.DataFrame(np.nan, index = range(len(jsondata2)), columns = text_variables)
outcome = pd.DataFrame(np.nan, index = range(len(jsondata2)), columns = ['requester_received_pizza'])

#Print the number of text and numeric predictors currently included
print "Number of numeric variables: ", len(numeric_elements.columns)
print "Number of text variables: ", len(text_elements.columns)

Number of numeric variables:  23
Number of text variables:  7


The next step is to fill these arrays from the JSON data. Although the loop approach is less efficient at large scale, we went this direction because the number of keys varies between cases in the data.

In [4]:
# Copy every recognised field of each JSON record into the matching frame.
# Records are visited one at a time and only the keys actually present in a
# record are written; anything absent keeps its NaN placeholder.
for row, record in enumerate(jsondata2):
    for field, value in record.items():
        if field in numeric_variables:
            # Column position follows the field's position in the name list.
            numeric_elements.iloc[row, numeric_variables.index(field)] = value
        if field in text_variables:
            text_elements.iloc[row, text_variables.index(field)] = value
        if field == 'requester_received_pizza':
            outcome.iloc[row, 0] = value


This is a quick check on the size of these arrays - the number of columns should match the number of text and numeric predictors determined above.

In [5]:
#Output shapes of numeric, text, and outcome arrays
print "Numeric array:"
print numeric_elements.shape
print 

print "Text array:"
print text_elements.shape
print

print "Outcome array:"
print outcome.shape
print

Numeric array:
(4040, 23)

Text array:
(4040, 7)

Outcome array:
(4040, 1)



Here we split out a dev set from the provided training data (80/20). There is no need to separate out a test set, since that is provided by Kaggle in a separate JSON file. To compare our results to other competitors in the Kaggle competition, we will need to use that test set.

In [6]:
random.seed(500)
data_size = len(jsondata2)
dev_indices = random.sample(range(data_size), data_size / 5)
train_indices = list(set(range(data_size)) - set(dev_indices))

print "Number of training cases: ", len(train_indices)
print "Number of dev cases: ", len(dev_indices)

Number of training cases:  3232
Number of dev cases:  808


In [8]:
#Pull out the request text and outcomes for training and dev sets.
# .loc replaces the deprecated .ix indexer. The frames were built with a
# 0..n-1 integer index, so label-based selection picks the same rows.
train_request_text = text_elements.loc[train_indices, "request_text"]
dev_request_text = text_elements.loc[dev_indices, "request_text"]

# Select the single outcome column directly (rather than summing across a
# one-column frame) and cast it to integers for use as class labels.
train_outcome = outcome.loc[train_indices, "requester_received_pizza"].astype(int)
dev_outcome = outcome.loc[dev_indices, "requester_received_pizza"].astype(int)

In [None]:
#Extracting the request timestamps and converting them to datetime objects.
# - .loc replaces the deprecated .ix indexer (same rows: the index is 0..n-1).
# - np.int64 replaces the Python-2-only `long` type.
# - utcfromtimestamp is used because the column holds the UTC timestamp;
#   plain fromtimestamp converts to the local time zone of whichever machine
#   runs the notebook, so the month/day features derived from these values
#   would differ from machine to machine.
train_request_time = numeric_elements.loc[train_indices, "unix_timestamp_of_request_utc"].astype(np.int64)
train_request_dateTime = [datetime.datetime.utcfromtimestamp(stamp) for stamp in train_request_time]
dev_request_time = numeric_elements.loc[dev_indices, "unix_timestamp_of_request_utc"].astype(np.int64)
dev_request_dateTime = [datetime.datetime.utcfromtimestamp(stamp) for stamp in dev_request_time]

In [19]:
train_month = np.asarray([time.month for time in train_request_dateTime]).reshape((len(train_request_time),1))
train_month_label = np.asarray(train_outcome)

dev_month = np.asarray([time.month for time in dev_request_dateTime]).reshape((len(dev_request_time),1))
dev_month_label = np.asarray(dev_outcome)

knn_clf  = KNeighborsClassifier()
knn_clf = knn_clf.fit(train_month,train_month_label)
print 'KNN Score :',knn_clf.score(dev_month,dev_month_label)

lr_clf = LogisticRegression(C=1)
lr_clf = lr_clf.fit(train_month,train_month_label)
print 'Logistic Regression Score :',lr_clf.score(dev_month,dev_month_label)

rf_clf = Ran

KNN Score : 0.706683168317
Logistic Regression Score : 0.738861386139


In [21]:
train_holiday_season = np.asarray([time.month >= 10 for time in train_request_dateTime]).reshape((len(train_request_time),1))
train_holiday_label = np.asarray(train_outcome)

dev_holiday_season = np.asarray([time.month >= 10 for time in dev_request_dateTime]).reshape((len(dev_request_time),1))
dev_holiday_label = np.asarray(dev_outcome)

knn_clf  = KNeighborsClassifier()
knn_clf = knn_clf.fit(train_holiday_season,train_holiday_label)
print 'KNN Score :',knn_clf.score(dev_holiday_season,dev_holiday_label)

lr_clf = LogisticRegression(C=1)
lr_clf = lr_clf.fit(train_holiday_season,train_holiday_label)
print 'Logistic Regression Score :',lr_clf.score(dev_holiday_season,dev_holiday_label)



KNN Score : 0.738861386139
Logistic Regression Score : 0.738861386139


In [22]:
train_payday_effect = np.asarray([(time.day > 26 or time.day<2) for time in train_request_dateTime]).reshape((len(train_request_time),1))
train_payday_label = np.asarray(train_outcome)

dev_payday_effect = np.asarray([(time.day > 26 or time.day<2) for time in dev_request_dateTime]).reshape((len(dev_request_time),1))
dev_payday_label = np.asarray(dev_outcome)

knn_clf_pe  = KNeighborsClassifier()
knn_clf_pe = knn_clf_pe.fit(train_payday_effect,train_payday_label)
print 'KNN Score :',knn_clf.score(dev_payday_effect,dev_payday_label)

lr_clf = LogisticRegression(C=1)
lr_clf = lr_clf.fit(train_payday_effect,train_payday_label)
print 'Logistic Regression Score :',lr_clf.score(dev_payday_effect,dev_payday_label)
    

KNN Score : 0.738861386139
Logistic Regression Score : 0.738861386139
