# Random Acts of Pizza Baseline

### Divyang Prateek and Cory Kind

###### Importing and structuring data

Start by importing relevant libraries for storing and analyzing data.

In [108]:
import datetime
import json as js
import random

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import *
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

Read in JSON file of training data.

In [22]:
#Reads the json file as a String; the with-block closes the file handle
#as soon as the contents have been read instead of leaving it open.
with open("train.json") as train_file:
    data2 = train_file.read()
#Converts JSON string to a List of Dictionaries (one dictionary per case)
jsondata2 = js.loads(data2)

The RAOP data contains a variety of predictors of different formats. This step puts variables into separate categories for text and numeric, and creates an array for the outcome we are trying to predict ("requester_received_pizza"). We decided it was easier to work with text and numeric variables separately at this stage.


NOTE that the following variables are not currently imported because they require extra processing. They will be addressed at a later point, but are not required for the baseline.

1) requester_subreddits_at_request (returns an array)

2) unix timestamp of request (date format) - the raw UTC value is imported as a number below, but converting it into a date is deferred

In [31]:
#numeric variables
numeric_variables = ['number_of_downvotes_of_request_at_retrieval',
    'number_of_upvotes_of_request_at_retrieval',
    'post_was_edited',
    'request_number_of_comments_at_retrieval',
    'requester_account_age_in_days_at_request',
    'requester_account_age_in_days_at_request',
    'requester_account_age_in_days_at_retrieval',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_days_since_first_post_on_raop_at_retrieval',
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_at_retrieval',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_comments_in_raop_at_retrieval',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_at_retrieval',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_posts_on_raop_at_retrieval',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_minus_downvotes_at_retrieval',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_retrieval',
    'unix_timestamp_of_request_utc']

#text variables
text_variables = ['giver_username_if_known',
    'request_id',
    'request_text',
    'request_text_edit_aware',
    'request_title',
    'requester_user_flair',
    'requester_username']

#Creating empty data frames to store the training data
numeric_elements = pd.DataFrame(np.nan, index = range(len(jsondata2)), columns = numeric_variables)
text_elements = pd.DataFrame(np.nan, index = range(len(jsondata2)), columns = text_variables)
outcome = pd.DataFrame(np.nan, index = range(len(jsondata2)), columns = ['requester_received_pizza'])

#Print the number of text and numeric predictors currently included
print "Number of numeric variables: ", len(numeric_elements.columns)
print "Number of text variables: ", len(text_elements.columns)

Number of numeric variables:  23
Number of text variables:  7


The next step is to fill these arrays from the JSON data. Although the loop approach is less efficient at large scale, we went this direction because the number of keys varies between cases in the data.

In [32]:
#Walk every JSON record once and copy each recognised field into the frame
#that owns it; a field that is missing from a record is simply never assigned.
for row, record in enumerate(jsondata2):
    for field, value in record.items():
        if field == 'requester_received_pizza':
            outcome.iloc[row, 0] = value
        if field in numeric_variables:
            #column position mirrors the field's position in the name list
            numeric_elements.iloc[row, numeric_variables.index(field)] = value
        if field in text_variables:
            text_elements.iloc[row, text_variables.index(field)] = value


This is a quick check on the size of these arrays - the number of columns should match the number of text and numeric predictors determined above.

In [10]:
#Output shapes of numeric, text, and outcome arrays
print "Numeric array:"
print numeric_elements.shape
print 

print "Text array:"
print text_elements.shape
print

print "Outcome array:"
print outcome.shape
print

Numeric array:
(4040, 22)

Text array:
(4040, 7)

Outcome array:
(4040, 1)



Here we split out a dev set from the provided training data (80/20). There is no need to separate out a test set, since that is provided by Kaggle in a separate JSON file. To compare our results to other competitors in the Kaggle competition, we will need to use that test set.

In [26]:
random.seed(500)
data_size = len(jsondata2)
dev_indices = random.sample(range(data_size), data_size / 5)
train_indices = list(set(range(data_size)) - set(dev_indices))

print "Number of training cases: ", len(train_indices)
print "Number of dev cases: ", len(dev_indices)

Number of training cases:  3232
Number of dev cases:  808


###### Creating features for the baseline model

Now that the set-up is over, we can start using the text of the request to extract more interesting predictors. As a baseline, we're going to build a logistic regression model based on the word counts from the request text alone.

In [36]:
#Pull out the request text and outcomes for training and dev sets.
#Rows are selected by index label with .loc (the frames are indexed 0..n-1),
#replacing the deprecated .ix indexer.
train_request_text = text_elements.loc[train_indices, "request_text"]
dev_request_text = text_elements.loc[dev_indices, "request_text"]

#Select the single outcome column directly, giving a 1-D Series of 0/1 labels
train_outcome = outcome.loc[train_indices, "requester_received_pizza"].astype(int)
dev_outcome = outcome.loc[dev_indices, "requester_received_pizza"].astype(int)

In [28]:
#Create CountVectorizer object with no preprocessing, but include basic English stop words
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = "english")
train_data_features = vectorizer.fit_transform(train_request_text)
train_vocab = vectorizer.get_feature_names()

#Use train_vocab to extract the same features from the dev set
vectorizer_dev = CountVectorizer(analyzer = "word", tokenizer= None, preprocessor = None, stop_words = "english", vocabulary = train_vocab)
dev_data_features = vectorizer_dev.fit_transform(dev_request_text)

print "The length of the vocabulary using this basic model is: ", str(len(train_vocab))

The length of the vocabulary using this basic model is:  10919


###### Fitting a logistic regression model and printing confusion matrix and classification report

In [9]:
#Fit L2 Logistic Regression model
log_regression = LogisticRegression(penalty = "l2", C = 1)
log_regression.fit(train_data_features, train_outcome)
dev_predicted_labels = log_regression.predict(dev_data_features)

#Print confusion matrix and classification report
print "Confusion matrix on dev data for Logistic Regression model no processing, using only request_text: "
print metrics.confusion_matrix(dev_outcome, dev_predicted_labels, labels = [0,1])
print

print "Classification report: "
print metrics.classification_report(dev_outcome, dev_predicted_labels, labels = [0, 1])

Confusion matrix on dev data for Logistic Regression model no processing, using only request_text: 
[[537  60]
 [156  55]]

Classification report: 
             precision    recall  f1-score   support

          0       0.77      0.90      0.83       597
          1       0.48      0.26      0.34       211

avg / total       0.70      0.73      0.70       808



We are pretty happy with this as a first attempt. There is a lot of room for growth, but even with our very basic model we're seeing decent initial results.

###### Directions for future analysis

As a quick check of our initial model, we pulled out the 20 unigrams with the largest weights. This can also help us think through what patterns to look for in the data.

In [11]:
#Extracting 20 largest weights from the logistic regression model and printing
weights = log_regression.coef_
top_weights = np.argpartition(weights[0,], -19)[-20:]

#Printing out features
print "Unigram Features with Largest Weights"
for j in top_weights:
    print str(train_vocab[j])

Unigram Features with Largest Weights
rice
op
tight
expected
surprise
greatest
ones
sound
hurting
million
couch
park
sunday
reasons
pictures
lift
younger
checks
pockets
feelin


These words seem directionally correct based on our early reviews of the request_text variable and the kind of requests people make. "Rice" is often presented as an alternative to pizza (i.e., "I've been eating rice for a week - pizza would be a nice change"). A second note is that "op" is part of the Reddit lexicon. One hypothesis is that requesters who come across as insiders are more likely to get pizza. To test for that, we can include measures like # of subreddits and length of Reddit history. A couple of other ideas we have include:

- Time (seasonality, day of week, are people more likely to give at certain times of the month)
- Text (extracting predictors from request titles, including bigrams/trigrams, # of spelling errors, potentially sentiment analysis)
- Reddit behaviors (number of sub-reddits, length of time on Reddit, upvote/downvote differential)

In [112]:
train_request_time = numeric_elements.ix[train_indices,"unix_timestamp_of_request_utc"].astype(long)
train_request_dateTime = [datetime.datetime.fromtimestamp(time) for time in train_request_time]
train_holiday_season = np.asarray([time.month >= 10 for time in train_request_dateTime]).reshape((len(train_request_time),1))
train_holiday_label = np.asarray(train_outcome).reshape((len(train_outcome),1))

dev_request_time = numeric_elements.ix[dev_indices,"unix_timestamp_of_request_utc"].astype(long)
dev_request_dateTime = [datetime.datetime.fromtimestamp(time) for time in dev_request_time]
dev_holiday_season = np.asarray([time.month >= 10 for time in dev_request_dateTime]).reshape((len(dev_request_time),1))
dev_holiday_label = np.asarray(dev_outcome).reshape((len(dev_outcome),1))

knn_clf  = KNeighborsClassifier()
knn_clf = knn_clf.fit(train_holiday_season,train_holiday_label)
print knn_clf.score(dev_holiday_season,dev_holiday_label)


0.738861386139




In [129]:
train_payday_effect = np.asarray([(time.day > 26 or time.day<2) for time in train_request_dateTime]).reshape((len(train_request_time),1))
train_payday_label = np.asarray(train_outcome).reshape((len(train_outcome),1))

dev_payday_effect = np.asarray([(time.day > 26 or time.day<2) for time in dev_request_dateTime]).reshape((len(dev_request_time),1))
dev_payday_label = np.asarray(dev_outcome).reshape((len(dev_outcome),1))

print np.bincount(np.asarray([(time.day > 26 or time.day<2) for time in train_request_dateTime]))
knn_clf_pe  = KNeighborsClassifier()
knn_clf_pe = knn_clf_pe.fit(train_payday_effect,train_payday_label)
print knn_clf.score(dev_payday_effect,dev_payday_label)

[2587  645]
0.738861386139


