# Random Acts of Pizza

The objective of the project is to predict if the requester on Reddit will receive a Pizza as an act of altruism from one of the other Reddit users. 
The train and test data files are available in kaggle.com and have been downloaded at the below location before executing this notebook. 

In this notebook we are going to look at only the non-text field and build a classification model.

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime
import re
from sklearn.linear_model import LogisticRegression

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from scipy.optimize import minimize
from sklearn.metrics import log_loss

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

We are going to add year, month, day and dayofweek fields and drop all text fields as well user-specific fields like giver_username_if_known, requester_username etc.

In [2]:
data = pd.read_json("data/train.json")
data['year']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).strftime('%Y'), axis=1).astype(int)
data['month']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).strftime('%m'), axis=1).astype(int)
data['day']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).strftime('%d'), axis=1).astype(int)
data['dayofweek']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).weekday(), axis=1).astype(int)
data=data.drop(['unix_timestamp_of_request'], axis=1)
data=data.drop(['request_id'], axis=1)
data=data.drop(['unix_timestamp_of_request_utc'], axis=1)
data=data.drop(['request_text'], axis=1)
data=data.drop(['request_text_edit_aware'], axis=1)
data=data.drop(['request_title'], axis=1)
data=data.drop(['requester_subreddits_at_request'], axis=1)
data=data.drop(['requester_username'], axis=1)
data=data.drop(['giver_username_if_known'], axis=1)
data=data.drop(['requester_user_flair'], axis=1)

Let's cast the non-integer fields to integers. These fields represent days and having days as integers is sufficient for our analysis.

In [3]:
data.requester_account_age_in_days_at_request=data.requester_account_age_in_days_at_request.astype(int)
data.requester_account_age_in_days_at_retrieval=data.requester_account_age_in_days_at_retrieval.astype(int)
data.requester_days_since_first_post_on_raop_at_retrieval=data.requester_days_since_first_post_on_raop_at_retrieval.astype(int)
data.requester_days_since_first_post_on_raop_at_request=data.requester_days_since_first_post_on_raop_at_request.astype(int)

In [4]:
data.dtypes

number_of_downvotes_of_request_at_retrieval             int64
number_of_upvotes_of_request_at_retrieval               int64
post_was_edited                                         int64
request_number_of_comments_at_retrieval                 int64
requester_account_age_in_days_at_request                int64
requester_account_age_in_days_at_retrieval              int64
requester_days_since_first_post_on_raop_at_request      int64
requester_days_since_first_post_on_raop_at_retrieval    int64
requester_number_of_comments_at_request                 int64
requester_number_of_comments_at_retrieval               int64
requester_number_of_comments_in_raop_at_request         int64
requester_number_of_comments_in_raop_at_retrieval       int64
requester_number_of_posts_at_request                    int64
requester_number_of_posts_at_retrieval                  int64
requester_number_of_posts_on_raop_at_request            int64
requester_number_of_posts_on_raop_at_retrieval          int64
requeste

Let's make a 80:20 split for training and dev.

In [5]:
from sklearn.model_selection import train_test_split

train_data, dev_data = train_test_split(data, test_size=0.2)
train_labels = train_data["requester_received_pizza"]
dev_labels = dev_data["requester_received_pizza"]
train_data=train_data.drop(['requester_received_pizza'], axis=1)
dev_data=dev_data.drop(['requester_received_pizza'], axis=1)

In [6]:
train_data.head(10)

Unnamed: 0,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_number_of_comments_at_retrieval,requester_account_age_in_days_at_request,requester_account_age_in_days_at_retrieval,requester_days_since_first_post_on_raop_at_request,requester_days_since_first_post_on_raop_at_retrieval,requester_number_of_comments_at_request,requester_number_of_comments_at_retrieval,...,requester_number_of_posts_on_raop_at_retrieval,requester_number_of_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,year,month,day,dayofweek
1025,1,3,0,1,317,808,0,490,1,7,...,1,1,1,16,1,32,2012,8,2,3
2962,2,3,0,1,570,1084,210,725,126,131,...,1,31,641,647,1159,1183,2012,7,9,0
616,1,1,0,0,0,835,0,835,0,0,...,1,0,0,0,0,2,2011,8,24,2
845,2,3,0,3,75,614,0,538,58,720,...,1,11,92,2346,204,4742,2012,6,15,4
495,1,3,0,2,392,1265,38,911,259,333,...,2,52,2828,3270,8458,9232,2011,7,17,6
3872,3,15,0,1,506,650,0,144,214,252,...,2,60,1597,2233,2785,3841,2013,7,15,0
1382,3,4,0,1,724,1478,0,754,184,286,...,1,53,2601,3414,4843,6138,2011,11,12,5
3013,3,4,0,3,0,800,0,800,0,2,...,1,0,0,4,0,12,2011,9,27,1
825,1,1,0,0,218,1070,0,851,0,991,...,1,28,1486,16475,6144,33685,2011,8,7,6
722,5,4,0,0,25,176,0,151,1,7,...,1,1,2,4,14,46,2013,7,8,0


Let's fit a RandomForestClassifier this dataset.

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

y=train_labels
X=train_data
clf = RandomForestClassifier(random_state=0)
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

Let's test the accuracy of the model on the dev dataset.

In [9]:
X_test=dev_data
y_test=dev_labels
y_pred = clf.predict(X_test)
print('Accuracy of classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))
print('LogLoss : {score}'.format(score=log_loss(dev_labels, y_pred)))

Accuracy of classifier on test set: 0.83
LogLoss : 5.898971205001409


In [10]:
predictions=[]
predictions.append(clf.predict_proba(X_test))

Let's run a detailed classification report.

In [11]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

      False       0.83      0.96      0.89       583
       True       0.83      0.48      0.61       225

avg / total       0.83      0.83      0.81       808



In [90]:
data = pd.read_json("data/train.json")

train_data, dev_data = train_test_split(data, test_size=0.2)

train_labels = train_data["requester_received_pizza"]
train_data = train_data["request_text"]

dev_labels = dev_data["requester_received_pizza"]
dev_data = dev_data["request_text"]

### Text Classificaton

In [87]:
# Use basic pre-processing techniques
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

def no_preprocessor(s):
    return s

def better_preprocessor(s):
    message = s.lower()
    message = re.sub(r"[^a-zA-Z0-9]", " ", message)
    message = re.sub(r"[-]*", "", message)
    message = ' '.join([word[0:8] for word in message.split()])
    return message

def stemming_tokenizer(str_input):
    words = str_input.split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [88]:
cv = TfidfVectorizer(stop_words='english',
                     preprocessor=better_preprocessor,
                     lowercase=True,
                     tokenizer=stemming_tokenizer,
                     min_df=2, 
                     max_df=0.5, 
                     ngram_range=(1,3))
transformer = cv.fit_transform(train_data)
text_clf = LogisticRegression()
text_clf.fit(transformer,train_labels)
dev_data_trans = cv.transform(dev_data)
y_pred = text_clf.predict(dev_data_trans)
print ("Accuracy (on dev set): %.4f" % metrics.accuracy_score(y_true=dev_labels, y_pred=y_pred))
print (metrics.classification_report(y_true=dev_labels, y_pred=y_pred))
print('LogLoss {score}'.format(score=log_loss(dev_labels, y_pred)))

Accuracy (on dev set): 0.7611
             precision    recall  f1-score   support

      False       0.77      0.99      0.86       617
       True       0.38      0.02      0.03       191

avg / total       0.67      0.76      0.67       808

LogLoss 8.249984953223903


In [16]:
predictions.append(text_clf.predict_proba(dev_data_trans))

In [17]:
np.array(predictions).shape

(2, 808, 2)

In [76]:
dev_probabilities=[int(k) for k in dev_labels]

### Building an Ensemble model

In [85]:
def log_loss_func(weights):
    final_prediction = 0
    ''' scipy minimize will pass the weights as a numpy array '''
    for weight, prediction in zip(weights, predictions):
            final_prediction += weight*prediction

    return log_loss(dev_probabilities, final_prediction)
    #return metrics.recall_score(dev_probabilities, final_prediction)

In [84]:
starting_values = [0.5]*len(predictions)

cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
#our weights are bound between 0 and 1
bounds = [(0,1)]*len(predictions)

res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)

print('Ensemble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))

Ensemble Score: 0.5202057598201189
Best Weights: [0.06055213 0.93944787]


In [80]:
weights=res['x']
y_pred=[weights[0]*predictions[0][k][1]+weights[1]*predictions[1][k][1] for k in range(len(dev_data))]

In [56]:
y_pred=[True if k > 0.5 else False for k in y_pred]

In [57]:
print (metrics.classification_report(y_true=dev_labels, y_pred=y_pred))

             precision    recall  f1-score   support

      False       0.78      1.00      0.88       631
       True       0.67      0.01      0.02       177

avg / total       0.76      0.78      0.69       808

