# Random Acts of Pizza

The objective of the project is to predict if the requester on Reddit will receive a Pizza as an act of altruism from one of the other Reddit users. 
The train and test data files are available in kaggle.com and have been downloaded at the below location before executing this notebook. 

In this notebook we are going to look at only the non-text field and build a classification model.

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime
import re
from sklearn.linear_model import LogisticRegression

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from scipy.optimize import minimize
from sklearn.metrics import log_loss

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

We are going to add year, month, day and dayofweek fields and drop all text fields as well user-specific fields like giver_username_if_known, requester_username etc.

In [2]:
data = pd.read_json("data/train.json")
data['year']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).strftime('%Y'), axis=1).astype(int)
data['month']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).strftime('%m'), axis=1).astype(int)
data['day']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).strftime('%d'), axis=1).astype(int)
data['dayofweek']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).weekday(), axis=1).astype(int)
data=data.drop(['unix_timestamp_of_request'], axis=1)
data=data.drop(['request_id'], axis=1)
data=data.drop(['unix_timestamp_of_request_utc'], axis=1)
data=data.drop(['request_text_edit_aware'], axis=1)
data=data.drop(['request_title'], axis=1)
data=data.drop(['requester_subreddits_at_request'], axis=1)
data=data.drop(['requester_username'], axis=1)
data=data.drop(['giver_username_if_known'], axis=1)
data=data.drop(['requester_user_flair'], axis=1)

Let's cast the non-integer fields to integers. These fields represent days and having days as integers is sufficient for our analysis.

In [3]:
data.requester_account_age_in_days_at_request=data.requester_account_age_in_days_at_request.astype(int)
data.requester_account_age_in_days_at_retrieval=data.requester_account_age_in_days_at_retrieval.astype(int)
data.requester_days_since_first_post_on_raop_at_retrieval=data.requester_days_since_first_post_on_raop_at_retrieval.astype(int)
data.requester_days_since_first_post_on_raop_at_request=data.requester_days_since_first_post_on_raop_at_request.astype(int)

In [4]:
data.dtypes

number_of_downvotes_of_request_at_retrieval              int64
number_of_upvotes_of_request_at_retrieval                int64
post_was_edited                                          int64
request_number_of_comments_at_retrieval                  int64
request_text                                            object
requester_account_age_in_days_at_request                 int64
requester_account_age_in_days_at_retrieval               int64
requester_days_since_first_post_on_raop_at_request       int64
requester_days_since_first_post_on_raop_at_retrieval     int64
requester_number_of_comments_at_request                  int64
requester_number_of_comments_at_retrieval                int64
requester_number_of_comments_in_raop_at_request          int64
requester_number_of_comments_in_raop_at_retrieval        int64
requester_number_of_posts_at_request                     int64
requester_number_of_posts_at_retrieval                   int64
requester_number_of_posts_on_raop_at_request           

Let's make a 80:20 split for training and dev.

In [5]:
from sklearn.model_selection import train_test_split

train_data, dev_data = train_test_split(data, test_size=0.3)
train_labels = train_data["requester_received_pizza"]
dev_labels = dev_data["requester_received_pizza"]
train_data=train_data.drop(['requester_received_pizza'], axis=1)
dev_data=dev_data.drop(['requester_received_pizza'], axis=1)

train_data_text = train_data["request_text"]
train_data=train_data.drop(['request_text'], axis=1)
dev_data_text = dev_data["request_text"]
dev_data=dev_data.drop(['request_text'], axis=1)

print (train_data.shape)
print (train_data_text.shape)
print (dev_data.shape)
print (dev_data_text.shape)

(2828, 25)
(2828,)
(1212, 25)
(1212,)


In [6]:
train_data.head(10)

Unnamed: 0,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_number_of_comments_at_retrieval,requester_account_age_in_days_at_request,requester_account_age_in_days_at_retrieval,requester_days_since_first_post_on_raop_at_request,requester_days_since_first_post_on_raop_at_retrieval,requester_number_of_comments_at_request,requester_number_of_comments_at_retrieval,...,requester_number_of_posts_on_raop_at_retrieval,requester_number_of_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,year,month,day,dayofweek
3299,0,2,0,1,337,871,0,533,645,997,...,1,74,2500,3304,5988,7916,2012,6,20,2
3313,3,3,0,2,452,920,259,727,248,377,...,1,26,507,648,997,2108,2012,8,25,5
2698,3,4,0,1,24,135,0,110,1,4,...,1,1,1,4,1,14,2013,8,17,5
2548,1,5,0,0,139,324,0,184,213,306,...,1,35,1599,2678,2909,4792,2013,6,5,2
659,2,3,0,0,8,914,0,905,3,9,...,1,3,11,42,15,86,2011,6,14,1
444,2,5,0,4,0,803,0,802,0,6,...,1,1,1,10,1,14,2011,9,25,6
2026,5,11,0,3,259,445,0,186,52,77,...,1,10,1451,1797,2009,2769,2013,6,3,0
3283,11,57,0,7,302,655,0,352,502,987,...,1,50,2058,5278,4356,9756,2012,12,18,1
1643,1,9,0,0,170,497,0,327,96,205,...,1,18,1788,2310,5824,6680,2013,1,12,5
1070,1,2,0,0,1381,1680,0,299,561,820,...,1,69,3210,5310,6536,9866,2013,2,9,5


Let's fit a RandomForestClassifier this dataset.

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

y=train_labels
X=train_data
clf = RandomForestClassifier(random_state=0)
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

Let's test the accuracy of the model on the dev dataset.

In [8]:
X_test=dev_data
y_test=dev_labels
y_pred = clf.predict(X_test)
print('Accuracy of classifier on dev set: {:.2f}'.format(clf.score(X_test, y_test)))
print('LogLoss : {score}'.format(score=log_loss(dev_labels, y_pred)))

Accuracy of classifier on dev set: 0.84
LogLoss : 5.414524997124438


In [9]:
predictions=[]
predictions.append(clf.predict_proba(X_test))

Let's run a detailed classification report.

In [10]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

      False       0.86      0.95      0.90       911
       True       0.77      0.52      0.62       301

avg / total       0.84      0.84      0.83      1212



In [40]:
#all_data = data[data["requester_received_pizza"]==True].iloc[0:1000]
#all_data = all_data.append(data[data["requester_received_pizza"]==False].iloc[0:1000])
#all_data = all_data.sample(frac=1)


In [11]:
print (train_data_text.shape)
print (dev_data_text.shape)

(2828,)
(1212,)


### Text Classificaton

In [12]:
# Use basic pre-processing techniques
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

def no_preprocessor(s):
    return s

def better_preprocessor(s):
    message = s.lower()
    message = re.sub(r"[^a-zA-Z0-9]", " ", message)
    message = re.sub(r"[-]*", "", message)
    message = ' '.join([word[0:8] for word in message.split()])
    return message

def stemming_tokenizer(str_input):
    words = str_input.split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [13]:
cv = TfidfVectorizer(stop_words='english',
                     preprocessor=better_preprocessor,
                     lowercase=True,
                     tokenizer=stemming_tokenizer,
                     min_df=2, 
                     max_df=0.5, 
                     ngram_range=(1,3))
transformer = cv.fit_transform(train_data_text)
text_clf = LogisticRegression()
text_clf.fit(transformer,train_labels)
dev_data_trans = cv.transform(dev_data_text)
y_pred = text_clf.predict(dev_data_trans)
print ("Accuracy (on dev set): %.4f" % metrics.accuracy_score(y_true=dev_labels, y_pred=y_pred))
print (metrics.classification_report(y_true=dev_labels, y_pred=y_pred))
print('LogLoss {score}'.format(score=log_loss(dev_labels, y_pred)))

Accuracy (on dev set): 0.7541
             precision    recall  f1-score   support

      False       0.75      1.00      0.86       911
       True       1.00      0.01      0.02       301

avg / total       0.81      0.75      0.65      1212

LogLoss 8.492207397428535


In [14]:
predictions.append(text_clf.predict_proba(dev_data_trans))

In [15]:
np.array(predictions).shape

(2, 1212, 2)

In [16]:
dev_probabilities=[int(k) for k in dev_labels]

### Building an Ensemble model

In [17]:
def log_loss_func(weights):
    final_prediction = 0
    ''' scipy minimize will pass the weights as a numpy array '''
    for weight, prediction in zip(weights, predictions):
            final_prediction += weight*prediction

    return log_loss(dev_probabilities, final_prediction)
    #return metrics.recall_score(dev_probabilities, final_prediction)

In [22]:
starting_values = [0.5]*len(predictions)

cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
#our weights are bound between 0 and 1
bounds = [(0,1)]*len(predictions)

res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)

print('Ensemble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))

Ensemble Score: 0.3804418839256663
Best Weights: [0.86485372 0.13514628]


In [23]:
weights=res['x']
y_pred=[weights[0]*predictions[0][k][1]+weights[1]*predictions[1][k][1] for k in range(len(dev_data))]

In [24]:
y_pred=[True if k > 0.5 else False for k in y_pred]

In [25]:
print (metrics.classification_report(y_true=dev_labels, y_pred=y_pred))
print('LogLoss : {score}'.format(score=log_loss(dev_labels, y_pred)))

             precision    recall  f1-score   support

      False       0.86      0.95      0.90       911
       True       0.77      0.52      0.62       301

avg / total       0.84      0.84      0.83      1212

LogLoss : 5.414524997124438
