# Random Acts of Pizza

The objective of the project is to predict if the requester on Reddit will receive a Pizza as an act of altruism from one of the other Reddit users. 
The train and test data files are available in kaggle.com and have been downloaded at the below location before executing this notebook. 

In this notebook we are going to look at only the non-text field and build a classification model.

In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime
import re
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from scipy.optimize import minimize
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedShuffleSplit

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

#from sklearn.cross_validation import train_test_split

We are going to add year, month, day and dayofweek fields and drop all text fields as well user-specific fields like giver_username_if_known, requester_username etc.

In [3]:
data = pd.read_json("/Users/gautamkarnataki/MIDS/train.json")
data['year']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).strftime('%Y'), axis=1).astype(int)
data['month']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).strftime('%m'), axis=1).astype(int)
data['day']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).strftime('%d'), axis=1).astype(int)
data['dayofweek']=data.apply(lambda x: datetime.utcfromtimestamp(x['unix_timestamp_of_request_utc']).weekday(), axis=1).astype(int)
data=data.drop(['unix_timestamp_of_request'], axis=1)
data=data.drop(['request_id'], axis=1)
data=data.drop(['unix_timestamp_of_request_utc'], axis=1)
data=data.drop(['request_text_edit_aware'], axis=1)
#data=data.drop(['request_title'], axis=1)
data=data.drop(['requester_subreddits_at_request'], axis=1)
data=data.drop(['requester_username'], axis=1)
data=data.drop(['giver_username_if_known'], axis=1)
data=data.drop(['requester_user_flair'], axis=1)

Let's cast the non-integer fields to integers. These fields represent days and having days as integers is sufficient for our analysis.

In [4]:
data.requester_account_age_in_days_at_request=data.requester_account_age_in_days_at_request.astype(int)
data.requester_account_age_in_days_at_retrieval=data.requester_account_age_in_days_at_retrieval.astype(int)
data.requester_days_since_first_post_on_raop_at_retrieval=data.requester_days_since_first_post_on_raop_at_retrieval.astype(int)
data.requester_days_since_first_post_on_raop_at_request=data.requester_days_since_first_post_on_raop_at_request.astype(int)

In [5]:
data.dtypes

number_of_downvotes_of_request_at_retrieval              int64
number_of_upvotes_of_request_at_retrieval                int64
post_was_edited                                          int64
request_number_of_comments_at_retrieval                  int64
request_text                                            object
request_title                                           object
requester_account_age_in_days_at_request                 int64
requester_account_age_in_days_at_retrieval               int64
requester_days_since_first_post_on_raop_at_request       int64
requester_days_since_first_post_on_raop_at_retrieval     int64
requester_number_of_comments_at_request                  int64
requester_number_of_comments_at_retrieval                int64
requester_number_of_comments_in_raop_at_request          int64
requester_number_of_comments_in_raop_at_retrieval        int64
requester_number_of_posts_at_request                     int64
requester_number_of_posts_at_retrieval                 

Let's make a 80:20 split for training and dev.

In [6]:
X = data["request_text"]
y = data["requester_received_pizza"]
sss = StratifiedShuffleSplit(n_splits=5, random_state=1234)
for train_index, dev_index in sss.split(X,y):
    break

train_data,dev_data = X[train_index],X[dev_index]
train_labels,dev_labels = y[train_index],y[dev_index]

### Text Classificaton

In [7]:
predictions = []

In [8]:
# Use basic pre-processing techniques
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

def text_preprocessor(s):
    message = s.lower()
    message = re.sub(r"\brequest|\[|\]|\(|\)|\$|\!|\/|\.|\*|\+|\&|\=|\%|\:|\?|\"|\,|\;|\@|\_|\\|\}|\{|\||\~", " ", message)
    message = re.sub(r"[0-9]", " ", message)
    message = re.sub(r"[-]*", "", message)
    message = ' '.join([word[0:20] for word in message.split() if len(word)>3])
    return message

def stemming_tokenizer(str_input):
    words = str_input.split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [11]:
def classify(clfs):
    cv = CountVectorizer(stop_words='english',
                         preprocessor=text_preprocessor,
                         lowercase=True,
                         tokenizer=stemming_tokenizer,
                         min_df=5, 
                         max_df=0.2, 
                         ngram_range=(1,2))
    transformer = cv.fit_transform(train_data)
    logreg = LogisticRegression()
    logreg.fit(transformer,train_labels)
    dev_data_trans = cv.transform(dev_data)
    y_pred = text_clf.predict(dev_data_trans)
    print ("Accuracy (on dev set): %.4f" % metrics.accuracy_score(y_true=dev_labels, y_pred=y_pred))
    print (metrics.classification_report(y_true=dev_labels, y_pred=y_pred))
    print('LogLoss {score}'.format(score=log_loss(dev_labels, y_pred)))
    predictions.append(text_clf.predict_proba(dev_data_trans))
    dev_probabilities=[int(k) for k in dev_labels]
    clfs.append(logreg)
    return clfs

In [28]:
predictions = []
X = data["request_text"]
y = data["requester_received_pizza"]
sss = StratifiedShuffleSplit(n_splits=5, random_state=1000)
for train_index, dev_index in sss.split(X,y):
    break

train_data,dev_data = X[train_index],X[dev_index]
train_labels,dev_labels = y[train_index],y[dev_index]
cv = CountVectorizer(stop_words='english',
                         preprocessor=text_preprocessor,
                         lowercase=True,
                         tokenizer=stemming_tokenizer,
                         min_df=5, 
                         max_df=0.2, 
                         ngram_range=(1,1))
transformer = cv.fit_transform(train_data)
logreg_pizza_text = LogisticRegression()
logreg_pizza_text.fit(transformer,train_labels)
dev_data_trans = cv.transform(dev_data)
y_pred = logreg_pizza_text.predict(dev_data_trans)
print ("Accuracy (on dev set): %.4f" % metrics.accuracy_score(y_true=dev_labels, y_pred=y_pred))
print (metrics.classification_report(y_true=dev_labels, y_pred=y_pred))
print('LogLoss {score}'.format(score=log_loss(dev_labels, y_pred)))
predictions.append(logreg_pizza_text.predict_proba(dev_data_trans))

X = data["request_title"]
y = data["requester_received_pizza"]
sss = StratifiedShuffleSplit(n_splits=5, random_state=1000)
for train_index, dev_index in sss.split(X,y):
    break
    
train_data,dev_data = X[train_index],X[dev_index]
train_labels,dev_labels = y[train_index],y[dev_index]
cv = CountVectorizer(stop_words='english',
                         preprocessor=text_preprocessor,
                         lowercase=True,
                         tokenizer=stemming_tokenizer,
                         min_df=5, 
                         max_df=0.2, 
                         ngram_range=(1,1))
transformer = cv.fit_transform(train_data)
logreg_pizza_title = LogisticRegression()
logreg_pizza_title.fit(transformer,train_labels)
dev_data_trans = cv.transform(dev_data)
y_pred = logreg_pizza_title.predict(dev_data_trans)
print ("Accuracy (on dev set): %.4f" % metrics.accuracy_score(y_true=dev_labels, y_pred=y_pred))
print (metrics.classification_report(y_true=dev_labels, y_pred=y_pred))
print('LogLoss {score}'.format(score=log_loss(dev_labels, y_pred)))
predictions.append(logreg_pizza_title.predict_proba(dev_data_trans))

Accuracy (on dev set): 0.7030
              precision    recall  f1-score   support

       False       0.78      0.85      0.81       305
        True       0.35      0.25      0.29        99

   micro avg       0.70      0.70      0.70       404
   macro avg       0.56      0.55      0.55       404
weighted avg       0.67      0.70      0.69       404

LogLoss 10.2591335368
Accuracy (on dev set): 0.7376
              precision    recall  f1-score   support

       False       0.76      0.95      0.85       305
        True       0.33      0.07      0.12        99

   micro avg       0.74      0.74      0.74       404
   macro avg       0.55      0.51      0.48       404
weighted avg       0.66      0.74      0.67       404

LogLoss 9.06218191145


In [29]:
def log_loss_func(weights):
    ''' scipy minimize will pass the weights as a numpy array '''
    final_prediction = 0
    for weight, prediction in zip(weights, predictions):
            final_prediction += weight*prediction

    return log_loss(dev_labels, final_prediction)
    
#the algorithms need a starting value, right not we chose 0.5 for all weights
#its better to choose many random starting points and run minimize a few times
starting_values = [0.5]*len(predictions)
cons = ({'type':'eq','fun':lambda w: 1-sum(w)})

#our weights are bound between 0 and 1
bounds = [(0,1)]*len(predictions)
res = minimize(log_loss_func, starting_values, method='Nelder-Mead', bounds=bounds, constraints=cons)

print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))

Ensamble Score: 0.558724221529
Best Weights: [0.25988513 0.69784521]


In [30]:
weights=res['x']
y_pred=[weights[0]*predictions[0][k][1]+weights[1]*predictions[1][k][1] for k in range(len(dev_data))]

In [31]:
y_pred=[True if k > 0.5 else False for k in y_pred]

In [32]:
print (metrics.classification_report(y_true=dev_labels, y_pred=y_pred))
print('LogLoss : {score}'.format(score=log_loss(dev_labels, y_pred)))

              precision    recall  f1-score   support

       False       0.76      0.98      0.85       305
        True       0.30      0.03      0.06        99

   micro avg       0.75      0.75      0.75       404
   macro avg       0.53      0.50      0.45       404
weighted avg       0.64      0.75      0.66       404

LogLoss : 8.8056919947
