In [136]:
from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import codecs
import json
import itertools
import csv

def read_dataset(path):
  with codecs.open(path, 'r', 'utf-8') as myFile:
    content = myFile.read()
  dataset = json.loads(content)
  return dataset

path = './pizza_request_dataset.json'
dataset = read_dataset(path)

print 'The dataset contains %d samples.' %(len(dataset))
print 'Available attributes: ', sorted(dataset[0].keys())
print 'First post:'
print json.dumps(dataset[0], sort_keys=True, indent=2)

successes = [r['requester_received_pizza'] for r in dataset]
success_rate = 100.0 * sum(successes) / float(len(successes))
print 'The average success rate is: %.2f%%' %(success_rate)

shuffle = np.random.permutation(np.arange(len(dataset)))

The dataset contains 5671 samples.
Available attributes:  [u'giver_username_if_known', u'in_test_set', u'number_of_downvotes_of_request_at_retrieval', u'number_of_upvotes_of_request_at_retrieval', u'post_was_edited', u'request_id', u'request_number_of_comments_at_retrieval', u'request_text', u'request_text_edit_aware', u'request_title', u'requester_account_age_in_days_at_request', u'requester_account_age_in_days_at_retrieval', u'requester_days_since_first_post_on_raop_at_request', u'requester_days_since_first_post_on_raop_at_retrieval', u'requester_number_of_comments_at_request', u'requester_number_of_comments_at_retrieval', u'requester_number_of_comments_in_raop_at_request', u'requester_number_of_comments_in_raop_at_retrieval', u'requester_number_of_posts_at_request', u'requester_number_of_posts_at_retrieval', u'requester_number_of_posts_on_raop_at_request', u'requester_number_of_posts_on_raop_at_retrieval', u'requester_number_of_subreddits_at_request', u'requester_received_pizza', u'

In [163]:
#read train data

train = []
test = []

for i in range(int(len(shuffle) * 0.7)):
    train.append(dataset[shuffle[i]])

for i in range(int(len(shuffle) * 0.3)):
    test.append(dataset[shuffle[len(shuffle) - i - 1]])
    
train_labels = np.zeros(len(train))
test_labels = np.zeros(len(test))

train_data_text = []
test_data_text = []

for i in range(len(train_labels)):
    train_labels[i] = train[i].get('requester_received_pizza')
    train_data_text.append(train[i].get('request_text_edit_aware'))

for i in range(len(test_labels)):
    test_labels[i] = test[i].get('requester_received_pizza')
    test_data_text.append(test[i].get('request_text_edit_aware'))

vec_train = CountVectorizer()
tokenized_train_data = vec_train.fit_transform(train_data_text)

vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_)
tokenized_test_data = vec_test.fit_transform(test_data_text)

print tokenized_train_data.shape
print tokenized_test_data.shape


(3969, 12233)
(1701, 12233)


In [138]:
LR_train = LogisticRegression()
LR_train.fit(tokenized_train_data,train_labels)

test_pred = LR_train.predict(tokenized_test_data)

print(classification_report(test_labels, test_pred))


             precision    recall  f1-score   support

        0.0       0.78      0.85      0.81      1287
        1.0       0.35      0.25      0.29       414

avg / total       0.68      0.70      0.69      1701



In [139]:
#see performance with "requester_account_age_in_days_at_request" and 
#"requester_days_since_first_post_on_raop_at_request"

#append the account age in days to data

train_token_mat = tokenized_train_data.todense()
test_token_mat = tokenized_test_data.todense()

train_tmp = np.zeros(len(train))
test_tmp = np.zeros(len(test))

for i in range(len(train_labels)):
    train_tmp[i] = train[i].get('requester_account_age_in_days_at_request')

for i in range(len(test_labels)):
    test_tmp[i] = test[i].get('requester_account_age_in_days_at_request')


train_token_mat_plus = np.c_[train_token_mat,train_tmp]
test_token_mat_plus = np.c_[test_token_mat,test_tmp]

LR_train_plus = LogisticRegression()
LR_train_plus.fit(train_token_mat_plus,train_labels)

test_pred_plus = LR_train_plus.predict(test_token_mat_plus)

print "Adding requester account age in days:\n"
print(classification_report(test_labels, test_pred_plus))


#append the requester_days_since_first_post_on_raop_at_request

train_tmp = np.zeros(len(train))
test_tmp = np.zeros(len(test))

for i in range(len(train_labels)):
    train_tmp[i] = train[i].get('requester_days_since_first_post_on_raop_at_request')

for i in range(len(test_labels)):
    test_tmp[i] = test[i].get('requester_days_since_first_post_on_raop_at_request')


train_token_mat_plus_1 = np.c_[train_token_mat_plus,train_tmp]
test_token_mat_plus_1 = np.c_[test_token_mat_plus,test_tmp]

LR_train_plus_1 = LogisticRegression()
LR_train_plus_1.fit(train_token_mat_plus_1,train_labels)

test_pred_plus_1 = LR_train_plus_1.predict(test_token_mat_plus_1)

print "Adding requester days since first post:\n"
print(classification_report(test_labels, test_pred_plus_1))

Adding requester account age in days:

             precision    recall  f1-score   support

        0.0       0.78      0.85      0.82      1287
        1.0       0.36      0.26      0.30       414

avg / total       0.68      0.71      0.69      1701

Adding requester days since first post:

             precision    recall  f1-score   support

        0.0       0.78      0.85      0.82      1287
        1.0       0.37      0.27      0.31       414

avg / total       0.68      0.71      0.69      1701



In [168]:
#append requester_upvotes_minus_downvotes_at_request (did best)

train_tmp = np.zeros(len(train))
test_tmp = np.zeros(len(test))

for i in range(len(train_labels)):
    train_tmp[i] = train[i].get('requester_upvotes_minus_downvotes_at_request')
        
for i in range(len(test_labels)):
    test_tmp[i] = test[i].get('requester_upvotes_minus_downvotes_at_request')
        
#print max_val

train_token_mat_plus_2 = np.c_[train_token_mat_plus_1,train_tmp]
test_token_mat_plus_2 = np.c_[test_token_mat_plus_1,test_tmp]

LR_train_plus_2 = LogisticRegression()
LR_train_plus_2.fit(train_token_mat_plus_2,train_labels)

test_pred_plus_2 = LR_train_plus_2.predict(test_token_mat_plus_2)

print "Adding requester upvotes - downvotes:\n"

print(classification_report(test_labels, test_pred_plus_2))

Adding requester upvotes - downvotes:

             precision    recall  f1-score   support

        0.0       0.78      0.86      0.82      1287
        1.0       0.38      0.26      0.31       414

avg / total       0.68      0.72      0.70      1701



In [171]:
#append requester_number_of_comments_in_raop_at_request

train_tmp = np.zeros(len(train))
test_tmp = np.zeros(len(test))

for i in range(len(train_labels)):
    train_tmp[i] = train[i].get('requester_number_of_comments_in_raop_at_request')
        
for i in range(len(test_labels)):
    test_tmp[i] = test[i].get('requester_number_of_comments_in_raop_at_request')
        
#print max_val

train_token_mat_plus_3 = np.c_[train_token_mat_plus_2,train_tmp]
test_token_mat_plus_3 = np.c_[test_token_mat_plus_2,test_tmp]

LR_train_plus_3 = LogisticRegression()
LR_train_plus_3.fit(train_token_mat_plus_3,train_labels)

test_pred_plus_3 = LR_train_plus_3.predict(test_token_mat_plus_3)

print "Adding comments in RAOP\n"

print(classification_report(test_labels, test_pred_plus_3))

Adding comments in RAOP

             precision    recall  f1-score   support

        0.0       0.78      0.86      0.82      1287
        1.0       0.38      0.27      0.31       414

avg / total       0.69      0.71      0.70      1701



In [152]:
#append requester_number_of_subreddits_at_request

train_tmp = np.zeros(len(train))
test_tmp = np.zeros(len(test))

max_val = 0

for i in range(len(train_labels)):
    train_tmp[i] = train[i].get('requester_number_of_subreddits_at_request') / 235
    
    if max_val < train_tmp[i]:
        max_val = train_tmp[i]

for i in range(len(test_labels)):
    test_tmp[i] = test[i].get('requester_number_of_subreddits_at_request') / 235

    if max_val < test_tmp[i]:
        max_val = test_tmp[i]

#print max_val

train_token_mat_plus_4 = np.c_[train_token_mat_plus_3,train_tmp]
test_token_mat_plus_4 = np.c_[test_token_mat_plus_3,test_tmp]

LR_train_plus_4 = LogisticRegression()
LR_train_plus_4.fit(train_token_mat_plus_4,train_labels)

test_pred_plus_4 = LR_train_plus_4.predict(test_token_mat_plus_4)

print "Adding number of subreddits:\n"

print(classification_report(test_labels, test_pred_plus_4))

Adding number of subreddits:

             precision    recall  f1-score   support

        0.0       0.76      1.00      0.86      1287
        1.0       0.00      0.00      0.00       414

avg / total       0.57      0.76      0.65      1701



In [167]:
#title NLP 

train_data_text_title = []
test_data_text_title = []

for i in range(len(train_labels)):
    train_data_text_title.append(train[i].get('request_title'))
    #train_data_text_title.append("blarg")
    
for i in range(len(test_labels)):
    test_data_text_title.append(test[i].get('request_title'))
    #test_data_text_title.append("blarg too")
    
vec_train_title = CountVectorizer()
tokenized_train_data_title = vec_train_title.fit_transform(train_data_text_title)

vec_test_title = CountVectorizer(vocabulary=vec_train_title.vocabulary_)
tokenized_test_data_title = vec_test_title.fit_transform(test_data_text_title)

LR_train_title = LogisticRegression()
LR_train_title.fit(tokenized_train_data_title,train_labels)

test_pred_title = LR_train_title.predict(tokenized_test_data_title)

print(classification_report(test_labels, test_pred_title))


train_token_mat_title = tokenized_train_data_title.todense()
test_token_mat_title = tokenized_test_data_title.todense()

train_token_mat_plus_3 = np.c_[train_token_mat_plus_2,train_token_mat_title]
test_token_mat_plus_3 = np.c_[test_token_mat_plus_2,test_token_mat_title]

LR_train_plus_3 = LogisticRegression()
LR_train_plus_3.fit(train_token_mat_plus_3,train_labels)

test_pred_plus_3 = LR_train_plus_3.predict(test_token_mat_plus_3)

print "Adding title NLP\n"

print(classification_report(test_labels, test_pred_plus_3))

             precision    recall  f1-score   support

        0.0       0.77      0.94      0.85      1287
        1.0       0.41      0.13      0.19       414

avg / total       0.68      0.74      0.69      1701

Adding title NLP

             precision    recall  f1-score   support

        0.0       0.78      0.85      0.82      1287
        1.0       0.36      0.26      0.30       414

avg / total       0.68      0.71      0.69      1701



#check user flair

train_tmp = np.zeros((len(train),3))
test_tmp = np.zeros((len(test),3))

#encode into 3 dim array
for i in range(len(train_labels)):
    if train[i].get('requester_user_flair') == None:
        train_tmp[i,0] = 1
    if train[i].get('requester_user_flair') == "shroom":
        train_tmp[i,1] = 1
    if train[i].get('requester_user_flair') == "PIF":
        train_tmp[i,2] = 1

for i in range(len(test_labels)):
    if test[i].get('requester_user_flair') == None:
        test_tmp[i,0] = 1
    if test[i].get('requester_user_flair') == "shroom":
        test_tmp[i,1] = 1
    if test[i].get('requester_user_flair') == "PIF":
        test_tmp[i,2] = 1
        
#append flair vector

train_token_mat_plus_2 = np.c_[train_token_mat_plus_1,train_tmp]
test_token_mat_plus_2 = np.c_[test_token_mat_plus_1,test_tmp]

LR_train_plus_2 = LogisticRegression()
LR_train_plus_2.fit(train_token_mat_plus_2,train_labels)

test_pred_plus_2 = LR_train_plus_2.predict(test_token_mat_plus_2)

print "Adding requester days since first post:\n"
print(classification_report(test_labels, test_pred_plus_2))

#print np.corrcoef(train_labels,train_tmp_0[:,1])

#append requester_upvotes_plus_downvotes_at_request

train_tmp = np.zeros(len(train))
test_tmp = np.zeros(len(test))

for i in range(len(train_labels)):
    train_tmp[i] = train[i].get('requester_upvotes_plus_downvotes_at_request')

for i in range(len(test_labels)):
    test_tmp[i] = test[i].get('requester_upvotes_plus_downvotes_at_request')


train_token_mat_plus_3 = np.c_[train_token_mat_plus_2,train_tmp]
test_token_mat_plus_3 = np.c_[test_token_mat_plus_2,test_tmp]

LR_train_plus_3 = LogisticRegression()
LR_train_plus_3.fit(train_token_mat_plus_3,train_labels)

test_pred_plus_3 = LR_train_plus_3.predict(test_token_mat_plus_3)

print "Adding requester upvotes + downvotes:\n"

print(classification_report(test_labels, test_pred_plus_3))

#append unix_timestamp_of_request

train_tmp = np.zeros(len(train))
test_tmp = np.zeros(len(test))

for i in range(len(train_labels)):
    train_tmp[i] = train[i].get('unix_timestamp_of_request')

for i in range(len(test_labels)):
    test_tmp[i] = test[i].get('unix_timestamp_of_request')


train_token_mat_plus_3 = np.c_[train_token_mat_plus_2,train_tmp]
test_token_mat_plus_3 = np.c_[test_token_mat_plus_2,test_tmp]

LR_train_plus_3 = LogisticRegression()
LR_train_plus_3.fit(train_token_mat_plus_3,train_labels)

test_pred_plus_3 = LR_train_plus_3.predict(test_token_mat_plus_3)

print "Adding unix timestamp:\n"

print(classification_report(test_labels, test_pred_plus_3))

In [78]:
#create predictions for Kaggle
test_data = pd.read_json('data/test.json')
test_data_text = test_data.request_text_edit_aware

tokenized_test_data = vec_test.fit_transform(test_data_text)

test_pred = LR_train.predict(tokenized_test_data)

predictions = pd.DataFrame()
predictions['request_id'] = test_data.request_id
predictions['requester_received_pizza'] = test_pred
predictions = predictions.set_index('request_id')

predictions.to_csv('NLP_submission.csv')

In [133]:
#create predictions for Kaggle
test_data = pd.read_json('data/test.json')

print test_data.columns

test_data_text = test_data.request_text_edit_aware

tokenized_test_data = vec_test.fit_transform(test_data_text)

test_token_mat = tokenized_test_data.todense()

test_tmp = test_data.requester_account_age_in_days_at_request

test_token_mat_plus = np.c_[test_token_mat,test_tmp]

#append the requester_days_since_first_post_on_raop_at_request

test_tmp = test_data.requester_days_since_first_post_on_raop_at_request

test_token_mat_plus_1 = np.c_[test_token_mat_plus,test_tmp]

test_pred = LR_train_plus_1.predict(test_token_mat_plus_1)

predictions = pd.DataFrame()
predictions['request_id'] = test_data.request_id
predictions['requester_received_pizza'] = test_pred
predictions = predictions.set_index('request_id')

predictions.to_csv('NLP_submission_v2.csv')

Index([                           u'giver_username_if_known',
                                               u'request_id',
                                  u'request_text_edit_aware',
                                            u'request_title',
                 u'requester_account_age_in_days_at_request',
       u'requester_days_since_first_post_on_raop_at_request',
                  u'requester_number_of_comments_at_request',
          u'requester_number_of_comments_in_raop_at_request',
                     u'requester_number_of_posts_at_request',
             u'requester_number_of_posts_on_raop_at_request',
                u'requester_number_of_subreddits_at_request',
                          u'requester_subreddits_at_request',
             u'requester_upvotes_minus_downvotes_at_request',
              u'requester_upvotes_plus_downvotes_at_request',
                                       u'requester_username',
                                u'unix_timestamp_of_request',
        