In [292]:
#load lib
from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import codecs
import json
import itertools
import csv

In [293]:
#load datasets
def read_dataset(path):
  with codecs.open(path, 'r', 'utf-8') as myFile:
    content = myFile.read()
  dataset = json.loads(content)
  return dataset

path = 'data/pizza_request_dataset.json'
dataset = read_dataset(path)
shuffle = np.random.permutation(np.arange(len(dataset)))
print 'There are %d requests in json dataset.' % (len(dataset))

There are 5671 requests in json dataset.


## Read the data and split into training and dev set

In [294]:
data_all = pd.read_json(path)
data_all = data_all.ix[shuffle]

data_all['hour_of_request'] = pd.to_datetime(data_all.unix_timestamp_of_request_utc.values, unit='s').hour
data_all['length_of_title'] = [len(entry.split()) for entry in data_all.request_title.values]
data_all['length_of_text'] = [len(entry.split()) for entry in data_all.request_text_edit_aware.values]

In [295]:
kaggle_test_data = pd.read_json('data/test.json')
print 'Kaggle test data provided (without label)',kaggle_test_data.shape

kaggle_test_data['hour_of_request'] = pd.to_datetime(kaggle_test_data.unix_timestamp_of_request_utc.values, unit='s').hour
kaggle_test_data['length_of_title'] = [len(entry.split()) for entry in kaggle_test_data.request_title.values]
kaggle_test_data['length_of_text'] = [len(entry.split()) for entry in kaggle_test_data.request_text_edit_aware.values]

Kaggle test data provided (without label) (1631, 17)


In [319]:
def combine_text_title(data):
    text_and_title = [data.shape]
    for i in range(data.shape[0]):
        value = data.request_title.values[i] + " " + data.request_text_edit_aware.values[i]
        if(1<10):
            print value
        text_and_title.append(value.lower())
    return np.array(text_and_title).T
     

In [320]:
split_point= int(len(data_all) * 0.8)
train_all = data_all[:split_point]
test_all = data_all[split_point:]

print 'Training dataset shape:', train_all.shape
print 'Test dataset shape:', test_all.shape

train_data_text = combine_text_title(train_all)
train_labels = train_all.requester_received_pizza.values

test_data_text= combine_text_title(test_all)
test_labels = test_all.requester_received_pizza.values

Training dataset shape: (4536, 36)
Test dataset shape: (1135, 36)
[Request] Lonely and hungry, maybe someone can help me out Girlfriend's out of town for the weekend, but even if she were here her car's busted, and we're both broke. So i'm just sitting here on Reddit, depressed and hungry. Pizza places don't open for a little bit here but if someone could help me out with a pie and a 2 liter you'd be my hero.

Edit: I'm an idiot and totally spaced on the location. Oh well. If anyone sees this and wants to take pity, i'm in Tempe AZ. I'd edit the title but dunno how.
[Request] Chicago, IL - Check lost in the mail, $0.25 in the bank account, and out of ramen Hey there.

I've been struggling the past few months but this week has been a little rougher than most.  A check I was expecting from a temp job I had worked was lost in the mail and it sounds like I won't receive the reissued one until later this week or early next week.  My bank account is looking real sad right now and I've used u

## Some helper functions

In [310]:
# linya: preprocessor for text
def text_preprocesspr(s):
    #lowercase does improve result a little bit
    #return s.lower()

    #strip space does not improve 
    return s.lower().strip()
    
    #return s

#sample usage
print "'" + text_preprocesspr("S ") + "'"

's'


In [311]:
#Use the "min_df" argument to prune words that appear in fewer than x number of documents. To reduce the size of vocabulary
def global_min_df():
    return 20

In [312]:
# linya: read from vocabulary provided in kaggle dataset
def get_vocabulary(filename):
    text_file = open("data/narratives/%s.txt" % (filename), "r")
    lines = text_file.read().split('\n')
    return lines

def get_vocabulary_all():
    filenames = ["desire", "money", "job", "family", "student"]
    result = set()
    for filename in filenames:
        lines = get_vocabulary(filename)
        for i in range(len(lines)):
            result.add(lines[i].lower())
    return result

preset_vocabulary = get_vocabulary_all() 
#checked, this is included in current vocabulary already


In [313]:
# myan: helper function to easily compare performance
def comapre_features(train_data, test_data, add_features_list=None):
    train_data_text = combine_text_title(train_data)
    train_labels = train_data.requester_received_pizza.values

    test_data_text= combine_text_title(test_data)
    test_labels = test_data.requester_received_pizza.values
    
    vec_train = CountVectorizer(input='content', analyzer = 'word', ngram_range=(1,3), min_df=global_min_df(), preprocessor = text_preprocesspr)
    tokenized_train_data = vec_train.fit_transform(train_data_text)

    vec_test = CountVectorizer(input='content', analyzer = 'word', ngram_range=(1,3), min_df=global_min_df(), vocabulary=vec_train.vocabulary_, preprocessor = text_preprocesspr)
    tokenized_test_data = vec_test.fit_transform(test_data_text)
    
    NB_train = MultinomialNB()
    NB_train.fit(tokenized_train_data,train_labels)

    test_pred = NB_train.predict(tokenized_test_data)
    print "------ NLP Baseline: ------\n"
    print(classification_report(test_labels, test_pred))
    print '\n'
    
    if add_features_list:
        train_token_mat = tokenized_train_data.toarray()
        test_token_mat = tokenized_test_data.toarray()
        
        features_added = []
        for feature in add_features_list:
            features_added.append(feature)
            train_token_mat = np.concatenate((train_token_mat, np.array([train_data[feature].values]).T), axis=1)
            test_token_mat = np.concatenate((test_token_mat, np.array([test_data[feature].values]).T), axis=1)
            
            NB_train_plus = LogisticRegression()
            NB_train_plus.fit(train_token_mat, train_labels)
            test_pred_plus = NB_train_plus.predict(test_token_mat)
            print "--- Features = {feature}: ---\n".format(feature=features_added)
            print classification_report(test_labels, test_pred_plus)
            print '\n'

In [314]:
# myan: helper function to make a classifier based on a list of extra features
def make_classifier(train_data, test_data, add_features_list=None):
    train_data_text = combine_text_title(train_data)
    train_labels = train_data.requester_received_pizza.values

    test_data_text= combine_text_title(test_data) 
    
    vec_train = CountVectorizer(input='content', analyzer = 'word', ngram_range=(1,2), min_df=global_min_df(), preprocessor = text_preprocesspr)
    tokenized_train_data = vec_train.fit_transform(train_data_text)

    vec_test = CountVectorizer(input='content', analyzer = 'word', ngram_range=(1,3), min_df=global_min_df(), vocabulary=vec_train.vocabulary_, preprocessor = text_preprocesspr)
    tokenized_test_data = vec_test.fit_transform(test_data_text)    
    
    train_token_mat = tokenized_train_data.toarray()
    test_token_mat = tokenized_test_data.toarray()
    if add_features_list:                
        features_added = []
        for feature in add_features_list:
            features_added.append(feature)
            train_token_mat = np.concatenate((train_token_mat, np.array([train_data[feature].values]).T), axis=1)
            test_token_mat = np.concatenate((test_token_mat, np.array([test_data[feature].values]).T), axis=1)
            
    NB_train_plus = LogisticRegression()
    NB_train_plus.fit(train_token_mat, train_labels)
    return NB_train_plus, test_token_mat    

## FEATURE ENGINEERING: Look at subreddits matrix

Had this idea: maybe the people giving / requesting pizza belong to a similar group (e.g. gamers tend to give pizzas to fellow gamers, or people who look more 'legit' by the subredits they participate in will tend to receive pizzas)

The implementation is simple:
1. Extract all the unique subreddits from the training_data as **all_unique_subreddits**
2. Construct a matrix for each observation, if the requester has subreddits in ith element of **all_unique_subreddits**, then fill a 1, otherwise 0
3. Concatenante this matrix to the tokenized text data
4. Train the model with LogisticRegression
5. Make predictions

In [315]:
# myan: a helper function to construct the subreddit matrix for each observation in step 2
def make_subreddits_matrix(input_data, unique_subreddits):
    results = []
    for entry in input_data:
        results.append(np.in1d(unique_subreddits, entry))
    return np.array(results)

In [316]:
all_unique_subreddits = np.unique(np.concatenate(train_all.requester_subreddits_at_request.values))
all_unique_subreddits.shape

subreddits_matrix_train_all = make_subreddits_matrix(train_all.requester_subreddits_at_request.values, 
                                                     all_unique_subreddits)
subreddits_matrix_test_all = make_subreddits_matrix(test_all.requester_subreddits_at_request.values, 
                                                    all_unique_subreddits)

Here I just ran a simple Bernoulli Naive-Bayes classifier to do a quick sanity check. If this feature is any good, the NB performance shouldn't be too bad.

In [317]:
nb = MultinomialNB(alpha = 1.0)
nb.fit(np.array(subreddits_matrix_train_all), train_labels)

test_pred = nb.predict(np.array(subreddits_matrix_test_all))
print(classification_report(test_labels, test_pred))

             precision    recall  f1-score   support

      False       0.76      0.98      0.85       858
       True       0.37      0.04      0.07       277

avg / total       0.66      0.75      0.66      1135



Quite interestingly, using a NB on the subreddits matrix *alone* gives us approximately the same performance we got with tokenizing the texts. Hmm, looks promising. How about we try to make a classifier and see what happens?

In [318]:
train_data_text = combine_text_title(train_all)
train_labels = train_all.requester_received_pizza.values

test_data_text= combine_text_title(test_all)
test_labels = test_all.requester_received_pizza.values

vec_train = CountVectorizer(input='content', analyzer = 'word', ngram_range=(1,2), min_df=global_min_df(), preprocessor = text_preprocesspr)
tokenized_train_data = vec_train.fit_transform(train_data_text)

vec_test = CountVectorizer(input='content', analyzer = 'word', ngram_range=(1,3), min_df=global_min_df(), vocabulary=vec_train.vocabulary_, preprocessor = text_preprocesspr)
tokenized_test_data = vec_test.fit_transform(test_data_text)

train_token_mat = tokenized_train_data.toarray()
test_token_mat = tokenized_test_data.toarray()

[Request] Lonely and hungry, maybe someone can help me out Girlfriend's out of town for the weekend, but even if she were here her car's busted, and we're both broke. So i'm just sitting here on Reddit, depressed and hungry. Pizza places don't open for a little bit here but if someone could help me out with a pie and a 2 liter you'd be my hero.

Edit: I'm an idiot and totally spaced on the location. Oh well. If anyone sees this and wants to take pity, i'm in Tempe AZ. I'd edit the title but dunno how.
[Request] Chicago, IL - Check lost in the mail, $0.25 in the bank account, and out of ramen Hey there.

I've been struggling the past few months but this week has been a little rougher than most.  A check I was expecting from a temp job I had worked was lost in the mail and it sounds like I won't receive the reissued one until later this week or early next week.  My bank account is looking real sad right now and I've used up the last bit of my spare change to cover transit fare.  I've bee

AttributeError: 'tuple' object has no attribute 'lower'

In [238]:
NB_train_plus = MultinomialNB(alpha = 1.0)
NB_train_plus.fit(np.concatenate((train_token_mat, subreddits_matrix_train_all), axis=1), train_labels)

test_pred_plus = NB_train_plus.predict(np.concatenate((test_token_mat, subreddits_matrix_test_all), axis=1))

print classification_report(test_labels, test_pred_plus)

             precision    recall  f1-score   support

      False       0.78      0.89      0.83       863
       True       0.36      0.19      0.25       272

avg / total       0.68      0.73      0.69      1135



In [240]:
#check incorrect predictions 
print 'test case count', test_labels.shape[0]

#pred_case is the case that predicted but wrong 
#print_detail is whether you want it to print the request text or not 
def print_wrong_prediction(pred_labels, correct_labels, case_data, pred_case, print_detail):
    print 'predict is %s but label is opposite' % (pred_case)
    count = 0
    for i in range(pred_labels.shape[0]):
        if((pred_labels[i] != correct_labels[i]) & (pred_labels[i] == pred_case)):
            count = count + 1
            if (print_detail):
                print '# %d:' % (i), 'pred is', pred_labels[i], 'label is', correct_labels[i]
                print 'request text is\n', case_data[i], '\n\n\n'
    print 'total count of this kind of case is %d, this is %.2f percent' % (count, count*100.0/test_labels.shape[0]*1.0)
    
print_wrong_prediction(test_pred_plus, test_labels, test_data_text, True, True)

print_wrong_prediction(test_pred_plus, test_labels, test_data_text, False, False)

test case count 1135
predict is True but label is opposite
# 12: pred is True label is False
request text is
Haven't eaten anything these last two days hoping Reddit can help! 



# 27: pred is True label is False
request text is
Not the best week as you can see from the title. Money stresses making my relationships fall apart and struggling to get back on my feet. Pizza would not only cheer me up but fill my belly. Any help is appreciated : ) 



# 35: pred is True label is False
request text is
So, well. My girlfriend and I are 20, and 22 years old respectively. We decided to follow our dreams of creating a digital marketing/design firm about a year and a half ago. The road has been long and hard, we slept on my father's couch in a trailer for 6 months, a dank basement (on the floor) with people who didn't really want us there for the last 9, sometimes living off ramen as we all have to sometimes.

But here we are, on the advent of our success, and Christopher Columbus messes it up. 

The results on test_data above looks like an overall improvement. Let's go ahead and make a submission

In [141]:
subreddits_kaggle_test_data = make_subreddits_matrix(kaggle_test_data.requester_subreddits_at_request.values,
                                                     all_unique_subreddits)

tokenized_test_data = vec_test.fit_transform(kaggle_test_data.request_text_edit_aware.values)
test_token_mat = tokenized_test_data.toarray()
test_token_mat = np.concatenate((test_token_mat, subreddits_kaggle_test_data), axis=1)

pred_test_all = NB_train_plus.predict(test_token_mat)

predictions = pd.DataFrame()
predictions['request_id'] = kaggle_test_data.request_id
#maxyan: sample submission was expecting 0 instead of False
predictions['requester_received_pizza'] = pred_test_all.astype(int)
predictions = predictions.set_index('request_id')

# make sure the length is as expected in https://www.kaggle.com/c/random-acts-of-pizza/submissions/attach
print len(predictions) == 1631
predictions.to_csv('max_nlp_plus_features_submission_v3.csv')

True


**Some final words on my bit of exploration**

Throughout the exercise, I used the originally shuffled and *splitted* data for training and testing. I haven't bothered making any changes. I'm not sure if we can further improve the performance if we took the entire dataset for training.


Also, I have been using Ricardo's dataset (i.e. 'data/pizza_request_dataset.json') rather than the one provided by Kaggle. I think there are some differences but I haven't figured out what exactly these differences are. 

### Baseline

In [142]:
vec_train = CountVectorizer(input='content', analyzer = 'word', ngram_range=(1,2), min_df=global_min_df(), preprocessor = text_preprocesspr)
tokenized_train_data = vec_train.fit_transform(train_data_text)

vec_test = CountVectorizer(input='content', vocabulary=vec_train.vocabulary_, preprocessor = text_preprocesspr)
tokenized_test_data = vec_test.fit_transform(test_data_text)

print tokenized_train_data.shape
print tokenized_test_data.shape

NB_train = BernoulliNB(alpha=1.0)
NB_train.fit(tokenized_train_data,train_labels)

test_pred = NB_train.predict(tokenized_test_data)

print(classification_report(test_labels, test_pred))

(4536, 6243)
(1135, 6243)
             precision    recall  f1-score   support

      False       0.76      0.93      0.84       857
       True       0.34      0.12      0.18       278

avg / total       0.66      0.73      0.68      1135



## Exploring other features

In [143]:
list(kaggle_test_data.columns.astype(str))

['giver_username_if_known',
 'request_id',
 'request_text_edit_aware',
 'request_title',
 'requester_account_age_in_days_at_request',
 'requester_days_since_first_post_on_raop_at_request',
 'requester_number_of_comments_at_request',
 'requester_number_of_comments_in_raop_at_request',
 'requester_number_of_posts_at_request',
 'requester_number_of_posts_on_raop_at_request',
 'requester_number_of_subreddits_at_request',
 'requester_subreddits_at_request',
 'requester_upvotes_minus_downvotes_at_request',
 'requester_upvotes_plus_downvotes_at_request',
 'requester_username',
 'unix_timestamp_of_request',
 'unix_timestamp_of_request_utc',
 'hour_of_request',
 'length_of_title',
 'length_of_text']

In [144]:
comapre_features(train_all, 
                 test_all, 
                 add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                   'requester_number_of_subreddits_at_request',
                                   'requester_upvotes_minus_downvotes_at_request',
                                   'length_of_text',
                                   'length_of_title',
                                   'hour_of_request'])

------ NLP Baseline: ------

             precision    recall  f1-score   support

      False       0.78      0.81      0.79       857
       True       0.33      0.28      0.30       278

avg / total       0.67      0.68      0.67      1135



--- Features = ['requester_number_of_posts_on_raop_at_request']: ---

             precision    recall  f1-score   support

      False       0.77      0.84      0.81       857
       True       0.34      0.24      0.28       278

avg / total       0.67      0.70      0.68      1135



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request']: ---

             precision    recall  f1-score   support

      False       0.77      0.84      0.81       857
       True       0.34      0.24      0.28       278

avg / total       0.67      0.70      0.68      1135



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_upvotes_minus_d

In [145]:
comapre_features(train_all, 
                 test_all, 
                 text_field='request_title',
                 add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                   'requester_number_of_subreddits_at_request',
                                   'requester_upvotes_minus_downvotes_at_request',
                                   'length_of_text',
                                   'length_of_title',
                                   'hour_of_request'])

------ NLP Baseline: ------

             precision    recall  f1-score   support

      False       0.76      0.86      0.81       857
       True       0.28      0.17      0.21       278

avg / total       0.64      0.69      0.66      1135



--- Features = ['requester_number_of_posts_on_raop_at_request']: ---

             precision    recall  f1-score   support

      False       0.76      0.89      0.82       857
       True       0.28      0.13      0.17       278

avg / total       0.64      0.71      0.66      1135



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request']: ---

             precision    recall  f1-score   support

      False       0.76      0.89      0.82       857
       True       0.26      0.12      0.16       278

avg / total       0.63      0.70      0.66      1135



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_upvotes_minus_d

## Make a classifier and predict

In [146]:
kaggle_test_mat

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [147]:
classifier, kaggle_test_mat = make_classifier(train_all, 
                                              kaggle_test_data, 
                                              add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                                                 'requester_number_of_subreddits_at_request'])

In [148]:
classifier, kaggle_test_mat = make_classifier(train_all, 
                                              kaggle_test_data, 
                                              add_features_list=[])

In [149]:
pred_test_all = classifier.predict(kaggle_test_mat)

predictions = pd.DataFrame()
predictions['request_id'] = kaggle_test_data.request_id
#maxyan: sample submission was expecting 0 instead of False
predictions['requester_received_pizza'] = pred_test_all.astype(int)
predictions = predictions.set_index('request_id')

# make sure the length is as expected in https://www.kaggle.com/c/random-acts-of-pizza/submissions/attach
print len(predictions) == 1631
predictions.to_csv('max_nlp_plus_features_submission_v2.csv')

True
