In [54]:
from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
import codecs
import json
import itertools
import csv

def read_dataset(path):
  with codecs.open(path, 'r', 'utf-8') as myFile:
    content = myFile.read()
  dataset = json.loads(content)
  return dataset

path = 'data/pizza_request_dataset.json'
dataset = read_dataset(path)
shuffle = np.random.permutation(np.arange(len(dataset)))

## Read the data and split into training and dev set

In [55]:
data_all = pd.read_json(path)
data_all = data_all.ix[shuffle]

data_all['hour_of_request'] = pd.to_datetime(data_all.unix_timestamp_of_request_utc.values, unit='s').hour
data_all['length_of_title'] = [len(entry.split()) for entry in data_all.request_title.values]
data_all['length_of_text'] = [len(entry.split()) for entry in data_all.request_text_edit_aware.values]

In [56]:
kaggle_test_data = pd.read_json('data/test.json')
print kaggle_test_data.shape

kaggle_test_data['hour_of_request'] = pd.to_datetime(kaggle_test_data.unix_timestamp_of_request_utc.values, unit='s').hour
kaggle_test_data['length_of_title'] = [len(entry.split()) for entry in kaggle_test_data.request_title.values]
kaggle_test_data['length_of_text'] = [len(entry.split()) for entry in kaggle_test_data.request_text_edit_aware.values]

(1631, 17)


In [57]:
train_all = data_all[:int(len(data_all) * 0.7)]
test_all = data_all[int(len(data_all) * 0.7):]

print train_all.shape
print test_all.shape

train_data_text = train_all.request_text_edit_aware.values
train_labels = train_all.requester_received_pizza.values

test_data_text= test_all.request_text_edit_aware.values
test_labels = test_all.requester_received_pizza.values

(3969, 36)
(1702, 36)


## Some helper functions

In [58]:
# linya: preprocessor for text
def text_preprocesspr(s):
    #lowercase does improve result a little bit
    return s.lower()

    #strip space does not improve 
    #return s.lower().strip()

#sample usage
print "'" + text_preprocesspr("S ") + "'"

's '


In [98]:
# linya: read from vocabulary provided in kaggle dataset
def get_vocabulary(filename):
    text_file = open("data/narratives/%s.txt" % (filename), "r")
    lines = text_file.read().split('\n')
    return lines

def get_vocabulary_all():
    filenames = ["desire", "money", "job", "family", "student"]
    result = set()
    for filename in filenames:
        lines = get_vocabulary(filename)
        for i in range(len(lines)):
            result.add(lines[i].lower())
    return result

preset_vocabulary = get_vocabulary_all() 
#checked, this is included in current vocabulary already


In [99]:
# myan: helper function to easily compare performance
def comapre_features(train_data, test_data, text_field = 'request_text_edit_aware', add_features_list=None):
    train_data_text = train_data[text_field].values
    train_labels = train_data.requester_received_pizza.values

    test_data_text= test_data[text_field].values
    test_labels = test_data.requester_received_pizza.values
    
    vec_train = CountVectorizer(preprocessor = text_preprocesspr)
    tokenized_train_data = vec_train.fit_transform(train_data_text)

    vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_, preprocessor = text_preprocesspr)
    tokenized_test_data = vec_test.fit_transform(test_data_text)
    
    LR_train = LogisticRegression()
    LR_train.fit(tokenized_train_data,train_labels)

    test_pred = LR_train.predict(tokenized_test_data)
    print "------ NLP Baseline: ------\n"
    print(classification_report(test_labels, test_pred))
    print '\n'
    
    if add_features_list:
        train_token_mat = tokenized_train_data.toarray()
        test_token_mat = tokenized_test_data.toarray()
        
        features_added = []
        for feature in add_features_list:
            features_added.append(feature)
            train_token_mat = np.concatenate((train_token_mat, np.array([train_data[feature].values]).T), axis=1)
            test_token_mat = np.concatenate((test_token_mat, np.array([test_data[feature].values]).T), axis=1)
            
            LR_train_plus = LogisticRegression()
            LR_train_plus.fit(train_token_mat, train_labels)
            test_pred_plus = LR_train_plus.predict(test_token_mat)
            print "--- Features = {feature}: ---\n".format(feature=features_added)
            print classification_report(test_labels, test_pred_plus)
            print '\n'

In [60]:
# myan: helper function to make a classifier based on a list of extra features
def make_classifier(train_data, test_data, text_field = 'request_text_edit_aware', add_features_list=None):
    train_data_text = train_data[text_field].values
    train_labels = train_data.requester_received_pizza.values

    test_data_text= test_data[text_field].values    
    
    vec_train = CountVectorizer(preprocessor = text_preprocesspr)
    tokenized_train_data = vec_train.fit_transform(train_data_text)

    vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_, preprocessor = text_preprocesspr)
    tokenized_test_data = vec_test.fit_transform(test_data_text)    
    
    train_token_mat = tokenized_train_data.toarray()
    test_token_mat = tokenized_test_data.toarray()
    if add_features_list:                
        features_added = []
        for feature in add_features_list:
            features_added.append(feature)
            train_token_mat = np.concatenate((train_token_mat, np.array([train_data[feature].values]).T), axis=1)
            test_token_mat = np.concatenate((test_token_mat, np.array([test_data[feature].values]).T), axis=1)
            
    LR_train_plus = LogisticRegression()
    LR_train_plus.fit(train_token_mat, train_labels)
    return LR_train_plus, test_token_mat    

## FEATURE ENGINEERING: Look at subreddits matrix

Had this idea: maybe the people giving / requesting pizza belong to a similar group (e.g. gamers tend to give pizzas to fellow gamers, or people who look more 'legit' by the subredits they participate in will tend to receive pizzas)

The implementation is simple:
1. Extract all the unique subreddits from the training_data as **all_unique_subreddits**
2. Construct a matrix for each observation, if the requester has subreddits in ith element of **all_unique_subreddits**, then fill a 1, otherwise 0
3. Concatenante this matrix to the tokenized text data
4. Train the model with LogisticRegression
5. Make predictions

In [61]:
# myan: a helper function to construct the subreddit matrix for each observation in step 2
def make_subreddits_matrix(input_data, unique_subreddits):
    results = []
    for entry in input_data:
        results.append(np.in1d(unique_subreddits, entry))
    return np.array(results)

In [62]:
all_unique_subreddits = np.unique(np.concatenate(train_all.requester_subreddits_at_request.values))
all_unique_subreddits.shape

subreddits_matrix_train_all = make_subreddits_matrix(train_all.requester_subreddits_at_request.values, 
                                                     all_unique_subreddits)
subreddits_matrix_test_all = make_subreddits_matrix(test_all.requester_subreddits_at_request.values, 
                                                    all_unique_subreddits)

Here I just ran a simple Bernoulli Naive-Bayes classifier to do a quick sanity check. If this feature is any good, the NB performance shouldn't be too bad.

In [63]:
nb = BernoulliNB()
nb.fit(np.array(subreddits_matrix_train_all), train_labels)

test_pred = nb.predict(np.array(subreddits_matrix_test_all))
print(classification_report(test_labels, test_pred))

             precision    recall  f1-score   support

      False       0.76      0.90      0.82      1276
       True       0.30      0.13      0.19       426

avg / total       0.64      0.71      0.66      1702



Quite interestingly, using a NB on the subreddits matrix *alone* gives us approximately the same performance we got with tokenizing the texts. Hmm, looks promising. How about we try to make a classifier and see what happens?

In [64]:
train_data_text = train_all.request_text_edit_aware.values
train_labels = train_all.requester_received_pizza.values

test_data_text= test_all.request_text_edit_aware.values
test_labels = test_all.requester_received_pizza.values

vec_train = CountVectorizer(preprocessor = text_preprocesspr)
tokenized_train_data = vec_train.fit_transform(train_data_text)

vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_, preprocessor = text_preprocesspr)
tokenized_test_data = vec_test.fit_transform(test_data_text)

train_token_mat = tokenized_train_data.toarray()
test_token_mat = tokenized_test_data.toarray()

In [65]:
LR_train_plus = LogisticRegression()
LR_train_plus.fit(np.concatenate((train_token_mat, subreddits_matrix_train_all), axis=1), train_labels)

test_pred_plus = LR_train_plus.predict(np.concatenate((test_token_mat, subreddits_matrix_test_all), axis=1))

print classification_report(test_labels, test_pred_plus)

             precision    recall  f1-score   support

      False       0.77      0.86      0.81      1276
       True       0.36      0.24      0.29       426

avg / total       0.67      0.70      0.68      1702



The results on test_data above looks like an overall improvement. Let's go ahead and make a submission

In [66]:
subreddits_kaggle_test_data = make_subreddits_matrix(kaggle_test_data.requester_subreddits_at_request.values,
                                                     all_unique_subreddits)

tokenized_test_data = vec_test.fit_transform(kaggle_test_data.request_text_edit_aware.values)
test_token_mat = tokenized_test_data.toarray()
test_token_mat = np.concatenate((test_token_mat, subreddits_kaggle_test_data), axis=1)

pred_test_all = LR_train_plus.predict(test_token_mat)

predictions = pd.DataFrame()
predictions['request_id'] = kaggle_test_data.request_id
#maxyan: sample submission was expecting 0 instead of False
predictions['requester_received_pizza'] = pred_test_all.astype(int)
predictions = predictions.set_index('request_id')

# make sure the length is as expected in https://www.kaggle.com/c/random-acts-of-pizza/submissions/attach
print len(predictions) == 1631
predictions.to_csv('max_nlp_plus_features_submission_v3.csv')

True


**Some final words on my bit of exploration**

Throughout the exercise, I used the originally shuffled and *splitted* data for training and testing. I haven't bothered making any changes. I'm not sure if we can further improve the performance if we took the entire dataset for training.


Also, I have been using Ricardo's dataset (i.e. 'data/pizza_request_dataset.json') rather than the one provided by Kaggle. I think there are some differences but I haven't figured out what exactly these differences are. 

### Baseline

In [43]:
vec_train = CountVectorizer(preprocessor = text_preprocesspr)
tokenized_train_data = vec_train.fit_transform(train_data_text)

vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_, preprocessor = text_preprocesspr)
tokenized_test_data = vec_test.fit_transform(test_data_text)

print tokenized_train_data.shape
print tokenized_test_data.shape

LR_train = LogisticRegression()
LR_train.fit(tokenized_train_data,train_labels)

test_pred = LR_train.predict(tokenized_test_data)

print(classification_report(test_labels, test_pred))

(3969, 12055)
(1702, 12055)
             precision    recall  f1-score   support

      False       0.76      0.85      0.81      1274
       True       0.33      0.21      0.26       428

avg / total       0.66      0.69      0.67      1702



## Exploring other features

In [44]:
list(kaggle_test_data.columns.astype(str))

['giver_username_if_known',
 'request_id',
 'request_text_edit_aware',
 'request_title',
 'requester_account_age_in_days_at_request',
 'requester_days_since_first_post_on_raop_at_request',
 'requester_number_of_comments_at_request',
 'requester_number_of_comments_in_raop_at_request',
 'requester_number_of_posts_at_request',
 'requester_number_of_posts_on_raop_at_request',
 'requester_number_of_subreddits_at_request',
 'requester_subreddits_at_request',
 'requester_upvotes_minus_downvotes_at_request',
 'requester_upvotes_plus_downvotes_at_request',
 'requester_username',
 'unix_timestamp_of_request',
 'unix_timestamp_of_request_utc',
 'hour_of_request',
 'length_of_title',
 'length_of_text']

In [45]:
comapre_features(train_all, 
                 test_all, 
                 add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                   'requester_number_of_subreddits_at_request',
                                   'requester_upvotes_minus_downvotes_at_request',
                                   'length_of_text',
                                   'length_of_title',
                                   'hour_of_request'])

------ NLP Baseline: ------

             precision    recall  f1-score   support

      False       0.76      0.85      0.81      1274
       True       0.33      0.21      0.26       428

avg / total       0.66      0.69      0.67      1702



--- Features = ['requester_number_of_posts_on_raop_at_request']: ---

             precision    recall  f1-score   support

      False       0.77      0.86      0.81      1274
       True       0.35      0.23      0.28       428

avg / total       0.66      0.70      0.68      1702



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request']: ---

             precision    recall  f1-score   support

      False       0.77      0.86      0.81      1274
       True       0.35      0.23      0.28       428

avg / total       0.66      0.70      0.68      1702



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_upvotes_minus_d

In [46]:
comapre_features(train_all, 
                 test_all, 
                 text_field='request_title',
                 add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                   'requester_number_of_subreddits_at_request',
                                   'requester_upvotes_minus_downvotes_at_request',
                                   'length_of_text',
                                   'length_of_title',
                                   'hour_of_request'])

------ NLP Baseline: ------

             precision    recall  f1-score   support

      False       0.75      0.93      0.83      1274
       True       0.30      0.09      0.14       428

avg / total       0.64      0.72      0.66      1702



--- Features = ['requester_number_of_posts_on_raop_at_request']: ---

             precision    recall  f1-score   support

      False       0.76      0.93      0.83      1274
       True       0.34      0.11      0.17       428

avg / total       0.65      0.72      0.67      1702



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request']: ---

             precision    recall  f1-score   support

      False       0.76      0.93      0.83      1274
       True       0.35      0.12      0.18       428

avg / total       0.66      0.72      0.67      1702



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_upvotes_minus_d

## Make a classifier and predict

In [47]:
kaggle_test_mat

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [48]:
classifier, kaggle_test_mat = make_classifier(train_all, 
                                              kaggle_test_data, 
                                              add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                                                 'requester_number_of_subreddits_at_request'])

In [49]:
classifier, kaggle_test_mat = make_classifier(train_all, 
                                              kaggle_test_data, 
                                              add_features_list=[])

In [50]:
pred_test_all = classifier.predict(kaggle_test_mat)

predictions = pd.DataFrame()
predictions['request_id'] = kaggle_test_data.request_id
#maxyan: sample submission was expecting 0 instead of False
predictions['requester_received_pizza'] = pred_test_all.astype(int)
predictions = predictions.set_index('request_id')

# make sure the length is as expected in https://www.kaggle.com/c/random-acts-of-pizza/submissions/attach
print len(predictions) == 1631
predictions.to_csv('max_nlp_plus_features_submission_v2.csv')

True
