In [2]:
from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import codecs
import json
import itertools
import csv

def read_dataset(path):
  with codecs.open(path, 'r', 'utf-8') as myFile:
    content = myFile.read()
  dataset = json.loads(content)
  return dataset

path = 'data/pizza_request_dataset.json'
dataset = read_dataset(path)

print 'The dataset contains %d samples.' %(len(dataset))
print 'Available attributes: ', sorted(dataset[0].keys())
print 'First post:'
print json.dumps(dataset[0], sort_keys=True, indent=2)

successes = [r['requester_received_pizza'] for r in dataset]
success_rate = 100.0 * sum(successes) / float(len(successes))
print 'The average success rate is: %.2f%%' %(success_rate)

shuffle = np.random.permutation(np.arange(len(dataset)))

The dataset contains 5671 samples.
Available attributes:  [u'giver_username_if_known', u'in_test_set', u'number_of_downvotes_of_request_at_retrieval', u'number_of_upvotes_of_request_at_retrieval', u'post_was_edited', u'request_id', u'request_number_of_comments_at_retrieval', u'request_text', u'request_text_edit_aware', u'request_title', u'requester_account_age_in_days_at_request', u'requester_account_age_in_days_at_retrieval', u'requester_days_since_first_post_on_raop_at_request', u'requester_days_since_first_post_on_raop_at_retrieval', u'requester_number_of_comments_at_request', u'requester_number_of_comments_at_retrieval', u'requester_number_of_comments_in_raop_at_request', u'requester_number_of_comments_in_raop_at_retrieval', u'requester_number_of_posts_at_request', u'requester_number_of_posts_at_retrieval', u'requester_number_of_posts_on_raop_at_request', u'requester_number_of_posts_on_raop_at_retrieval', u'requester_number_of_subreddits_at_request', u'requester_received_pizza', u'

## Read the data and split into training and dev set

In [33]:
data_all = pd.read_json(path)
data_all = data_all.ix[shuffle]

data_all['hour_of_request'] = pd.to_datetime(data_all.unix_timestamp_of_request_utc.values, unit='s').hour
data_all['length_of_title'] = [len(entry.split()) for entry in data_all.request_title.values]
data_all['length_of_text'] = [len(entry.split()) for entry in data_all.request_text_edit_aware.values]

In [101]:
kaggle_test_data = pd.read_json('data/test.json')
print kaggle_test_data.shape

kaggle_test_data['hour_of_request'] = pd.to_datetime(kaggle_test_data.unix_timestamp_of_request_utc.values, unit='s').hour
kaggle_test_data['length_of_title'] = [len(entry.split()) for entry in kaggle_test_data.request_title.values]
kaggle_test_data['length_of_text'] = [len(entry.split()) for entry in kaggle_test_data.request_text_edit_aware.values]

(1631, 17)


In [34]:
train_all = data_all[:int(len(data_all) * 0.7)]
test_all = data_all[int(len(data_all) * 0.7):]

print train_all.shape
print test_all.shape

train_data_text = train_all.request_text_edit_aware.values
train_labels = train_all.requester_received_pizza.values

test_data_text= test_all.request_text_edit_aware.values
test_labels = test_all.requester_received_pizza.values

(3969, 36)
(1702, 36)


In [94]:
# myan: helper function to easily compare performance
def comapre_features(train_data, test_data, text_field = 'request_text_edit_aware', add_features_list=None):
    train_data_text = train_data[text_field].values
    train_labels = train_data.requester_received_pizza.values

    test_data_text= test_data[text_field].values
    test_labels = test_data.requester_received_pizza.values
    
    vec_train = CountVectorizer()
    tokenized_train_data = vec_train.fit_transform(train_data_text)

    vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_)
    tokenized_test_data = vec_test.fit_transform(test_data_text)
    
    LR_train = LogisticRegression()
    LR_train.fit(tokenized_train_data,train_labels)

    test_pred = LR_train.predict(tokenized_test_data)
    print "------ NLP Baseline: ------\n"
    print(classification_report(test_labels, test_pred))
    print '\n'
    
    if add_features_list:
        train_token_mat = tokenized_train_data.toarray()
        test_token_mat = tokenized_test_data.toarray()
        
        features_added = []
        for feature in add_features_list:
            features_added.append(feature)
            train_token_mat = np.concatenate((train_token_mat, np.array([train_data[feature].values]).T), axis=1)
            test_token_mat = np.concatenate((test_token_mat, np.array([test_data[feature].values]).T), axis=1)
            
            LR_train_plus = LogisticRegression()
            LR_train_plus.fit(train_token_mat, train_labels)
            test_pred_plus = LR_train_plus.predict(test_token_mat)
            print "--- Features = {feature}: ---\n".format(feature=features_added)
            print classification_report(test_labels, test_pred_plus)
            print '\n'

In [92]:
# myan: helper function to make a classifier based on a list of extra features
def make_classifier(train_data, test_data, text_field = 'request_text_edit_aware', add_features_list=None):
    train_data_text = train_data[text_field].values
    train_labels = train_data.requester_received_pizza.values

    test_data_text= test_data[text_field].values    
    
    vec_train = CountVectorizer()
    tokenized_train_data = vec_train.fit_transform(train_data_text)

    vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_)
    tokenized_test_data = vec_test.fit_transform(test_data_text)    
    
    if add_features_list:
        train_token_mat = tokenized_train_data.toarray()
        test_token_mat = tokenized_test_data.toarray()
        
        features_added = []
        for feature in add_features_list:
            features_added.append(feature)
            train_token_mat = np.concatenate((train_token_mat, np.array([train_data[feature].values]).T), axis=1)
            test_token_mat = np.concatenate((test_token_mat, np.array([test_data[feature].values]).T), axis=1)
            
        LR_train_plus = LogisticRegression()
        LR_train_plus.fit(train_token_mat, train_labels)
        return LR_train_plus, test_token_mat
    return None, None

### First model

In [96]:
vec_train = CountVectorizer()
tokenized_train_data = vec_train.fit_transform(train_data_text)

vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_)
tokenized_test_data = vec_test.fit_transform(test_data_text)

print tokenized_train_data.shape
print tokenized_test_data.shape

LR_train = LogisticRegression()
LR_train.fit(tokenized_train_data,train_labels)

test_pred = LR_train.predict(tokenized_test_data)

print(classification_report(test_labels, test_pred))

(3969, 12174)
(1702, 12174)
             precision    recall  f1-score   support

      False       0.77      0.88      0.82      1281
       True       0.35      0.20      0.26       421

avg / total       0.66      0.71      0.68      1702



## Exploring other features

In [78]:
list(kaggle_test_data.columns.astype(str))

(1631, 17)


['giver_username_if_known',
 'request_id',
 'request_text_edit_aware',
 'request_title',
 'requester_account_age_in_days_at_request',
 'requester_days_since_first_post_on_raop_at_request',
 'requester_number_of_comments_at_request',
 'requester_number_of_comments_in_raop_at_request',
 'requester_number_of_posts_at_request',
 'requester_number_of_posts_on_raop_at_request',
 'requester_number_of_subreddits_at_request',
 'requester_subreddits_at_request',
 'requester_upvotes_minus_downvotes_at_request',
 'requester_upvotes_plus_downvotes_at_request',
 'requester_username',
 'unix_timestamp_of_request',
 'unix_timestamp_of_request_utc']

In [97]:
comapre_features(train_all, 
                 test_all, 
                 add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                   'requester_number_of_subreddits_at_request',
                                   'requester_upvotes_minus_downvotes_at_request',
                                   'length_of_text',
                                   'length_of_title',
                                   'hour_of_request'])

------ NLP Baseline: ------

             precision    recall  f1-score   support

      False       0.77      0.88      0.82      1281
       True       0.35      0.20      0.26       421

avg / total       0.66      0.71      0.68      1702



--- Features = ['requester_number_of_posts_on_raop_at_request']: ---

             precision    recall  f1-score   support

      False       0.77      0.87      0.82      1281
       True       0.36      0.22      0.27       421

avg / total       0.67      0.71      0.69      1702



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request']: ---

             precision    recall  f1-score   support

      False       0.78      0.87      0.82      1281
       True       0.37      0.23      0.28       421

avg / total       0.67      0.71      0.69      1702



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_upvotes_minus_d

In [98]:
comapre_features(train_all, 
                 test_all, 
                 text_field='request_title',
                 add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                   'requester_number_of_subreddits_at_request',
                                   'requester_upvotes_minus_downvotes_at_request',
                                   'length_of_text',
                                   'length_of_title',
                                   'hour_of_request'])

------ NLP Baseline: ------

             precision    recall  f1-score   support

      False       0.76      0.91      0.83      1281
       True       0.29      0.11      0.16       421

avg / total       0.64      0.71      0.66      1702



--- Features = ['requester_number_of_posts_on_raop_at_request']: ---

             precision    recall  f1-score   support

      False       0.76      0.91      0.83      1281
       True       0.30      0.11      0.16       421

avg / total       0.64      0.71      0.66      1702



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request']: ---

             precision    recall  f1-score   support

      False       0.76      0.91      0.83      1281
       True       0.32      0.13      0.18       421

avg / total       0.65      0.72      0.67      1702



--- Features = ['requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_upvotes_minus_d

## Make a classifier and predict

In [106]:
classifier, kaggle_test_mat = make_classifier(train_all, 
                                              kaggle_test_data, 
                                              add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                                                   'requester_number_of_subreddits_at_request'])

In [107]:
pred_test_all = classifier.predict(kaggle_test_mat)

predictions = pd.DataFrame()
predictions['request_id'] = kaggle_test_data.request_id
#maxyan: sample submission was expecting 0 instead of False
predictions['requester_received_pizza'] = pred_test_all.astype(int)
predictions = predictions.set_index('request_id')

# make sure the length is as expected in https://www.kaggle.com/c/random-acts-of-pizza/submissions/attach
print len(predictions) == 1631
predictions.to_csv('max_nlp_plus_features_submission.csv')

True
