In [1]:
from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
import codecs
import json
import itertools
import csv

def read_dataset(path):
  with codecs.open(path, 'r', 'utf-8') as myFile:
    content = myFile.read()
  dataset = json.loads(content)
  return dataset

path = 'data/pizza_request_dataset.json'

## Read the data and split into training and dev set

In [74]:
data_all.day_of_week_at_request.unique()

array([1, 6, 4, 3, 0, 2, 5])

In [76]:
data_all = pd.read_json(path)
shuffle = np.random.permutation(np.arange(len(data_all)))
data_all = data_all.ix[shuffle]

data_all['hour_of_request'] = pd.to_datetime(data_all.unix_timestamp_of_request_utc.values, unit='s').hour
data_all['day_of_week_at_request'] = pd.to_datetime(data_all.unix_timestamp_of_request.values, unit='s').dayofweek
data_all['is_weekend'] = data_all.day_of_week_at_request.isin([0, 6, 5])

data_all['day_of_month_at_request'] = pd.to_datetime(data_all.unix_timestamp_of_request.values, unit='s').day
data_all['month_pos'] = 0
data_all['month_pos'].loc[(data_all.day_of_month_at_request >= 10) & (data_all.day_of_month_at_request < 20)] = 1
data_all['month_pos'].loc[data_all.day_of_month_at_request >= 20] = 2

data_all['length_of_title'] = [len(entry.split()) for entry in data_all.request_title.values]
data_all['length_of_text'] = [len(entry.split()) for entry in data_all.request_text_edit_aware.values]

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [3]:
import nltk
nltk.download()  # Download text data sets, including stop words

showing info http://www.nltk.org/nltk_data/


True

In [4]:
from nltk.corpus import stopwords # Import the stop word list
print stopwords.words("english") 

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [5]:
def remove_stop_words(texts):
    result = []
    for entry in texts:
        words = [w.lower() for w in entry.split() if not w.lower() in stopwords.words("english")]
        res = ''
        for w in words:
            res += ' ' + w
        result.append(res)
    return result

In [78]:
data_all['request_text_revised'] = remove_stop_words(data_all.request_text_edit_aware.values)

In [77]:
kaggle_test_data = pd.read_json('data/test.json')
print kaggle_test_data.shape

kaggle_test_data['hour_of_request'] = pd.to_datetime(kaggle_test_data.unix_timestamp_of_request_utc.values, unit='s').hour
kaggle_test_data['length_of_title'] = [len(entry.split()) for entry in kaggle_test_data.request_title.values]
kaggle_test_data['length_of_text'] = [len(entry.split()) for entry in kaggle_test_data.request_text_edit_aware.values]
kaggle_test_data['day_of_week_at_request'] = pd.to_datetime(kaggle_test_data.unix_timestamp_of_request.values, unit='s').dayofweek
kaggle_test_data['day_of_month_at_request'] = pd.to_datetime(kaggle_test_data.unix_timestamp_of_request.values, unit='s').day
kaggle_test_data['request_text_revised'] = remove_stop_words(kaggle_test_data.request_text_edit_aware.values)

kaggle_test_data['is_weekend'] = kaggle_test_data.day_of_week_at_request.isin([0, 6, 5])
kaggle_test_data['month_pos'] = 0
kaggle_test_data['month_pos'].loc[(kaggle_test_data.day_of_month_at_request >= 10) & (kaggle_test_data.day_of_month_at_request < 20)] = 1
kaggle_test_data['month_pos'].loc[kaggle_test_data.day_of_month_at_request >= 20] = 2

(1631, 17)


In [79]:
train_all = data_all[:int(len(data_all) * 0.7)]
test_all = data_all[int(len(data_all) * 0.7):]

print train_all.shape
print test_all.shape

train_data_text = train_all.request_text_edit_aware.values
train_labels = train_all.requester_received_pizza.values

test_data_text= test_all.request_text_edit_aware.values
test_labels = test_all.requester_received_pizza.values

(3969, 41)
(1702, 41)


## Some helper functions

In [9]:
# myan: helper function to easily compare performance
def compare_features(train_data, test_data, text_field = 'request_text_edit_aware', add_features_list=None):
    train_data_text = train_data[text_field].values
    train_labels = train_data.requester_received_pizza.values

    test_data_text= test_data[text_field].values
    test_labels = test_data.requester_received_pizza.values
    
    vec_train = CountVectorizer()
    tokenized_train_data = vec_train.fit_transform(train_data_text)

    vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_)
    tokenized_test_data = vec_test.fit_transform(test_data_text)
    
    LR_train = LogisticRegression()
    LR_train.fit(tokenized_train_data,train_labels)

    test_pred = LR_train.predict(tokenized_test_data)
    print "------ NLP Baseline: ------\n"
    print(classification_report(test_labels, test_pred))
    print '\n'
    
    if add_features_list:
        train_token_mat = tokenized_train_data.toarray()
        test_token_mat = tokenized_test_data.toarray()
        
        features_added = []
        for feature in add_features_list:
            features_added.append(feature)
            train_token_mat = np.concatenate((train_token_mat, np.array([train_data[feature].values]).T), axis=1)
            test_token_mat = np.concatenate((test_token_mat, np.array([test_data[feature].values]).T), axis=1)
            
            LR_train_plus = LogisticRegression()
            LR_train_plus.fit(train_token_mat, train_labels)
            test_pred_plus = LR_train_plus.predict(test_token_mat)
            print "--- Features = {feature}: ---\n".format(feature=features_added)
            print classification_report(test_labels, test_pred_plus)
            print '\n'

In [10]:
def prepare_data(train_data, test_data, text_field= 'request_text_edit_aware', extra_features=None):
    train_data_text = train_data[text_field].values
    if 'requester_received_pizza' in train_data.columns:
        train_labels = train_data.requester_received_pizza.values
    else:
        train_labels = None

    test_data_text= test_data[text_field].values
    if 'requester_received_pizza' in test_data.columns:
        test_labels = test_data.requester_received_pizza.values
    else:
        test_labels = None
    
    vec_train = CountVectorizer()
    tokenized_train_data = vec_train.fit_transform(train_data_text)

    vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_)
    tokenized_test_data = vec_test.fit_transform(test_data_text)    
    
    train_token_mat = tokenized_train_data.toarray()
    test_token_mat = tokenized_test_data.toarray()
    
    if extra_features:                
        for feature in extra_features:            
            train_token_mat = np.concatenate((train_token_mat, np.array([train_data[feature].values]).T), axis=1)
            test_token_mat = np.concatenate((test_token_mat, np.array([test_data[feature].values]).T), axis=1)
    
    return train_token_mat, train_labels, test_token_mat, test_labels

In [11]:
# myan: helper function to make a classifier based on a list of extra features
def make_classifier(train_data, test_data, text_field = 'request_text_edit_aware', add_features_list=None):
    train_data_text = train_data[text_field].values
    train_labels = train_data.requester_received_pizza.values

    test_data_text= test_data[text_field].values    
    
    vec_train = CountVectorizer()
    tokenized_train_data = vec_train.fit_transform(train_data_text)

    vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_)
    tokenized_test_data = vec_test.fit_transform(test_data_text)    
    
    train_token_mat = tokenized_train_data.toarray()
    test_token_mat = tokenized_test_data.toarray()
    if add_features_list:                
        features_added = []
        for feature in add_features_list:
            features_added.append(feature)
            train_token_mat = np.concatenate((train_token_mat, np.array([train_data[feature].values]).T), axis=1)
            test_token_mat = np.concatenate((test_token_mat, np.array([test_data[feature].values]).T), axis=1)
            
    LR_train_plus = LogisticRegression()
    LR_train_plus.fit(train_token_mat, train_labels)
    return LR_train_plus, test_token_mat    

## FEATURE ENGINEERING: Look at subreddits matrix

Had this idea: maybe the people giving / requesting pizza belong to a similar group (e.g. gamers tend to give pizzas to fellow gamers, or people who look more 'legit' by the subredits they participate in will tend to receive pizzas)

The implementation is simple:
1. Extract all the unique subreddits from the training_data as **all_unique_subreddits**
2. Construct a matrix for each observation, if the requester has subreddits in ith element of **all_unique_subreddits**, then fill a 1, otherwise 0
3. Concatenante this matrix to the tokenized text data
4. Train the model with LogisticRegression
5. Make predictions

In [12]:
# myan: a helper function to construct the subreddit matrix for each observation in step 2
def make_subreddits_matrix(input_data, unique_subreddits):
    results = []
    for entry in input_data:
        results.append(np.in1d(unique_subreddits, entry))
    return np.array(results)

In [80]:
all_unique_subreddits = np.unique(np.concatenate(train_all.requester_subreddits_at_request.values))
all_unique_subreddits.shape

subreddits_matrix_train_all = make_subreddits_matrix(train_all.requester_subreddits_at_request.values, 
                                                     all_unique_subreddits)
subreddits_matrix_test_all = make_subreddits_matrix(test_all.requester_subreddits_at_request.values, 
                                                    all_unique_subreddits)

Here I just ran a simple Bernoulli Naive-Bayes classifier to do a quick sanity check. If this feature is any good, the NB performance shouldn't be too bad.

In [None]:
nb = BernoulliNB()
nb.fit(subreddits_matrix_train_all, train_labels)

test_pred = nb.predict(subreddits_matrix_test_all)
print(classification_report(test_labels, test_pred))

Now what if we upsample the data...

In [14]:
def upsample_minority(matrix, labels, random_state=0, ratio=1):
    num_positive = labels.sum()
    num_negative = len(labels) - num_positive
    
    if num_negative == num_positive:
        return matrix, labels
    
    minority_count = min(num_positive, num_negative)
    majority_count = max(num_positive, num_negative)
    
    if num_positive >= num_negative:
        matrix_minority, labels_minority = matrix[labels == 0, :], labels[labels == 0]        
        matrix_majority, labels_majority = matrix[labels == 1, :], labels[labels == 1]        
    else:
        matrix_minority, labels_minority = matrix[labels == 1, :], labels[labels == 1]
        matrix_majority, labels_majority = matrix[labels == 0, :], labels[labels == 0]
    
    index = np.random.randint(0, high=len(labels_minority),size=(majority_count - minority_count))
    extra_matrix_minority, extra_labels_minority = matrix_minority[index, :], labels_minority[index]
    matrix_minority = np.concatenate((matrix_minority, extra_matrix_minority))
    labels_minority = np.concatenate((labels_minority, extra_labels_minority))
    return np.concatenate((matrix_minority, matrix_majority)), np.concatenate((labels_minority, labels_majority))        

In [56]:
def upsample_minority_sparse(matrix, labels, random_state=0, ratio=1):
    matrix = matrix.todense()
    
    num_positive = labels.sum()
    num_negative = len(labels) - num_positive
    
    if num_negative == num_positive:
        return matrix, labels
    
    minority_count = min(num_positive, num_negative)
    majority_count = max(num_positive, num_negative)
    
    if num_positive >= num_negative:
        matrix_minority, labels_minority = matrix[labels == 0, :], labels[labels == 0]        
        matrix_majority, labels_majority = sparse.csr_matrix(matrix[labels == 1, :]), labels[labels == 1]        
    else:
        matrix_minority, labels_minority = matrix[labels == 1, :], labels[labels == 1]
        matrix_majority, labels_majority = sparse.csr_matrix(matrix[labels == 0, :]), labels[labels == 0]
    
    index = np.random.randint(0, high=len(labels_minority),size=(majority_count - minority_count))
    extra_matrix_minority, extra_labels_minority = matrix_minority[index, :], labels_minority[index]
    matrix_minority = sparse.vstack([sparse.csr_matrix(matrix_minority), sparse.csr_matrix(extra_matrix_minority)])    
    labels_minority = np.concatenate((labels_minority, extra_labels_minority))
    return sparse.vstack([matrix_minority, matrix_majority]), np.concatenate((labels_minority, labels_majority))  

In [None]:
subreddits_matrix_train_all_upsampled, train_labels_upsampled = upsample_minority(subreddits_matrix_train_all, 
                                                                                  train_labels)
print subreddits_matrix_train_all.shape
print subreddits_matrix_train_all_upsampled.shape
print train_labels_upsampled.shape
print train_labels_upsampled.sum()

In [None]:
nb_upsampled = BernoulliNB()
nb_upsampled.fit(subreddits_matrix_train_all_upsampled, train_labels_upsampled)

test_pred_upsampled = nb_upsampled.predict(subreddits_matrix_test_all)
print(classification_report(test_labels, test_pred_upsampled))

Quite interestingly, using a NB on the subreddits matrix *alone* gives us approximately the same performance we got with tokenizing the texts. Hmm, looks promising. How about we try to make a classifier and see what happens?

In [None]:
train_data_text = train_all.request_text_edit_aware.values
train_labels = train_all.requester_received_pizza.values

test_data_text= test_all.request_text_edit_aware.values
test_labels = test_all.requester_received_pizza.values

vec_train = CountVectorizer()
tokenized_train_data = vec_train.fit_transform(train_data_text)

vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_)
tokenized_test_data = vec_test.fit_transform(test_data_text)

train_token_mat = tokenized_train_data.toarray()
test_token_mat = tokenized_test_data.toarray()

In [None]:
LR_train_plus = LogisticRegression()
LR_train_plus.fit(np.concatenate((train_token_mat, subreddits_matrix_train_all), axis=1), train_labels)

test_pred_plus = LR_train_plus.predict(np.concatenate((test_token_mat, subreddits_matrix_test_all), axis=1))
print classification_report(test_labels, test_pred_plus)

The results on test_data above looks like an overall improvement. Let's go ahead and make a submission

In [None]:
subreddits_kaggle_test_data = make_subreddits_matrix(kaggle_test_data.requester_subreddits_at_request.values,
                                                     all_unique_subreddits)

tokenized_test_data = vec_test.fit_transform(kaggle_test_data.request_text_edit_aware.values)
test_token_mat = tokenized_test_data.toarray()
test_token_mat = np.concatenate((test_token_mat, subreddits_kaggle_test_data), axis=1)

pred_test_all = LR_train_plus.predict(test_token_mat)

predictions = pd.DataFrame()
predictions['request_id'] = kaggle_test_data.request_id
#maxyan: sample submission was expecting 0 instead of False
predictions['requester_received_pizza'] = pred_test_all.astype(int)
predictions = predictions.set_index('request_id')

# make sure the length is as expected in https://www.kaggle.com/c/random-acts-of-pizza/submissions/attach
print len(predictions) == 1631
predictions.to_csv('max_nlp_plus_features_submission_v3.csv')

### This is the upsampled version

Mainly playing around with adding extra features.

In [23]:
from scipy import sparse
import scipy as sp

In [69]:
list(kaggle_test_data.columns)

[u'giver_username_if_known',
 u'request_id',
 u'request_text_edit_aware',
 u'request_title',
 u'requester_account_age_in_days_at_request',
 u'requester_days_since_first_post_on_raop_at_request',
 u'requester_number_of_comments_at_request',
 u'requester_number_of_comments_in_raop_at_request',
 u'requester_number_of_posts_at_request',
 u'requester_number_of_posts_on_raop_at_request',
 u'requester_number_of_subreddits_at_request',
 u'requester_subreddits_at_request',
 u'requester_upvotes_minus_downvotes_at_request',
 u'requester_upvotes_plus_downvotes_at_request',
 u'requester_username',
 u'unix_timestamp_of_request',
 u'unix_timestamp_of_request_utc',
 'hour_of_request',
 'length_of_title',
 'length_of_text',
 'day_of_week_at_request',
 'day_of_month_at_request',
 'request_text_revised']

In [85]:
# Upsampled version
# EXTRA_FEATURES=[]
# EXTRA_FEATURES = ['day_of_week_at_request', 'day_of_month_at_request']
EXTRA_FEATURES = ['requester_number_of_posts_on_raop_at_request', 
                  'requester_number_of_comments_in_raop_at_request',
                  'requester_upvotes_minus_downvotes_at_request',
                  'length_of_text',
                  'is_weekend',
                  'hour_of_request',
#                   'requester_account_age_in_days_at_request',
                 'month_pos']

train_matrix, train_labels, test_matrix, test_labels = prepare_data(train_all                                                                    
                                                                    , test_all
                                                                    , text_field='request_title'
                                                                    , extra_features=EXTRA_FEATURES)

print train_matrix.shape

# append subreddits features
# train_matrix = np.concatenate((train_matrix, subreddits_matrix_train_all), axis=1)
# test_matrix = np.concatenate((test_matrix, subreddits_matrix_test_all), axis=1)

train_matrix = sparse.hstack([sparse.csr_matrix(train_matrix), sparse.csr_matrix(subreddits_matrix_train_all.astype(int))])
test_matrix = sparse.hstack([sparse.csr_matrix(test_matrix), sparse.csr_matrix(subreddits_matrix_test_all.astype(int))])

train_matrix_upsampled, train_labels_upsampled = upsample_minority_sparse(train_matrix, train_labels, random_state=0)

LR = LogisticRegression()
LR.fit(train_matrix, train_labels)
test_pred_plus_upsampled = LR.predict(test_matrix)
print classification_report(test_labels, test_pred_plus_upsampled)


LR_upsampled = LogisticRegression()
LR_upsampled.fit(train_matrix_upsampled, train_labels_upsampled)
test_pred_plus_upsampled = LR_upsampled.predict(test_matrix)
print classification_report(test_labels, test_pred_plus_upsampled)



(3969, 4484)
             precision    recall  f1-score   support

      False       0.77      0.88      0.82      1278
       True       0.37      0.20      0.26       424

avg / total       0.67      0.72      0.68      1702

             precision    recall  f1-score   support

      False       0.79      0.74      0.76      1278
       True       0.35      0.42      0.38       424

avg / total       0.68      0.66      0.67      1702



In [86]:
subreddits_kaggle_test_data = make_subreddits_matrix(kaggle_test_data.requester_subreddits_at_request.values,
                                                     all_unique_subreddits)

_, _, test_token_mat, _ = prepare_data(train_all
                                       , kaggle_test_data
                                       , text_field='request_title'
                                       , extra_features=EXTRA_FEATURES)

# append subreddits features
# test_token_mat = np.concatenate((test_token_mat, subreddits_kaggle_test_data), axis=1)
test_token_mat = sparse.hstack([sparse.csr_matrix(test_token_mat), sparse.csr_matrix(subreddits_kaggle_test_data.astype(int))])

pred_test_all = LR_upsampled.predict(test_token_mat)

predictions = pd.DataFrame()
predictions['request_id'] = kaggle_test_data.request_id
#maxyan: sample submission was expecting 0 instead of False
predictions['requester_received_pizza'] = pred_test_all.astype(int)
predictions = predictions.set_index('request_id')

# make sure the length is as expected in https://www.kaggle.com/c/random-acts-of-pizza/submissions/attach
print len(predictions) == 1631
predictions.to_csv('max_submission_latest.csv')

True


**Some final words on my bit of exploration**

Throughout the exercise, I used the originally shuffled and *splitted* data for training and testing. I haven't bothered making any changes. I'm not sure if we can further improve the performance if we took the entire dataset for training.


Also, I have been using Ricardo's dataset (i.e. 'data/pizza_request_dataset.json') rather than the one provided by Kaggle. I think there are some differences but I haven't figured out what exactly these differences are. 

### Baseline

In [None]:
vec_train = CountVectorizer()
tokenized_train_data = vec_train.fit_transform(train_data_text)

vec_test = CountVectorizer(vocabulary=vec_train.vocabulary_)
tokenized_test_data = vec_test.fit_transform(test_data_text)

print tokenized_train_data.shape
print tokenized_test_data.shape

LR_train = LogisticRegression()
LR_train.fit(tokenized_train_data,train_labels)

test_pred = LR_train.predict(tokenized_test_data)

print(classification_report(test_labels, test_pred))

## Exploring other features

In [None]:
list(kaggle_test_data.columns.astype(str))

In [None]:
comapre_features(train_all, 
                 test_all, 
                 add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                   'requester_number_of_subreddits_at_request',
                                   'requester_upvotes_minus_downvotes_at_request',
                                   'length_of_text',
                                   'length_of_title',
                                   'hour_of_request'])

In [None]:
comapre_features(train_all, 
                 test_all, 
                 text_field='request_title',
                 add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                   'requester_number_of_subreddits_at_request',
                                   'requester_upvotes_minus_downvotes_at_request',
                                   'length_of_text',
                                   'length_of_title',
                                   'hour_of_request'])

## Make a classifier and predict

In [None]:
kaggle_test_mat

In [None]:
classifier, kaggle_test_mat = make_classifier(train_all, 
                                              kaggle_test_data, 
                                              add_features_list=['requester_number_of_posts_on_raop_at_request',                                    
                                                                 'requester_number_of_subreddits_at_request'])

In [None]:
classifier, kaggle_test_mat = make_classifier(train_all, 
                                              kaggle_test_data, 
                                              add_features_list=[])

In [None]:
pred_test_all = classifier.predict(kaggle_test_mat)

predictions = pd.DataFrame()
predictions['request_id'] = kaggle_test_data.request_id
#maxyan: sample submission was expecting 0 instead of False
predictions['requester_received_pizza'] = pred_test_all.astype(int)
predictions = predictions.set_index('request_id')

# make sure the length is as expected in https://www.kaggle.com/c/random-acts-of-pizza/submissions/attach
print len(predictions) == 1631
predictions.to_csv('max_nlp_plus_features_submission_v2.csv')