In [9]:
%matplotlib inline
from __future__ import division

import numpy as np
import pandas as pd
import json
import csv
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

# Ensemble methods
from sklearn.ensemble import RandomForestClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

from scipy.stats import uniform

import pydot

In [10]:
# Load json dataset into pandas
train = pd.read_json('train.json', convert_dates=['unix_timestamp_of_request'])
test = pd.read_json('test.json', convert_dates=['unix_timestamp_of_request'])

In [11]:
# There are more columns in the train dataset than there are in the test dataset.  Remove the features
# that aren't available in the test dataset because we won't be able to use them in the model
def remove_train_features():
    data_cols = set(train.columns.values)
    test_cols = set(test.columns.values)
    trim_data_cols = data_cols.intersection(test_cols)
    return list(trim_data_cols)

trim_data_cols = remove_train_features()
trim_data_cols.append('requester_received_pizza') 
train = train[trim_data_cols]

In [12]:
import re
def get_word_counts(df, word_list):
    entry_count = []
    for entry in df.all_text:
        word_dict = dict((x,0) for x in word_list)
        for word in re.findall(r"\w+", entry):
            if word in word_dict:
                word_dict[word] += 1
        entry_count.append(sum(word_dict.values()))
    return pd.Series(entry_count)

In [13]:
# Topics and keywords from paper
job = ['work', 'paycheck', 'unemployment', 'interview', 'fired', 'unemployment', 'hire', 'hired']
money = ['money', 'now', 'broke', 'week', 'until', 'time', 'last', 'day', 'when', 'today', 'tonight', 'paid', 'next', 'first', 'night', 'after', 'tomorrow', 'month', 'while', 'account', 'before', 'long', 'Friday', 'rent', 'buy', 'bank', 'still', 'bills', 'ago', 'cash', 'due', 'past', 'never', 'paycheck', 'check', 'spent', 'years', 'poor', 'till', 'yesterday', 'morning', 'dollars', 'financial', 'hour', 'bill', 'evening', 'credit', 'budget', 'loan', 'bucks', 'deposit', 'dollar', 'current', 'payed']
student = ['college', 'student', 'school', 'roommate', 'studying', 'semester', 'university', 'finals', 'study', 'class', 'project', 'dorm', 'tuition']
family = ['family', 'mom', 'wife', 'wifey', 'parents', 'mother', 'husband', 'dad', 'son', 'daughter', 'father', 'parent', 'mom', 'baby', 'born', 'newborn', 'kid', 'children', 'child']
craving = ['friend', 'girlfriend', 'craving', 'birthday', 'boyfriend', 'celebrate', 'party', 'game', 'games', 'movie', 'date', 'drunk', 'beer', 'celebrating', 'invited', 'drinks', 'crave', 'wasted', 'invite']
gratitude = ['pay', 'forward', 'thank', 'thanks', 'blessed', 'bless', 'grateful', 'reciprocate', 'back', 'teach']
food = ['vegetable', 'vegetables', 'ramen', 'oatmeal']
pos_words = ['jpg', 'trade', 'blue', 'use', 'make', 'pretty', 'forward', 'friday', 'boyfriend', 'll', 'scarf']

In [14]:
# Feature creation
def encode_features(df, fte):
    '''create dummy variables for categorical data'''
    for feature in fte:
        df = df.join(pd.get_dummies(df[feature], prefix=feature))
    return df

def preprocess(df):
    df['karma_cat_2'] = np.where(df.requester_upvotes_minus_downvotes_at_request <= 10, 0, 1)
    df['all_text'] = [df.request_title.iloc[[i]].tolist()[0] + ' ' + df.request_text_edit_aware.iloc[[i]].tolist()[0]
                    for i in range(df.shape[0])]
    df['job_c'] = get_word_counts(df, job)
    df['money_c'] = get_word_counts(df, money)
    df['student_c'] = get_word_counts(df, student)
    df['family_c'] = get_word_counts(df, family)
    df['craving_c'] = get_word_counts(df, craving)
    df['gratitude_c'] = get_word_counts(df, gratitude)
    df['food_c'] = get_word_counts(df, food)
    df['pos_words'] = get_word_counts(df, pos_words)
    df['exclamation_c'] = map(lambda x: len(re.findall(r'(!)|(:\))', x)), df['all_text'])
    df['post_length'] = df['request_text_edit_aware'].map(lambda x: len(x))
    df['request_title_length'] = df['request_title'].map(lambda x: len(x))
    df['link'] = map(lambda x: 1 if x.find('http') > 0 else 0, df['request_text_edit_aware'])
    df['img'] = map(lambda x: 1 if x.find('img') > 0 else 0, df['request_text_edit_aware'])
    df['weekday'] = map(lambda x: x.weekday(), df['unix_timestamp_of_request'])
    df['month'] = map(lambda x: x.month, df['unix_timestamp_of_request'])
    df['hour'] = map(lambda x: x.hour, df['unix_timestamp_of_request'])
    df['afternoon'] = map(lambda x: 1 if (x >= 13 and x <= 19) else 0, df['hour'])
    df['link_or_img'] = map(lambda x: 1 if x.find('img') > 0 or x.find('http') > 0 else 0, df['request_text_edit_aware'])
    df['time_c'] = map(lambda x: 1 if x >= 1342561000 else 0, df['unix_timestamp_of_request_utc'])

    features_to_encode = ['weekday', 'hour', 'month']
    encode_features(df, features_to_encode)
    
    return df

In [22]:
train_preprocess = preprocess(train)
test_preprocess = preprocess(test)

In [15]:
features = ['karma_cat_2', 'afternoon', 'link', 'img', 'gratitude_c',
                   'post_length', 'requester_number_of_posts_on_raop_at_request',
                   'job_c', 'money_c', 'family_c', 'student_c', 'craving_c', 'request_title_length',
                  'requester_number_of_comments_in_raop_at_request', 'time_c']

train_pre = preprocess(train)
test_pre = preprocess(test)

train_pre = train_pre[features]
test_pre = test_pre[features]

scaler = scalar = preprocessing.StandardScaler()
scaler.fit(train_pre.values)

train_sc = scalar.transform(train_pre.values)

np.random.seed(0)
shuffle = np.random.permutation(np.arange(train.shape[0]))

# Pull out text features
train_length = 3230
data_mat, labels = train_sc[shuffle], train.requester_received_pizza.values[shuffle]

# shuffle training data for error analysis
data_shuffle = pd.DataFrame(train, index=shuffle)

train_data, train_labels = data_mat[:train_length], labels[:train_length]
dev_data, dev_labels = data_mat[train_length:], labels[train_length:]

test_data = scalar.transform(test_pre.values)

In [16]:
def ada_boost():
    
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=290)
    scores = cross_val_score(clf, train_data, train_labels)
    print scores
    clf.fit(train_data, train_labels)
    pred = clf.predict(dev_data)
    print clf.score(dev_data, dev_labels)
    print classification_report(pred, dev_labels)

    cm = confusion_matrix(dev_labels, pred)
    print(cm)
    
    for i in range(len(pred)):
        if pred[i] == 0 and dev_labels[i] == 1:
            #print data_shuffle.all_text.iloc[[i]].tolist()[0]
            pass
    return clf

    
ada = ada_boost()

[ 0.75487465  0.75116063  0.7527881 ]
0.77037037037
             precision    recall  f1-score   support

      False       0.97      0.78      0.86       752
       True       0.18      0.64      0.28        58

avg / total       0.91      0.77      0.82       810

[[587  21]
 [165  37]]


In [43]:
test_preds = map(lambda x: 1 if x == True else 0, ada.predict(test_data))
request_ids = test['request_id'].values

In [19]:
with open('submission.csv', 'wb') as csvfile:
    outputwriter = csv.writer(csvfile, delimiter=',',quotechar='|', quoting=csv.QUOTE_MINIMAL)
    outputwriter.writerow(['request_id'] + ['requester_received_pizza'])
    
    for i in range(len(request_ids)):
        outputwriter.writerow([request_ids[i], str(test_preds[i])])
