In [2]:
%matplotlib inline
from __future__ import division

import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

# Ensemble methods
from sklearn.ensemble import RandomForestClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import pydot

In [3]:
# Load json dataset into pandas
data = pd.read_json('train.json')
test = pd.read_json('test.json')

In [4]:
data['requester_received_pizza'] = data['requester_received_pizza'].map({False: 0, True:1}).astype(int)
# post_was_edited has bad data - includes unix timestamps. need to convert to 1s
data['post_was_edited'] = data['post_was_edited'].map(lambda x: 1 if x > 0 else 0)
labels = data['requester_received_pizza']
del data['requester_received_pizza']

In [5]:
def remove_train_features():
    data_cols = set(data.columns.values)
    test_cols = set(test.columns.values)
    trim_data_cols = data_cols.intersection(test_cols)
    return list(trim_data_cols)

trim_data_cols = remove_train_features()
data = data[trim_data_cols]

In [6]:
def choose_features():
    # include columns from loop above...add other logic to easily adjust features, run against other models
    # potential list ['unix_timestamp_of_request', 'unix_timestamp_of_request_utc']
    dtypes = data.columns.to_series().groupby(data.dtypes).groups
    cols = dtypes[np.dtype('int64')] + dtypes[np.dtype('float64')]
    # potentials to exclude:
    # ['unix_timestamp_of_request', 'unix_timestamp_of_request_utc']
    cols_to_exclude = ['unix_timestamp_of_request', 'unix_timestamp_of_request_utc']
    s_cols = set(cols)
    s_cols.difference_update(cols_to_exclude)
    return list(s_cols)

m_features = choose_features()

In [7]:
print data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4040 entries, 0 to 4039
Data columns (total 17 columns):
request_title                                         4040 non-null object
unix_timestamp_of_request                             4040 non-null int64
giver_username_if_known                               4040 non-null object
requester_days_since_first_post_on_raop_at_request    4040 non-null float64
requester_subreddits_at_request                       4040 non-null object
requester_account_age_in_days_at_request              4040 non-null float64
requester_number_of_posts_on_raop_at_request          4040 non-null int64
requester_upvotes_plus_downvotes_at_request           4040 non-null int64
request_text_edit_aware                               4040 non-null object
request_id                                            4040 non-null object
requester_number_of_comments_at_request               4040 non-null int64
requester_number_of_posts_at_request                  4040 non-null in

In [8]:
print data.describe()

       unix_timestamp_of_request  \
count               4.040000e+03   
mean                1.342829e+09   
std                 2.333057e+07   
min                 1.297723e+09   
25%                 1.320469e+09   
50%                 1.342565e+09   
75%                 1.364618e+09   
max                 1.381552e+09   

       requester_days_since_first_post_on_raop_at_request  \
count                                        4040.000000    
mean                                           16.417034    
std                                            70.651428    
min                                             0.000000    
25%                                             0.000000    
50%                                             0.000000    
75%                                             0.000000    
max                                           785.457685    

       requester_account_age_in_days_at_request  \
count                               4040.000000   
mean                   

In [9]:
x = pd.cut(data.requester_upvotes_minus_downvotes_at_request, 2, labels=["low", "high"])
pd.crosstab(x, labels)

requester_received_pizza,0,1
requester_upvotes_minus_downvotes_at_request,Unnamed: 1_level_1,Unnamed: 2_level_1
low,3045,993
high,1,1


In [20]:
# requester_upvotes_minus_downvotes_at_request
x = np.where(data.requester_upvotes_minus_downvotes_at_request <= 0, 0, 
         np.where((data.requester_upvotes_minus_downvotes_at_request > 0) & 
                  (data.requester_upvotes_minus_downvotes_at_request <=100), 1, 2))
x_1 = np.where(data.requester_upvotes_minus_downvotes_at_request <= 0, 0, 1)
pd.crosstab(x_1, labels).apply(lambda r: r/r.sum(), axis=1)

plt.figure()
data.requester_upvotes_minus_downvotes_at_request.plot(kind='bar')
plt.axhline(0, color='k')
#pd.crosstab(x_1, labels)

AttributeError: 'numpy.ndarray' object has no attribute 'plot'

<matplotlib.figure.Figure at 0x115935c50>

In [18]:
# requester_number_of_posts_at_request 
req = np.where(data.requester_number_of_posts_at_request == 0, 0, 1)
pd.crosstab(req, labels).apply(lambda r: r/r.sum(), axis=1)

requester_received_pizza,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.807179,0.192821
1,0.731527,0.268473


In [17]:
p = data.requester_number_of_posts_on_raop_at_request
pd.crosstab(p, labels).apply(lambda r: r/r.sum(), axis=1)

requester_received_pizza,0,1
requester_number_of_posts_on_raop_at_request,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.765593,0.234407
1,0.582192,0.417808
2,0.411765,0.588235
3,0.125,0.875
4,0.0,1.0
5,0.0,1.0


In [22]:
df = pd.DataFrame(x_1, columns=['requester_upvotes_minus_downvotes_at_request'])
df['requester_number_of_posts_at_request'] = req

In [12]:
np.random.seed(0)
shuffle = np.random.permutation(np.arange(data.shape[0]))
train_length = 3230

feat_train_data, feat_train_labels = df.values, labels.values
feat_train_data, feat_train_labels = feat_train_data[shuffle], feat_train_labels[shuffle]

train_data, train_labels = feat_train_data[:train_length], feat_train_labels[:train_length]
dev_data, dev_labels = feat_train_data[train_length:], feat_train_labels[train_length:]

In [19]:
def get_mnb(alphas):
    mnb = BernoulliNB()
    mnb = GridSearchCV(mnb, alphas)
    mnb.fit(train_data, train_labels)
    for i in mnb.grid_scores_:
        print i
    print mnb.best_score_

    
alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
get_mnb(alphas)

mean: 0.75480, std: 0.00011, params: {'alpha': 0.0}
mean: 0.75480, std: 0.00011, params: {'alpha': 0.0001}
mean: 0.75480, std: 0.00011, params: {'alpha': 0.001}
mean: 0.75480, std: 0.00011, params: {'alpha': 0.01}
mean: 0.75480, std: 0.00011, params: {'alpha': 0.1}
mean: 0.75480, std: 0.00011, params: {'alpha': 0.5}
mean: 0.75480, std: 0.00011, params: {'alpha': 1.0}
mean: 0.75480, std: 0.00011, params: {'alpha': 2.0}
mean: 0.75480, std: 0.00011, params: {'alpha': 10.0}
0.75479876161


In [17]:
# Binarize all the data


      requester_upvotes_minus_downvotes_at_request  \
0                                                0   
1                                                1   
2                                                0   
3                                                1   
4                                                1   
5                                                1   
6                                                0   
7                                                1   
8                                                1   
9                                                1   
10                                               1   
11                                               0   
12                                               0   
13                                               1   
14                                               1   
15                                               1   
16                                               1   
17                          