In [1]:
# General libraries.
import re
import numpy as np
#import matplotlib.pyplot as plt
import json
import datetime
import os

# SK-learn libraries for learning.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# import nltk functions for manipulating text. textblob for sentiment analysis
import nltk
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [116]:
# keywords to extract as features
highprob = [u'kindle', u'randomacts', u'kindle', u'randomacts', u'surviving', u'stretch', u'electronics', u'electronics', u'stopped', u'heat', u'total', u'checks', u'landlord', u'cover', u'RandomActsOfChristmas', u'RandomActsOfChristmas']

# stopwords to filter with nltk
sw = stopwords.words('english')

# tokenize using this regex
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

# filter hyperlinks with this regex
links = re.compile(r'\bhttp\S+\b')

# lemmatize the tokens
wl = WordNetLemmatizer()

# lists for storing labels, request text, metadata for each request, and request ids
labels = []
text = []
num_data = []
ids = []
count = 0

# open this file for reading
f = open("train.json", "r")

try:
    data = json.load(f)
    
    for item in data:
        
        # extract the outcome labels
        labels.append(int(item['requester_received_pizza']))
        
        # keep track of the request ids. useful for debugging
        ids.append(item['request_id'])
        
        # get the text of request
        words = unicode(item['request_text_edit_aware'])
        
        # remove links and replace with string "hyperlink." this prevents lots of spurious tokens
        words = links.sub(u'hyperlink', words)
        
        # tokenize and lemmatize
        tokens = [t for t in tokenizer.tokenize(words) if t.lower() not in sw]
        tokens = [wl.lemmatize(t.lower()) for t in tokens]

        # convert tokens back to string and store it
        text.append(' '.join(tokens))
        
        # get the title of request, tokenize, stem/lemmatize
        title_words = unicode(item['request_title'])
        title_tokens = [t for t in tokenizer.tokenize(title_words) if t.lower() not in sw]
        title_tokens = [wl.lemmatize(t.lower()) for t in title_tokens]
        
        # combine tokens from text and title
        tokens += title_tokens
        
        # extract metadata. store data for each request in num
        num = []
        num.append(item['requester_account_age_in_days_at_request']/100.0)
        num.append(item['requester_days_since_first_post_on_raop_at_request'])
        num.append(item['requester_number_of_comments_at_request']/100.0)
        num.append(int(item['requester_number_of_comments_in_raop_at_request'] > 0))
        num.append(item['requester_number_of_posts_at_request'])
        num.append(item['requester_number_of_subreddits_at_request'])
        num.append(int(item['requester_number_of_posts_on_raop_at_request'] > 0))
        num.append(item['requester_upvotes_minus_downvotes_at_request'])
        num.append(item['requester_upvotes_plus_downvotes_at_request'])
        
        # store the number of tokens in the request and title as additional features
        num.append(len(tokens))
        num.append(len(title_tokens))
        
        # extract one feature for each word in the keyword array
        for word in highprob:
            num.append(int(word in tokens))
        
        # manually check for jpg in the request. ugly, but we already filtered urls
        if 'jpg' in item['request_text_edit_aware']:
            num.append(1)
        else:
            num.append(0)
        
        #num.append(int('hyperlink' in tokens))
       
        # get the day of the month request occurred. set feature to 1 if in first half of month
        # additional experimentation shows that this is the best day to select
        dt = int(datetime.datetime.fromtimestamp(item['unix_timestamp_of_request_utc']).strftime('%d'))
        num.append(int(dt <= 15))
            
        # append the entire list to num_data, which stores data for all examples
        num_data.append(num)
        count += 1

except Exception, e:
    print "error reading from file: %s" %e
    f.close()

f.close()
print len(labels)

# convert python lists to numpy arrays
# _data contains the text data, and _nums contains the metadata
train_labels = np.array(labels[:3500])
train_data = np.array(text[:3500])
train_nums = np.array(num_data[:3500])
print train_nums.shape

dev_labels = np.array(labels[3500:])
dev_data = np.array(text[3500:])
dev_nums = np.array(num_data[3500:])

4040
(3500, 36)


In [112]:
# train a simple KNN model and check accuracy
# this uses the metadata
def trainKNN(k=1):
    # value for k from gridsearch output
    model = KNeighborsClassifier(n_neighbors=38)
    model.fit(train_nums, train_labels)
    pred_labels = model.predict(dev_nums)
    
    # calculate accuracy and identify the false positive and negative cases
    correct = (pred_labels == dev_labels)
    false_pos = (pred_labels > dev_labels)
    false_neg = (pred_labels < dev_labels)
    accuracy = 1.0 * np.sum(correct) / len(pred_labels)
    print "%.03f" %accuracy
    print "false +, false - : %d, %d" %(false_pos.sum(), false_neg.sum())
    
    # get the request ids for the dev set and print ids for false pos or neg cases
    np_ids = np.array(ids[3500:])
    print np_ids[false_pos]
    print pred_labels[false_pos]
    print dev_labels[false_pos]

# run gridsearch to find best k value
def findK():
    # range is from an iterative process to narrow down
    params = {'n_neighbors': range(38,56,1)}
    search = GridSearchCV(KNeighborsClassifier(), params)
    search.fit(train_nums, train_labels)
    
    print "the best parameter is k=%f\n" %search.best_params_['n_neighbors']
    print "summary of all params:\n", search.grid_scores_

# train a simple BernoulliNB model and check accuracy
# this uses the metadata
def trainBern():
    bern = BernoulliNB(alpha=20.0, binarize=0.9)
    bern.fit(train_nums, train_labels)
    pred_labels = bern.predict(dev_nums)
    
    # display the accuracy
    correct = (pred_labels == dev_labels)
    accuracy = 1.0 * np.sum(correct) / len(dev_labels)
    print "accuracy for BernoulliNB is %.03f" %accuracy

# run gridsearch to find parameters for NB model
def bernParams():
    #params = {'binarize': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}
    #params = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 20.0, 50.0]}
    search = GridSearchCV(BernoulliNB(binarize=0.9), params)
    search.fit(train_nums, train_labels)
    
    print "the best parameter is alpha=%f\n" %search.best_params_['alpha']
    print "summary of all params:\n", search.grid_scores_

# train logistic regression using the metadata
def trainNumLM():
    lm = LogisticRegression()
    lm.fit(train_nums, train_labels)
    pred_labels = lm.predict(dev_nums)
    
    # print accuracy
    correct = (pred_labels == dev_labels)
    accuracy = 1.0 * np.sum(correct) / len(dev_labels)
    print "accuracy for LM is %.03f" %accuracy
    
    # get false positive and negative cases and print data for the cases
    false_pos = (pred_labels > dev_labels)
    false_neg = (pred_labels < dev_labels)
    print "false +, false - : %d, %d" %(false_pos.sum(), false_neg.sum())
    np_ids = np.array(ids[3500:])
    print np_ids[false_neg]
    #print lm.coef_[0]  

    
trainKNN(49)
#findK()
trainBern()
#bernParams()
trainNumLM()

0.757
false +, false - : 0, 131
[]
[]
[]
accuracy for BernoulliNB is 0.733
accuracy for LM is 0.761
false +, false - : 6, 123
[u't3_uvoyu' u't3_1exnem' u't3_lcs2n' u't3_q21uq' u't3_132u9h' u't3_w4fdd'
 u't3_i3ms5' u't3_12p0tt' u't3_zpn24' u't3_1mjbbd' u't3_lcqaj' u't3_ibua5'
 u't3_qnocf' u't3_w5491' u't3_1ik128' u't3_w8mfc' u't3_ijcon' u't3_u69ow'
 u't3_hutzt' u't3_1chlic' u't3_ur5yf' u't3_z7mrn' u't3_1ihhx4' u't3_j8fnb'
 u't3_hucws' u't3_ojq68' u't3_12zfh3' u't3_sp8xg' u't3_pk4g1' u't3_ktlwy'
 u't3_1lojow' u't3_1dr9hu' u't3_j94no' u't3_ihorg' u't3_y9vxk' u't3_11lvl7'
 u't3_1l5cvv' u't3_qqotp' u't3_j6433' u't3_icpji' u't3_l9g0n' u't3_ikszl'
 u't3_1enrol' u't3_idifo' u't3_xtx62' u't3_1ifnr4' u't3_ilau5' u't3_uotiq'
 u't3_1jlnc4' u't3_iekd3' u't3_19gv1n' u't3_o8aso' u't3_o4fyx' u't3_m7to4'
 u't3_wgk03' u't3_18zn0d' u't3_xeser' u't3_17vccr' u't3_kfxsm' u't3_1j485g'
 u't3_kwqew' u't3_ibvsg' u't3_1f530o' u't3_1jgj2w' u't3_js9be' u't3_ll3uk'
 u't3_jtkzo' u't3_182nt8' u't3_iqm6n' u't3_l7opc' 

In [118]:
# now import the test data

# lists for storing labels, request text, metadata for each request, and request ids
t_text = []
t_data = []
test_id = []
count = 0

# open this file for reading
f = open("test.json", "r")

try:
    data = json.load(f)
    
    for item in data:
        
        test_id.append(item['request_id'])
        
        # get the text of request
        words = unicode(item['request_text_edit_aware'])
        
        # remove links and replace with string "hyperlink." this prevents lots of spurious tokens
        words = links.sub(u'hyperlink', words)
        
        # tokenize and lemmatize
        tokens = [t for t in tokenizer.tokenize(words) if t.lower() not in sw]
        tokens = [wl.lemmatize(t.lower()) for t in tokens]

        # convert tokens back to string and store it
        t_text.append(' '.join(tokens))
        
        # get the title of request, tokenize, stem/lemmatize
        title_words = unicode(item['request_title'])
        title_tokens = [t for t in tokenizer.tokenize(title_words) if t.lower() not in sw]
        title_tokens = [wl.lemmatize(t.lower()) for t in title_tokens]
        
        # combine tokens from text and title
        tokens += title_tokens
        
        # extract metadata. store data for each request in num
        num = []
        num.append(item['requester_account_age_in_days_at_request']/100.0)
        num.append(item['requester_days_since_first_post_on_raop_at_request'])
        num.append(item['requester_number_of_comments_at_request']/100.0)
        num.append(int(item['requester_number_of_comments_in_raop_at_request'] > 0))
        num.append(item['requester_number_of_posts_at_request'])
        num.append(item['requester_number_of_subreddits_at_request'])
        num.append(int(item['requester_number_of_posts_on_raop_at_request'] > 0))
        num.append(item['requester_upvotes_minus_downvotes_at_request'])
        num.append(item['requester_upvotes_plus_downvotes_at_request'])
        
        # store the number of tokens in the request and title as additional features
        num.append(len(tokens))
        num.append(len(title_tokens))
        
        # extract one feature for each word in the keyword array
        for word in highprob:
            num.append(int(word in tokens))
        
        # manually check for jpg in the request. ugly, but we already filtered urls
        if 'jpg' in item['request_text_edit_aware']:
            num.append(1)
        else:
            num.append(0)
       
        #num.append(int('hyperlink' in tokens))
        
        # get the day of the month request occurred. set feature to 1 if in first half of month
        # additional experimentation shows that this is the best day to select
        dt = int(datetime.datetime.fromtimestamp(item['unix_timestamp_of_request_utc']).strftime('%d'))
        num.append(int(dt <= 15))
            
        # append the entire list to num_data, which stores data for all examples
        t_data.append(num)
        count += 1

except Exception, e:
    print "error reading from file: %s" %e
    f.close()

f.close()

# convert python lists to numpy arrays
# _data contains the text data, and _nums contains the metadata
test_text = np.array(t_text)
test_data = np.array(t_data)
print test_data.shape

(1631, 36)


In [119]:
# run models on the test data and export to csv file for submission

# store the full training data in numpy array
full_labels = np.array(labels)
full_train = np.array(num_data)

print full_train.shape

'''# run the best model on the test data
bern = BernoulliNB(alpha=20.0, binarize=0.9)
bern.fit(full_train, full_labels)
#pred_labels = bern.predict(test_data)
pred_labels = bern.predict_proba(test_data)'''

lm = LogisticRegression()
lm.fit(full_train, full_labels)
#pred_labels = lm.predict(test_data)
pred_labels = lm.predict_proba(test_data)

# eric's code to write to csv
if len(test_id) == pred_labels.shape[0]:
    print "Writing to file"
    outfile = open("output.csv",'w')
    outfile.write("request_id,requester_received_pizza\n")
    for i in range(pred_labels.shape[0]):
        outfile.write(test_id[i]+','+str(pred_labels[i][1])+'\n')
    outfile.close()
else:
    print "Prediction dimension mismatch"

(4040, 36)
Writing to file
