In [45]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
#import matplotlib.pyplot as plt
import json
import datetime
import os

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.mixture import GMM
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

# import nltk functions for manipulating text. textblob for sentiment analysis
import nltk
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
#from textblob import TextBlob

In [79]:
# Keyword groups identified in the research paper linked from the Kaggle page.
# Each list holds the words belonging to one topic.

money1 = [
    'week', 'ramen', 'paycheck', 'work', 'couple',
    'rice', 'check', 'pizza', 'grocery', 'rent',
    'anyone', 'favor', 'someone', 'bill', 'money',
]
money2 = [
    'food', 'house', 'rent', 'stamp', 'month', 'today',
    'parent', 'help', 'anything', 'mom', 'anyone',
]
job = [
    'job', 'year', 'interview', 'luck',
    'school', 'paycheck', 'unemployment', 'end',
]
student = [
    'student', 'college', 'final', 'loan', 'summer', 'university',
    'class', 'meal', 'semester', 'story', 'kid',
]
time_family = [
    'tonight', 'night', 'tomorrow', 'friday', 'dinner',
    'something', 'account', 'family', 'bank', 'home',
]
time = ['day', 'pay']
gratitude = [
    'thanks', 'advance', 'guy', 'reading',
    'place', 'everyone', 'craving', 'kind',
]

In [100]:
#os.environ["NLTK_DATA"] = "~/nltk_data"
# stopwords to filter with nltk
sw = stopwords.words('english')

# tokenize using this regex
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

# filter hyperlinks with this regex
links = re.compile(r'\bhttp\S+\b')

# lemmatize the tokens
wl = WordNetLemmatizer()
#sn = SnowballStemmer("english")

# lists for storing labels, request text, metadata for each request, and request ids
labels = []
text = []
num_data = []
ids = []
count = 0

# open this file for reading
f = open("train.json", "r")

try:
    data = json.load(f)
    
    for item in data:
        
        prop = 0
        adj = 0
        adv = 0
        # extract the outcome labels
        labels.append(int(item['requester_received_pizza']))
        
        # keep track of the request ids. useful for debugging
        ids.append(item['request_id'])
        
        # get the text of request
        words = unicode(item['request_text_edit_aware']) #item['request_title']#
        
        # remove links and replace with string "hyperlink." this prevents lots of spurious tokens
        words = links.sub(u'hyperlink', words)
        
        # tokenize and stem/lemmatize
        tokens = [t for t in tokenizer.tokenize(words) if t.lower() not in sw]
        tokens = [wl.lemmatize(t.lower()) for t in tokens]

        
        #tokens = [sn.stem(t) for t in tokens]
        # convert tokens back to string and store it
        text.append(' '.join(tokens))
        
        # get the title of request, tokenize, stem/lemmatize
        title_words = item['request_title']
        title_tokens = [t for t in tokenizer.tokenize(title_words) if t.lower() not in sw]
        title_tokens = [wl.lemmatize(t) for t in title_tokens]
        
        # extract metadata. store data for each request in num
        num = []
        num.append(item['requester_account_age_in_days_at_request']/100.0)
        num.append(item['requester_days_since_first_post_on_raop_at_request'])
        num.append(item['requester_number_of_comments_at_request']/100.0)
        num.append(item['requester_number_of_comments_in_raop_at_request'])
        num.append(item['requester_number_of_posts_at_request'])
        num.append(item['requester_number_of_subreddits_at_request'])
        num.append(item['requester_number_of_posts_on_raop_at_request'])
        num.append(item['requester_upvotes_minus_downvotes_at_request'])
        num.append(item['requester_upvotes_plus_downvotes_at_request'])
        
        # store the number of tokens in the request and title as additional features
        num.append(len(tokens))
        num.append(len(title_tokens))
        
        # extract one feature for each word in the keyword arrays
        # some arrays aren't helpful so we skip them
        for word in money1:
            num.append(int(word in tokens))
            
        for word in money2:
            num.append(int(word in tokens))
            
        for word in gratitude:
            num.append(int(word in tokens))
            
        for word in time_family:
            num.append(int(word in tokens))
        
        num.append(int('hyperlink' in tokens))
        
        # we can't use these features directly. they're not in the test data
        '''flair = item['requester_user_flair']
        num.append(item['number_of_downvotes_of_request_at_retrieval'])
        num.append(item['number_of_upvotes_of_request_at_retrieval'])
        num.append(item['request_number_of_comments_at_retrieval'])'''  
        
        # perform sentiment analysis (returns signed float). doesn't seem to affect accuracy
        '''blob = TextBlob(unicode(words))
        num.append(blob.sentiment.polarity)'''
        
        # calculate lexical diversity for text and title. doesn't seem to raise accuracy
        '''# store the lexical diversity of the request as a feature
        if len(tokens) > 0:
            num.append(1.0*len(set(tokens))/len(tokens))
        else:
            num.append(0)
        
        # store the lexical diversity of the title as a feature
        if len(title_tokens) > 0:
            num.append(1.0*len(set(title_tokens))/len(title_tokens))
        else:
            num.append(0)'''
        
        # extract features from timestamp. appears to decrease accuracy 
        # get the timestamp of request, convert to datetime, and save off the hour
        '''dt = -4 + int(datetime.datetime.fromtimestamp(item['unix_timestamp_of_request_utc']).strftime('%H'))
        # quick and dirty conversion to eastern daylight time
        if dt <= 0:
            dt += 24
        
        # if hour is in the morning
        if dt >= 3 and dt < 11:
            num.append(1)
            num.append(0)
            num.append(0)
        
        # if hour is in midday
        elif dt >= 11 and dt < 18:
            num.append(0)
            num.append(1)
            num.append(0)
        
        # if hour is at night
        else:
            num.append(0)
            num.append(0)
            num.append(1)'''
        #num.append(int(dt))
        '''for i in range(dt):
            num.append(0)
        num.append(1)
        for i in range(6-dt):
            num.append(0)'''
        '''if flair == None:
            num.append(0)
            num.append(0)
        elif flair == 'shroom':
            num.append(1)
            num.append(0)
        elif flair == 'PIF':
            num.append(0)
            num.append(1)'''
            
        # append the entire list to num_data, which stores data for all examples
        num_data.append(num)
        count += 1

except Exception, e:
    print "error reading from file: %s" %e
    f.close()

f.close()
print len(labels)

# convert python lists to numpy arrays
# _data contains the text data, and _nums contains the metadata
train_labels = np.array(labels[:3500])
train_data = np.array(text[:3500])
train_nums = np.array(num_data[:3500])
print train_nums.shape

dev_labels = np.array(labels[3500:])
dev_data = np.array(text[3500:])
dev_nums = np.array(num_data[3500:])

# normalize the numerical data
'''for i in range(11):
    max1 = np.mean(train_nums[:,i])
    max2 = np.mean(dev_nums[:,i])
    
    train_nums[:,i] = train_nums[:,i]/max1
    dev_nums[:,i] = dev_nums[:,i]/max2'''

4040
(3500, 56)


'for i in range(11):\n    max1 = np.mean(train_nums[:,i])\n    max2 = np.mean(dev_nums[:,i])\n    \n    train_nums[:,i] = train_nums[:,i]/max1\n    dev_nums[:,i] = dev_nums[:,i]/max2'

In [94]:
# scratchpad
#for i in range(58):
    #print np.mean(train_nums[:,i], axis=0)

print train_nums[:,7].mean()

print '*'
print dev_data[150], dev_labels[150]
print text[3650]

print dev_nums.shape, train_nums.shape
print np.max(dev_nums[:,1])
print train_nums.shape

1196.00028571
*
hyperlink 0
hyperlink
(540, 59) (3500, 59)
739.771006944
(3500, 59)


In [71]:
# skip this cell. its accuracy is not good.
def better_preprocessor(s):
    """Placeholder text preprocessor: does no work and always returns None.

    The argument `s` is accepted but never read. Two earlier attempts at an
    implementation are disabled and kept below as comments for reference.
    """
    # --- attempt 1 (disabled): regex-based cleanup ---
    ## remove links and replace with string "hyperlink"
    #links = re.compile(u'\\bhttp\w+\\b')
    ## remove non-alphanumerics
    #non_alpha = re.compile(u'[\W_]+', re.UNICODE)
    ## change all number sequences to single '0'
    #num = re.compile(u'[0-9]+', re.UNICODE)
    ## list of common English suffixes to remove
    #suff = re.compile(u'(al|ance|ence|dom|ed|er|or|ism|ist|ity|ty|ment|ship|sion|ing|s|ly|ation|tion|able|ible|ate|en|ify|fy|ize|ise|ful|ic|ical|ious|ous|ish|ive)\\b', re.UNICODE)
    ## list of common English prefixes to remove
    #pre = re.compile(u'\\b(an|ante|anti|auto|circum|co|com|con|contra|de|dis|en|ex|extra|hyper|in|im|il|ir|inter|intra|macro|micro|mid|mis|mono|non|over|post|pre|pro|re|semi|sub|super|trans|tri|un)', re.UNICODE)
    ## remove words with one or two characters only (treat as stopwords)
    #stop = re.compile(u'\\b([\w]{1,2})\\b', re.UNICODE)
    #
    ## apply filters
    #t = links.sub(u'hyperlink', s)
    #t = non_alpha.sub(u' ', t)
    #t = num.sub(u" 0 ", t)
    #t = suff.sub(u' ', t)
    #t = pre.sub(u' ', t)
    #t = stop.sub(u' ', t)

    # --- attempt 2 (disabled): tokenize and stem ---
    #tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    #words = np.array_str(s)
    #print type(words), words
    #tokens = tokenizer.tokenize(words)
    ##tokens = [t for t in tokenizer.tokenize(words) if t.lower() not in sw]
    ## Stem
    ##tokens = [wl.lemmatize(t) for t in tokens]
    #tokens = [sn.stem(t) for t in tokens]
    #return tokens
    return None

# vectorize text data and then train logistic regression
# this isn't very accurate (~30%)
def trainLM():
    vectorizer = CountVectorizer(analyzer="word", ngram_range=[1,2])
    X = vectorizer.fit_transform(train_data)
    Y = vectorizer.transform(dev_data)
    vocab = vectorizer.get_feature_names()
    
    print X.shape, Y.shape
    
    lm = LogisticRegression()
    lm.fit(X, train_labels)
    pred_labels = lm.predict(Y)
    
    score = metrics.f1_score(dev_labels, pred_labels)
    print "base f1 =", score
    
    maxind = np.fabs(lm.coef_[0]).argsort()[-5:]
    for item in maxind:
        print vocab[item], lm.coef_[0][i]
    
    # vectorize data using the better preprocessor
    better_vect = CountVectorizer(preprocessor=None)
    better_X = better_vect.fit_transform(train_data)
    better_Y = better_vect.transform(dev_data)
    vocab2 = better_vect.get_feature_names()
    
    # run logistic regression on this data and report f1 score
    better_lm = LogisticRegression()
    better_lm.fit(better_X, train_labels)
    better_labels = better_lm.predict(better_Y)
    print "Improved f1 =", metrics.f1_score(dev_labels, better_labels)
    
    maxind2 = better_lm.coef_[0].argsort()[-5:]
    for item in maxind2:
        print vocab2[item], lm.coef_[0][i]
    
trainLM()

(3500, 92200) (540, 92200)
base f1 = 0.214689265537
money food -0.0326457190941
pizza love -0.0326457190941
forward money -0.0326457190941
could make -0.0326457190941
surprise -0.0326457190941
Improved f1 = 0.226600985222
ranch -0.0326457190941
expected -0.0326457190941
surprise -0.0326457190941
relative -0.0326457190941
mentioned -0.0326457190941


In [101]:
# train a simple KNN model and check accuracy
# this uses the metadata
def trainKNN(k=1):
    # value for k from gridsearch output
    model = KNeighborsClassifier(n_neighbors=38)
    model.fit(train_nums, train_labels)
    pred_labels = model.predict(dev_nums)
    
    # calculate accuracy and identify the false positive and negative cases
    correct = (pred_labels == dev_labels)
    false_pos = (pred_labels > dev_labels)
    false_neg = (pred_labels < dev_labels)
    accuracy = 1.0 * np.sum(correct) / len(pred_labels)
    print "%.03f" %accuracy
    print "false +, false - : %d, %d" %(false_pos.sum(), false_neg.sum())
    
    # get the request ids for the dev set and print ids for false pos or neg cases
    np_ids = np.array(ids[3500:])
    print np_ids[false_pos]
    print pred_labels[false_pos]
    print dev_labels[false_pos]

# run gridsearch to find best k value
def findK():
    # range is from an iterative process to narrow down
    params = {'n_neighbors': range(38,56,1)}
    search = GridSearchCV(KNeighborsClassifier(), params)
    search.fit(train_nums, train_labels)
    
    print "the best parameter is k=%f\n" %search.best_params_['n_neighbors']
    print "summary of all params:\n", search.grid_scores_

# train a simple BernoulliNB model and check accuracy
# this uses the metadata
def trainBern():
    bern = BernoulliNB(alpha=0.0001)
    bern.fit(train_nums, train_labels)
    pred_labels = bern.predict(dev_nums)
    
    # display the accuracy
    correct = (pred_labels == dev_labels)
    accuracy = 1.0 * np.sum(correct) / len(dev_labels)
    print "accuracy for BernoulliNB is %.03f" %accuracy

# run gridsearch to find parameters for NB model
def bernParams():
    params = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
    search = GridSearchCV(BernoulliNB(), params)
    search.fit(train_nums, train_labels)
    
    print "the best parameter is alpha=%f\n" %search.best_params_['alpha']
    print "summary of all params:\n", search.grid_scores_

# train logistic regression using the metadata
def trainNumLM():
    lm = LogisticRegression()
    lm.fit(train_nums, train_labels)
    pred_labels = lm.predict(dev_nums)
    
    # print accuracy
    correct = (pred_labels == dev_labels)
    accuracy = 1.0 * np.sum(correct) / len(dev_labels)
    print "accuracy for LM is %.03f" %accuracy
    
    # get false positive and negative cases and print data for the cases
    false_pos = (pred_labels > dev_labels)
    false_neg = (pred_labels < dev_labels)
    print "false +, false - : %d, %d" %(false_pos.sum(), false_neg.sum())
    np_ids = np.array(ids[3500:])
    print np_ids[false_neg]
    print lm.coef_[0]

def trainSVM():
    clf = SVC()
    clf.fit(train_nums, train_labels)
    pred_labels = clf.predict(dev_nums)
    
    # print accuracy
    correct = (pred_labels == dev_labels)
    accuracy = 1.0 * np.sum(correct) / len(dev_labels)
    print "accuracy for SVM is %.03f" %accuracy
    
def trainRF():
    clf = RandomForestClassifier(max_depth=5, n_estimators=10)
    clf.fit(train_nums, train_labels)
    pred_labels = clf.predict(dev_nums)
    
    # print accuracy
    correct = (pred_labels == dev_labels)
    accuracy = 1.0 * np.sum(correct) / len(dev_labels)
    print "accuracy for random forest is %.03f" %accuracy
    
# Run the experiments defined above. Only the KNN run and the metadata
# logistic regression are enabled; uncomment a line to run that model or
# grid search as well.
trainKNN(38)
#findK()
#trainBern()
#bernParams()
trainNumLM()
#trainSVM()
#trainRF()

0.757
false +, false - : 0, 131
[]
[]
[]
accuracy for LM is 0.765
false +, false - : 9, 118
[u't3_uvoyu' u't3_1exnem' u't3_lcs2n' u't3_q21uq' u't3_132u9h' u't3_w4fdd'
 u't3_i3ms5' u't3_12p0tt' u't3_zpn24' u't3_1mjbbd' u't3_lcqaj' u't3_ibua5'
 u't3_qnocf' u't3_w5491' u't3_1ik128' u't3_w8mfc' u't3_ijcon' u't3_u69ow'
 u't3_hutzt' u't3_1chlic' u't3_ur5yf' u't3_z7mrn' u't3_1ihhx4' u't3_j8fnb'
 u't3_hucws' u't3_ojq68' u't3_12zfh3' u't3_sp8xg' u't3_pk4g1' u't3_ktlwy'
 u't3_1lojow' u't3_1dr9hu' u't3_j94no' u't3_ihorg' u't3_y9vxk' u't3_11lvl7'
 u't3_1l5cvv' u't3_qqotp' u't3_j6433' u't3_icpji' u't3_l9g0n' u't3_ikszl'
 u't3_1enrol' u't3_idifo' u't3_xtx62' u't3_1ifnr4' u't3_ilau5' u't3_uotiq'
 u't3_1jlnc4' u't3_iekd3' u't3_19gv1n' u't3_o8aso' u't3_o4fyx' u't3_m7to4'
 u't3_wgk03' u't3_l0vt2' u't3_18zn0d' u't3_17vccr' u't3_kfxsm' u't3_1j485g'
 u't3_kwqew' u't3_ibvsg' u't3_1jgj2w' u't3_js9be' u't3_ll3uk' u't3_182nt8'
 u't3_iqm6n' u't3_l7opc' u't3_tpezd' u't3_xd34k' u't3_1ich4u' u't3_152l21'
 u't3_row

In [7]:
# try using GMMs
# For each component count from 1 to 10, fit one Gaussian mixture per class on
# the training metadata and label each dev example by whichever mixture gives
# it the higher score.
for gmm_comp in range(1,11):
    # mixture fitted only on the training rows whose label is 1
    gm1 = GMM(n_components=gmm_comp, covariance_type='full')
    gm1.fit(train_nums[train_labels == 1, :])

    # mixture fitted only on the training rows whose label is 0
    gm2 = GMM(n_components=gmm_comp, covariance_type='full')
    gm2.fit(train_nums[train_labels == 0, :])

    # predict positive when first GMM returns higher score. compare this to dev_labels
    # NOTE(review): this relies on score() returning one value per dev row so the
    # comparison is elementwise -- confirm against the installed sklearn version
    correct = (gm1.score(dev_nums) > gm2.score(dev_nums)) == dev_labels

    # calculate accuracy
    print "Accuracy for %d components: %f%%" %(gmm_comp, 100.0 * correct.sum() / correct.shape[0])

Accuracy for 1 components: 70.555556%
Accuracy for 2 components: 70.185185%
Accuracy for 3 components: 69.444444%
Accuracy for 4 components: 68.703704%
Accuracy for 5 components: 71.851852%
Accuracy for 6 components: 63.148148%
Accuracy for 7 components: 71.111111%
Accuracy for 8 components: 72.962963%
Accuracy for 9 components: 71.111111%
Accuracy for 10 components: 68.148148%


[0 0 0 0 0]
