In [182]:
from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import codecs
import json
import itertools
import csv
import math

def read_dataset(path):
    """Parse the UTF-8 encoded JSON document stored at ``path``.

    path: location of the JSON file on disk.
    Returns the decoded Python object (whatever the JSON text describes).
    """
    with codecs.open(path, 'r', 'utf-8') as json_file:
        # json.load reads the whole stream and decodes it in one step.
        return json.load(json_file)

# Load the raw request records and print a quick overview of what they contain.
path = './pizza_request_dataset.json'
dataset = read_dataset(path)

first_post = dataset[0]
print('The dataset contains %d samples.' % len(dataset))
# Two spaces after the colon reproduce the separator the comma-form print emitted.
print('Available attributes:  %s' % (sorted(first_post.keys()),))
print('First post:')
print(json.dumps(first_post, sort_keys=True, indent=2))

# Share of requests that were fulfilled, expressed as a percentage.
successes = [r['requester_received_pizza'] for r in dataset]
success_rate = sum(successes) * 100.0 / len(successes)
print('The average success rate is: %.2f%%' % success_rate)


The dataset contains 5671 samples.
Available attributes:  [u'giver_username_if_known', u'in_test_set', u'number_of_downvotes_of_request_at_retrieval', u'number_of_upvotes_of_request_at_retrieval', u'post_was_edited', u'request_id', u'request_number_of_comments_at_retrieval', u'request_text', u'request_text_edit_aware', u'request_title', u'requester_account_age_in_days_at_request', u'requester_account_age_in_days_at_retrieval', u'requester_days_since_first_post_on_raop_at_request', u'requester_days_since_first_post_on_raop_at_retrieval', u'requester_number_of_comments_at_request', u'requester_number_of_comments_at_retrieval', u'requester_number_of_comments_in_raop_at_request', u'requester_number_of_comments_in_raop_at_retrieval', u'requester_number_of_posts_at_request', u'requester_number_of_posts_at_retrieval', u'requester_number_of_posts_on_raop_at_request', u'requester_number_of_posts_on_raop_at_retrieval', u'requester_number_of_subreddits_at_request', u'requester_received_pizza', u'

In [183]:
# Randomly partition the records into train / cross-validation / test sets.
def create_data_sets(dataset, seed=None):
    """Shuffle ``dataset`` and split it 60% / 20% / 20% into train, cv and test.

    Parameters
    ----------
    dataset : sequence of dict
        Records; each one is read with ``.get('requester_received_pizza')``
        to obtain its label.
    seed : int or None, optional
        When given, the shuffle is drawn from ``np.random.RandomState(seed)``
        so the split is reproducible. When ``None`` (the default) the global
        NumPy random state is used, exactly as before.

    Returns
    -------
    tuple
        ``(train, train_labels, cv, cv_labels, test, test_labels)``. The three
        record collections are lists; the three label collections are float
        NumPy arrays aligned element-by-element with them.
    """
    total = len(dataset)
    if seed is None:
        order = np.random.permutation(np.arange(total))
    else:
        order = np.random.RandomState(seed).permutation(np.arange(total))

    # Train and cv sizes are rounded; test takes whatever remains, so the
    # three parts together always cover every record exactly once.
    train_end = int(round(total * 0.6))
    cv_end = train_end + int(round(total * 0.2))

    train = [dataset[i] for i in order[:train_end]]
    cv = [dataset[i] for i in order[train_end:cv_end]]
    test = [dataset[i] for i in order[cv_end:]]

    def _labels(records):
        # dtype=float stores the boolean outcome as 0.0 / 1.0.
        return np.array([rec.get('requester_received_pizza') for rec in records],
                        dtype=float)

    return train, _labels(train), cv, _labels(cv), test, _labels(test)

# Build the train / cv / test partitions (and their label arrays) that the
# following cells read as module-level names.
train, train_labels, cv, cv_labels, test, test_labels = create_data_sets(dataset)

In [184]:
def find_optim_model_textual(total, train, train_labels, cv, cv_labels, test, test_labels, feature,
                             force_model="LR"):
    """Tune LR / Bernoulli NB / KNN on bag-of-words counts of one text feature.

    The vocabulary is learned from the training texts only; cv and test texts
    are projected onto that same vocabulary. Each model family is scored on
    the cv split for every value in a fixed grid, and the best setting per
    family is remembered independently. The selected model is then refitted
    and classification reports for cv and test are printed.

    Parameters
    ----------
    total : sequence of dict
        Accepted for backward compatibility; it is not read.
    train, cv, test : sequence of dict
        Records for each split; ``record.get(feature)`` supplies the text.
    train_labels, cv_labels, test_labels : array-like
        Labels aligned with the corresponding split.
    feature : str
        Key of the text attribute to vectorize.
    force_model : {"LR", "BNB", "KNN", None}, optional
        Model family to report and return regardless of the cv ranking.
        Defaults to ``"LR"``, which is what the previous hard-coded override
        did. Pass ``None`` to use the family with the highest cv score.

    Returns
    -------
    tuple
        ``(vocabulary, best_setting, model_name, train_dense, cv_dense,
        test_dense)``. ``best_setting`` is ``C`` for LR, ``alpha`` for BNB and
        the integer ``n_neighbors`` for KNN.

    Raises
    ------
    ValueError
        If ``force_model`` is not one of the accepted values.
    """
    model_order = ("LR", "BNB", "KNN")
    if force_model is not None and force_model not in model_order:
        raise ValueError("force_model must be None, 'LR', 'BNB' or 'KNN'")

    def _texts(records, labels):
        # Same pairing as before: one text per label, in order.
        return [records[i].get(feature) for i in range(len(labels))]

    def _setting(kind, grid_value):
        # KNN needs an integer neighbour count; derive it from the grid value
        # with the same scale-and-round-up rule used previously.
        if kind == "KNN":
            return int(math.ceil(grid_value * 2))
        return grid_value

    def _build(kind, setting):
        if kind == "LR":
            return LogisticRegression(C=setting)
        if kind == "BNB":
            return BernoulliNB(alpha=setting)
        return KNeighborsClassifier(n_neighbors=setting)

    # Learn the vocabulary on train only, then reuse the fitted vectorizer so
    # all three matrices share identical columns.
    vec_total = CountVectorizer()
    tokenized_train_data = vec_total.fit_transform(_texts(train, train_labels))
    tokenized_cv_data = vec_total.transform(_texts(cv, cv_labels))
    tokenized_test_data = vec_total.transform(_texts(test, test_labels))

    C_val = [0.001, 0.01, 0.1, 0.2, 0.4, 0.8, 1, 2, 4, 8]

    # Track the best score/setting per family so a forced family still gets
    # its own optimum rather than a value left over from a shared maximum.
    best_score = dict((kind, -1.0) for kind in model_order)
    best_setting = dict((kind, None) for kind in model_order)
    tried = dict((kind, set()) for kind in model_order)

    for grid_value in C_val:
        for kind in model_order:
            setting = _setting(kind, grid_value)
            if setting in tried[kind]:
                # Several grid values collapse to the same n_neighbors.
                continue
            tried[kind].add(setting)

            candidate = _build(kind, setting)
            candidate.fit(tokenized_train_data, train_labels)
            cv_score = candidate.score(tokenized_cv_data, cv_labels)

            if cv_score > best_score[kind]:
                best_score[kind] = cv_score
                best_setting[kind] = setting

    # Ties favour the earlier family in model_order (LR, then BNB, then KNN).
    best = model_order[0]
    for kind in model_order[1:]:
        if best_score[kind] > best_score[best]:
            best = kind
    if force_model is not None:
        best = force_model

    print(best + " is the best \n")
    chosen_model = _build(best, best_setting[best])
    chosen_model.fit(tokenized_train_data, train_labels)

    cv_pred = chosen_model.predict(tokenized_cv_data)
    print("The CV results are:\n")
    print(classification_report(cv_labels, cv_pred))

    # Always score the test split with the model that was just fitted.
    test_pred = chosen_model.predict(tokenized_test_data)
    print("The test results are:\n")
    print(classification_report(test_labels, test_pred))

    return (vec_total.vocabulary_, best_setting[best], best,
            tokenized_train_data.todense(), tokenized_cv_data.todense(),
            tokenized_test_data.todense())
    

# Vectorize the request body and keep the vocabulary, tuned setting and the
# dense train / cv / test matrices for the cells that follow.
feature = 'request_text_edit_aware'
print(feature + "\n")
(vocab, opt_val, best,
 train_data_text, cv_data_text, test_data_text) = find_optim_model_textual(
    dataset, train, train_labels, cv, cv_labels, test, test_labels, feature)

# Other text attributes tried earlier with the same call (left disabled):
#   'requester_subreddits_at_request'
#   'request_title'
# To re-enable one, set `feature` and repeat the call above; note that the
# function returns six values, so all six must be unpacked.


request_text_edit_aware

LR is the best 

The CV results are:

             precision    recall  f1-score   support

        0.0       0.74      1.00      0.85       838
        1.0       0.86      0.02      0.04       296

avg / total       0.77      0.74      0.64      1134

The test results are:

             precision    recall  f1-score   support

        0.0       0.74      0.99      0.85       842
        1.0       0.30      0.01      0.02       292

avg / total       0.63      0.74      0.64      1134



In [185]:
# Baseline: logistic regression with scikit-learn's default settings, fitted
# on the dense bag-of-words training matrix and scored on the test matrix.
LR_train = LogisticRegression().fit(train_data_text, train_labels)
test_pred = LR_train.predict(test_data_text)
print(classification_report(test_labels, test_pred))


             precision    recall  f1-score   support

        0.0       0.76      0.85      0.80       842
        1.0       0.34      0.22      0.27       292

avg / total       0.65      0.69      0.66      1134



In [186]:
def evaluate_new_feature(train_data, train_labels, train_mat, cv_data, cv_labels, cv_mat, test_data,
                         test_labels, test_mat, feature, opt_val, modeltype):
    """Append one record attribute as a new column, refit, and report cv/test.

    Parameters
    ----------
    train_data, cv_data, test_data : sequence of dict
        Records for each split; ``record.get(feature)`` supplies the value
        appended for that row.
    train_labels, cv_labels, test_labels : array-like
        Labels aligned with the corresponding split.
    train_mat, cv_mat, test_mat : 2-D array-like
        Existing feature matrices; the new column is appended on the right.
    feature : str
        Key of the attribute to append.
    opt_val : number
        ``C`` for LR, ``alpha`` for BNB, ``n_neighbors`` for KNN.
    modeltype : str
        ``"LR"`` or ``"BNB"``; any other value selects KNN.

    Returns
    -------
    tuple
        ``(fitted_model, train_mat_tmp, cv_mat_tmp, test_mat_tmp)`` where the
        matrices are the inputs with the new column appended.
    """
    def _column(records, labels):
        # One value per label, in order, as in the original row pairing.
        return [records[i].get(feature) for i in range(len(labels))]

    train_mat_tmp = np.c_[train_mat, _column(train_data, train_labels)]
    cv_mat_tmp = np.c_[cv_mat, _column(cv_data, cv_labels)]
    test_mat_tmp = np.c_[test_mat, _column(test_data, test_labels)]

    if modeltype == "LR":
        label = "LR"
        model = LogisticRegression(C=opt_val)
    elif modeltype == "BNB":
        label = "BNB"
        model = BernoulliNB(alpha=opt_val)
    else:
        label = "KNN"
        model = KNeighborsClassifier(n_neighbors=opt_val)

    model.fit(train_mat_tmp, train_labels)

    # Both reports come from the model fitted above. Previously the BNB and
    # KNN branches predicted with variables belonging to other branches.
    cv_pred = model.predict(cv_mat_tmp)
    print("For " + label + " cv :\n")
    print(classification_report(cv_labels, cv_pred))

    test_pred = model.predict(test_mat_tmp)
    print("For " + label + " test :\n")
    print(classification_report(test_labels, test_pred))

    return model, train_mat_tmp, cv_mat_tmp, test_mat_tmp


# Temporary override: use a fixed LR regularisation strength instead of the
# value returned by find_optim_model_textual.
opt_val = 1

# Each step appends one numeric requester attribute to the matrices produced
# by the previous step, refits LR and prints cv / test reports. The column
# order here must match the order used when building the Kaggle test matrix.

feature = 'requester_account_age_in_days_at_request'
print(feature + "\n")

model, train_data_plus, cv_data_plus, test_data_plus = evaluate_new_feature(
    train, train_labels, train_data_text, cv, cv_labels, cv_data_text,
    test, test_labels, test_data_text, feature, opt_val, "LR")

feature = 'requester_days_since_first_post_on_raop_at_request'
print(feature + "\n")

model, train_data_plus_2, cv_data_plus_2, test_data_plus_2 = evaluate_new_feature(
    train, train_labels, train_data_plus, cv, cv_labels, cv_data_plus,
    test, test_labels, test_data_plus, feature, opt_val, "LR")

feature = 'requester_number_of_comments_at_request'
print(feature + "\n")

model, train_data_plus_3, cv_data_plus_3, test_data_plus_3 = evaluate_new_feature(
    train, train_labels, train_data_plus_2, cv, cv_labels, cv_data_plus_2,
    test, test_labels, test_data_plus_2, feature, opt_val, "LR")

feature = 'requester_number_of_comments_in_raop_at_request'
print(feature + "\n")

model, train_data_plus_4, cv_data_plus_4, test_data_plus_4 = evaluate_new_feature(
    train, train_labels, train_data_plus_3, cv, cv_labels, cv_data_plus_3,
    test, test_labels, test_data_plus_3, feature, opt_val, "LR")

feature = 'requester_number_of_posts_at_request'
print(feature + "\n")

model, train_data_plus_5, cv_data_plus_5, test_data_plus_5 = evaluate_new_feature(
    train, train_labels, train_data_plus_4, cv, cv_labels, cv_data_plus_4,
    test, test_labels, test_data_plus_4, feature, opt_val, "LR")

# Sixth column is the number of posts on RAOP. This step used to re-add
# 'requester_number_of_comments_in_raop_at_request' (already the fourth
# column), which left the trained model's columns out of step with the
# prediction cells, where the sixth column is posts_on_raop.
feature = 'requester_number_of_posts_on_raop_at_request'
print(feature + "\n")

model, train_data_plus_6, cv_data_plus_6, test_data_plus_6 = evaluate_new_feature(
    train, train_labels, train_data_plus_5, cv, cv_labels, cv_data_plus_5,
    test, test_labels, test_data_plus_5, feature, opt_val, "LR")

feature = 'requester_number_of_subreddits_at_request'
print(feature + "\n")

model, train_data_plus_7, cv_data_plus_7, test_data_plus_7 = evaluate_new_feature(
    train, train_labels, train_data_plus_6, cv, cv_labels, cv_data_plus_6,
    test, test_labels, test_data_plus_6, feature, opt_val, "LR")


feature = 'requester_upvotes_minus_downvotes_at_request'
print(feature + "\n")

model, train_data_plus_8, cv_data_plus_8, test_data_plus_8 = evaluate_new_feature(
    train, train_labels, train_data_plus_7, cv, cv_labels, cv_data_plus_7,
    test, test_labels, test_data_plus_7, feature, opt_val, "LR")




requester_account_age_in_days_at_request

For LR cv :

             precision    recall  f1-score   support

        0.0       0.76      0.86      0.81       838
        1.0       0.38      0.24      0.29       296

avg / total       0.66      0.70      0.67      1134

For LR test :

             precision    recall  f1-score   support

        0.0       0.76      0.86      0.80       842
        1.0       0.34      0.22      0.26       292

avg / total       0.65      0.69      0.67      1134

requester_days_since_first_post_on_raop_at_request

For LR cv :

             precision    recall  f1-score   support

        0.0       0.76      0.86      0.81       838
        1.0       0.37      0.24      0.29       296

avg / total       0.66      0.70      0.67      1134

For LR test :

             precision    recall  f1-score   support

        0.0       0.76      0.85      0.80       842
        1.0       0.34      0.23      0.27       292

avg / total       0.65      0.69      0.67  

In [192]:
#create upsample for the positive case

def upsample_50_50(dataset, train_labels, cv_labels, test_labels, seed=None):
    """Return a copy of ``dataset`` with positives re-sampled up to parity.

    Randomly chosen positive records (those whose
    ``'requester_received_pizza'`` equals 1) are appended, with replacement,
    until the positive count equals the negative count. The input is not
    modified.

    Parameters
    ----------
    dataset : sequence of dict
        All records; the appended duplicates are drawn from here.
    train_labels, cv_labels, test_labels : iterable of numbers
        Label collections whose sums together give the current number of
        positives in ``dataset``.
    seed : int or None, optional
        When given, draws come from ``np.random.RandomState(seed)`` so the
        result is reproducible. ``None`` (default) uses the global NumPy
        random state.

    Returns
    -------
    list
        The original records followed by the appended positive duplicates.
        If the positives already match or outnumber the negatives, or there
        is no positive record to draw from, this is a plain copy.
    """
    upsampled_dataset = list(dataset)

    minority_count = sum(train_labels) + sum(cv_labels) + sum(test_labels)
    # Goal is one positive per negative, i.e. (len - positives) positives.
    # Computed by subtraction so no division by the positive count is needed.
    shortfall = int(np.ceil((len(dataset) - minority_count) - minority_count))

    positives = [rec for rec in dataset if rec.get('requester_received_pizza') == 1]
    if shortfall <= 0 or not positives:
        # Nothing to add, or nothing to draw from (this case previously
        # divided by zero or could loop forever).
        return upsampled_dataset

    rng = np.random if seed is None else np.random.RandomState(seed)
    # Draw directly among positives instead of rejecting negative draws.
    for pick in rng.randint(0, len(positives), size=shortfall):
        upsampled_dataset.append(positives[pick])

    return upsampled_dataset
                      
# NOTE(review): the positives are duplicated BEFORE the data is split, so
# copies of the same request can end up in train and in cv / test at once.
# Scores printed by this cell are therefore likely optimistic; upsampling only
# the training split would avoid that.
upsampled_dataset = upsample_50_50(dataset, train_labels, cv_labels, test_labels)

print(len(upsampled_dataset))

train_up, train_up_labels, cv_up, cv_up_labels, test_up, test_up_labels = create_data_sets(upsampled_dataset)


print(len(train_up_labels))
print(len(cv_up_labels))
print(len(test_up_labels))

feature = 'request_text_edit_aware'
print(feature + "\n")
vocab_up, opt_val_up, best_up, train_data_up_text, cv_data_up_text, test_data_up_text = find_optim_model_textual(
    upsampled_dataset, train_up, train_up_labels, cv_up, cv_up_labels,
    test_up, test_up_labels, feature)

# Temporary override: fixed LR regularisation strength (opt_val_up is unused).
opt_val = 1

# Append the numeric requester attributes one at a time. The column order must
# match the order used when building the Kaggle test matrix.

feature = 'requester_account_age_in_days_at_request'
print(feature + "\n")

model_up, train_up_data_plus, cv_up_data_plus, test_up_data_plus = evaluate_new_feature(
    train_up, train_up_labels, train_data_up_text,
    cv_up, cv_up_labels, cv_data_up_text,
    test_up, test_up_labels, test_data_up_text, feature, opt_val, "LR")

feature = 'requester_days_since_first_post_on_raop_at_request'
print(feature + "\n")

model_up, train_up_data_plus_2, cv_up_data_plus_2, test_up_data_plus_2 = evaluate_new_feature(
    train_up, train_up_labels, train_up_data_plus,
    cv_up, cv_up_labels, cv_up_data_plus,
    test_up, test_up_labels, test_up_data_plus, feature, opt_val, "LR")

feature = 'requester_number_of_comments_at_request'
print(feature + "\n")

model_up, train_up_data_plus_3, cv_up_data_plus_3, test_up_data_plus_3 = evaluate_new_feature(
    train_up, train_up_labels, train_up_data_plus_2,
    cv_up, cv_up_labels, cv_up_data_plus_2,
    test_up, test_up_labels, test_up_data_plus_2, feature, opt_val, "LR")


feature = 'requester_number_of_comments_in_raop_at_request'
print(feature + "\n")

model_up, train_up_data_plus_4, cv_up_data_plus_4, test_up_data_plus_4 = evaluate_new_feature(
    train_up, train_up_labels, train_up_data_plus_3,
    cv_up, cv_up_labels, cv_up_data_plus_3,
    test_up, test_up_labels, test_up_data_plus_3, feature, opt_val, "LR")


feature = 'requester_number_of_posts_at_request'
print(feature + "\n")

model_up, train_up_data_plus_5, cv_up_data_plus_5, test_up_data_plus_5 = evaluate_new_feature(
    train_up, train_up_labels, train_up_data_plus_4,
    cv_up, cv_up_labels, cv_up_data_plus_4,
    test_up, test_up_labels, test_up_data_plus_4, feature, opt_val, "LR")


# Sixth column is the number of posts on RAOP. This step used to re-add
# 'requester_number_of_comments_in_raop_at_request' (already the fourth
# column), which left model_up's columns out of step with the prediction
# cell, where the sixth column is posts_on_raop.
feature = 'requester_number_of_posts_on_raop_at_request'
print(feature + "\n")

model_up, train_up_data_plus_6, cv_up_data_plus_6, test_up_data_plus_6 = evaluate_new_feature(
    train_up, train_up_labels, train_up_data_plus_5,
    cv_up, cv_up_labels, cv_up_data_plus_5,
    test_up, test_up_labels, test_up_data_plus_5, feature, opt_val, "LR")


feature = 'requester_number_of_subreddits_at_request'
print(feature + "\n")

model_up, train_up_data_plus_7, cv_up_data_plus_7, test_up_data_plus_7 = evaluate_new_feature(
    train_up, train_up_labels, train_up_data_plus_6,
    cv_up, cv_up_labels, cv_up_data_plus_6,
    test_up, test_up_labels, test_up_data_plus_6, feature, opt_val, "LR")



feature = 'requester_upvotes_minus_downvotes_at_request'
print(feature + "\n")

model_up, train_up_data_plus_8, cv_up_data_plus_8, test_up_data_plus_8 = evaluate_new_feature(
    train_up, train_up_labels, train_up_data_plus_7,
    cv_up, cv_up_labels, cv_up_data_plus_7,
    test_up, test_up_labels, test_up_data_plus_7, feature, opt_val, "LR")



8548
5129
1710
1709
request_text_edit_aware

LR is the best 

The CV results are:

             precision    recall  f1-score   support

        0.0       0.84      0.73      0.78       847
        1.0       0.76      0.86      0.81       863

avg / total       0.80      0.80      0.79      1710

The test results are:

             precision    recall  f1-score   support

        0.0       0.84      0.72      0.77       866
        1.0       0.75      0.86      0.80       843

avg / total       0.80      0.79      0.79      1709

requester_account_age_in_days_at_request

For LR cv :

             precision    recall  f1-score   support

        0.0       0.81      0.73      0.77       847
        1.0       0.76      0.84      0.80       863

avg / total       0.79      0.78      0.78      1710

For LR test :

             precision    recall  f1-score   support

        0.0       0.81      0.72      0.76       866
        1.0       0.74      0.83      0.78       843

avg / total       

In [193]:
# Create predictions for Kaggle using the model trained on the upsampled data.
# Reads the competition test file, rebuilds the same feature layout used for
# training (bag-of-words counts followed by eight numeric columns), predicts
# with model_up and writes a CSV indexed by request_id.
# NOTE(review): test_data_text is rebound here to a pandas column; earlier in
# the notebook the same name held the dense test matrix returned by
# find_optim_model_textual, so re-running earlier cells after this one will
# see a different object.
test_data = pd.read_json('data/test.json')
test_data_text = test_data.request_text_edit_aware
                 
# The vocabulary is fixed to vocab_up, so the columns line up with the matrix
# model_up was fitted on.
vec_submit = CountVectorizer(vocabulary=vocab_up)
tokenized_test_data = vec_submit.fit_transform(test_data_text)

# Append the numeric attributes one column at a time; the order here has to
# match the order in which they were appended when model_up was trained.
tokenized_test_data_1 = np.c_[tokenized_test_data.todense(),test_data.requester_account_age_in_days_at_request]
tokenized_test_data_2 = np.c_[tokenized_test_data_1,test_data.requester_days_since_first_post_on_raop_at_request]
tokenized_test_data_3 = np.c_[tokenized_test_data_2,test_data.requester_number_of_comments_at_request]
tokenized_test_data_4 = np.c_[tokenized_test_data_3,test_data.requester_number_of_comments_in_raop_at_request]
tokenized_test_data_5 = np.c_[tokenized_test_data_4,test_data.requester_number_of_posts_at_request]
tokenized_test_data_6 = np.c_[tokenized_test_data_5,test_data.requester_number_of_posts_on_raop_at_request]
tokenized_test_data_7 = np.c_[tokenized_test_data_6,test_data.requester_number_of_subreddits_at_request]
tokenized_test_data_8 = np.c_[tokenized_test_data_7,test_data.requester_upvotes_minus_downvotes_at_request]

test_pred = model_up.predict(tokenized_test_data_8)

# Submission frame: one row per request_id with the predicted label.
predictions = pd.DataFrame()
predictions['request_id'] = test_data.request_id
predictions['requester_received_pizza'] = test_pred
predictions = predictions.set_index('request_id')

predictions.to_csv('NLP_submission_v4_upsample_allfeatures.csv')

In [None]:
# Create predictions for Kaggle using `model`, the LR fitted on the original
# (non-upsampled) split with all eight numeric columns appended.
test_data = pd.read_json('data/test.json')
test_data_text = test_data.request_text_edit_aware

# `model` was trained on matrices whose text columns follow `vocab` (the
# vocabulary learned from the non-upsampled training split), so the test texts
# must be tokenized with that same vocabulary. Using vocab_up here would give
# a different set and order of columns than the model expects.
vec_submit = CountVectorizer(vocabulary=vocab)
tokenized_test_data = vec_submit.fit_transform(test_data_text)

# Append the numeric attributes one column at a time; the order has to match
# the order in which they were appended when `model` was trained.
tokenized_test_data_1 = np.c_[tokenized_test_data.todense(),test_data.requester_account_age_in_days_at_request]
tokenized_test_data_2 = np.c_[tokenized_test_data_1,test_data.requester_days_since_first_post_on_raop_at_request]
tokenized_test_data_3 = np.c_[tokenized_test_data_2,test_data.requester_number_of_comments_at_request]
tokenized_test_data_4 = np.c_[tokenized_test_data_3,test_data.requester_number_of_comments_in_raop_at_request]
tokenized_test_data_5 = np.c_[tokenized_test_data_4,test_data.requester_number_of_posts_at_request]
tokenized_test_data_6 = np.c_[tokenized_test_data_5,test_data.requester_number_of_posts_on_raop_at_request]
tokenized_test_data_7 = np.c_[tokenized_test_data_6,test_data.requester_number_of_subreddits_at_request]
tokenized_test_data_8 = np.c_[tokenized_test_data_7,test_data.requester_upvotes_minus_downvotes_at_request]

test_pred = model.predict(tokenized_test_data_8)

# Submission frame: one row per request_id with the predicted label.
predictions = pd.DataFrame()
predictions['request_id'] = test_data.request_id
predictions['requester_received_pizza'] = test_pred
predictions = predictions.set_index('request_id')

# Written to its own file so it does not overwrite the submission produced
# from the upsampled model, and so the name does not claim upsampling.
predictions.to_csv('NLP_submission_v4_allfeatures.csv')