In [2]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import model_selection

import codecs
import json
import os

In [5]:
# Notebook-wide switches and settings.
IMPORT = True  # when true, the load cell reads the raw JSON dataset from DATA_DIR
EXPORT = True  # when true, the export cell writes the splits out as CSV files
DISPLAY = True  # not referenced in the visible cells -- TODO confirm it is still needed
VERBOSE = True  # when true, the load cell prints a short summary of the dataset
DATA_DIR = "data"  # directory holding the input JSON and receiving the exported CSVs
SEED = 221  # passed as random_state to the train_test_split calls

In [6]:
def read_dataset(path):
    """Load a JSON document from a UTF-8 encoded file.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded top-level JSON value (e.g. a list of dicts).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The built-in open() supersedes the legacy codecs.open() wrapper, and
    # json.load parses directly from the file handle.
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

# Load the raw pizza-request dataset and, when VERBOSE is set, print a summary.
if IMPORT:
    path = os.path.join(DATA_DIR, 'pizza_request_dataset/pizza_request_dataset.json')
    dataset = read_dataset(path)

    if VERBOSE:
        print('The dataset contains %d samples.' % len(dataset))
        print('Available attributes: ', sorted(dataset[0]))
        print('First post:')
        print(json.dumps(dataset[0], sort_keys=True, indent=2))

        # Percentage of requests whose 'requester_received_pizza' flag is true.
        successes = [request['requester_received_pizza'] for request in dataset]
        success_rate = 100.0 * sum(successes) / len(successes)
        print('The average success rate is: %.2f%%' % success_rate)

The dataset contains 5671 samples.
Available attributes:  ['giver_username_if_known', 'in_test_set', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_r

In [4]:
# Build a DataFrame keyed by request id. verify_integrity=True makes
# set_index raise if the same request_id appears more than once.
data = pd.DataFrame(dataset).set_index("request_id", verify_integrity=True)
data.head()

Unnamed: 0_level_0,giver_username_if_known,in_test_set,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
request_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
t3_w5491,,False,2,6,False,7,"I'm not in College, or a starving artist or an...","I'm not in College, or a starving artist or an...","[Request] Ontario, Canada - On my 3rd of 5 day...",14.416875,...,True,"[AdviceAnimals, WTF, funny, gaming, movies, te...",32,212,48,610,shroom,RitalinYourMemory,1341605000.0,1341601000.0
t3_qysgy,,False,2,6,True,8,Hello! It's been a hard 2 months with money an...,Hello! It's been a hard 2 months with money an...,"[REQUEST] Southern Arizona, Tucson Hungry Family",11.95706,...,True,"[aww, pics]",5,21,13,57,shroom,blubirdhvn,1331868000.0,1331865000.0
t3_if0ed,,False,1,4,True,1,I'm sure there are needier people on this subr...,I'm sure there are needier people on this subr...,[Request] Pizza for a broke college student,454.388461,...,True,"[AskReddit, DoesAnybodyElse, IAmA, Marijuana, ...",1359,2110,2423,3456,shroom,taterpot,1309622000.0,1309619000.0
t3_jr3w1,,False,2,13,False,3,I've been unemployed going on three months now...,I've been unemployed going on three months now...,[Request] Unemployed and Sick of Rice (Suffolk...,141.715625,...,True,"[AdviceAnimals, Art, AskReddit, BookCollecting...",1205,4889,2403,8245,shroom,or_me_bender,1314060000.0,1314056000.0
t3_1d18tc,,False,1,4,1.36685e+09,5,I ran out of money on my meal card a while bac...,I ran out of money on my meal card a while bac...,"[Request] Ohio USA, broke student musician in ...",161.699155,...,True,"[TrueAtheism, atheism, funny]",81,86,225,232,shroom,m_chamberlin,1366840000.0,1366836000.0


In [5]:
# Row count before cleaning.
print(len(data))

# Drop requests with an empty body and strip carriage returns from both text
# columns. .loc + .assign builds a fresh frame, so the column updates are not
# written onto a filtered slice (the pattern behind SettingWithCopyWarning).
data = data.loc[~(data['request_text'] == "")].assign(
    request_text=lambda df: df.request_text.str.replace("\r", "", regex=False),
    request_text_edit_aware=lambda df: df.request_text_edit_aware.str.replace(
        "\r", "", regex=False),
)

# Row count after cleaning.
print(len(data))

5671
5513


In [10]:
# Distinct values found in the giver column, in order of first appearance.
givers = data["giver_username_if_known"].unique()
givers

array(['N/A', 'thisfreaknguy', 'omatrisha', 'bridgetonarnia',
       'ghotisgirl06', 'dirtypaws', 'iamaredditer', 'wensul', 'jski5711',
       'instant_japanese', 'whubbard', 'mclwlm', 'perezdev',
       'halftheman1', 'starfireliz', 'tuckednip', 'fogdelune',
       'overripebanana', 'coldheat', 'daner54', 'wchill', 'clarle',
       'wingsalone', 'foki', 'ltshrink', 'mkeanon', 'b4tty0n3',
       'logicday', 'synth3tk', 'womg', '_kill-fx_', 'idtugyourboat',
       'mr_jeep', 'dontlikeclowns', 'harrisonfire',
       'glinda_da_good_witch', 'jabberworx', 'clearmoon247', 'ynoty3k',
       'i_like_owls', 'pizzamom', 'moomoocowninja', '1st_account_i_swear',
       'osk213', 'shrapnull', '0mggames', 'minivansareevil',
       'anarchyreigns', 'jesses_girl', 'in2itiveact', 'whosthatguru_v2',
       'govalle', 'danilee88', 'abrham_smith', 'tdralston94', 'raxxarn',
       'tanyagal2', 'bigbadjew', 'flamingcow', 'cdjpurple', 'omnomagon',
       'erinpierce', 'zandyman', 'fivestarsoul', 'psm321', '

In [9]:
# Requests whose author also shows up among the giver usernames.
data.loc[data["requester_username"].isin(givers)]

Unnamed: 0_level_0,giver_username_if_known,in_test_set,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
request_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
t3_16ecwy,jeremybeadle,False,1,6,False,0,Our family has been going through a rough time...,Our family has been going through a rough time...,[Request] Had a rough time lately and could us...,362.23463,...,True,"[AdviceAnimals, AskReddit, RandomActsOfChristm...",1080,10028,3804,16922,PIF,kimi21,1357937000.0,1357937000.0
t3_1c796y,thegeorge,False,7,9,False,11,"sorry, i am a reddit noob - trying to remember...","sorry, i am a reddit noob - trying to remember...",[Request] U.K. can anyone help a british broth...,155.399942,...,True,"[AdviceAtheists, AskReddit, Graffiti, IAmA, br...",85,2215,111,3009,PIF,chalt0,1365772000.0,1365769000.0
t3_1idsgl,iamaslacker246,False,2,10,False,2,We're completely broke and craving pizza. I th...,We're completely broke and craving pizza. I th...,"[Request] Spokane WA We're broke, I'm hormonal...",211.213854,...,True,"[AHealthierMe, AdviceAnimals, Assistance, Chea...",2534,2762,3036,3286,PIF,duckydoom,1373943000.0,1373939000.0
t3_zwf27,,False,1,2,False,0,I'm a 21 year old Boston college student and m...,I'm a 21 year old Boston college student and m...,[Request] Hungry and Broke College Girl,249.715822,...,True,"[AdviceAnimals, AskReddit, Cloud, Dreams, IAmA...",2340,2366,4746,4780,PIF,jme413,1347666000.0,1347662000.0
t3_187fqj,,False,2,10,False,0,My very first post on Reddit was to give a piz...,My very first post on Reddit was to give a piz...,"[Request] Maine - Given pizzas twice, asking f...",251.00559,...,False,"[AskReddit, Baking, EmersonCollege, Fitness, R...",81,186,119,254,,deus_ex_eagles,1360439000.0,1360439000.0
t3_tv7zq,amarkson,True,0,2,False,9,"I have given a couple times on here, and it's ...","I have given a couple times on here, and it's ...",[Request] TN ~ want to celebrate son finishing...,128.095347,...,True,"[AskReddit, AskWomen, BeardPorn, HIFW, Parenti...",327,550,507,802,PIF,ithinkimdoingthis,1337469000.0,1337466000.0


# Make splits while maintaining class balance

In [6]:
# Presumably the train / validation / test proportions -- TODO confirm.
# NOTE(review): the visible split cells hard-code test_size=0.2 and 0.25
# instead of reading this list.
splits = [0.6, 0.2, 0.2]

In [7]:
# Hold out 20% of the requests as the test set, stratified on the label so
# the success rate is preserved in both parts.
train_and_val, test_set = model_selection.train_test_split(
    data,
    test_size=0.2,
    stratify=data["requester_received_pizza"],
    random_state=SEED,
)

In [8]:
# Split the remaining rows 75/25 into train and validation, again stratified
# on the label.
train_set, val_set = model_selection.train_test_split(
    train_and_val,
    test_size=0.25,
    stratify=train_and_val["requester_received_pizza"],
    random_state=SEED,
)

In [9]:
def _stratified_subset(frame, fraction):
    """Return a label-stratified random sample of ``frame``.

    Args:
        frame: DataFrame with a ``requester_received_pizza`` column.
        fraction: Share of rows to keep; forwarded to sklearn as ``train_size``.

    Returns:
        The sampled rows. The remainder of the split is discarded.
    """
    subset, _remainder = sklearn.model_selection.train_test_split(
        frame, train_size=fraction, stratify=frame.requester_received_pizza,
        random_state=SEED)
    return subset


# 1% samples of each split.
train_mini = _stratified_subset(train_set, 0.01)
val_mini = _stratified_subset(val_set, 0.01)
test_mini = _stratified_subset(test_set, 0.01)

# 10% samples of each split.
train_medium = _stratified_subset(train_set, 0.1)
val_medium = _stratified_subset(val_set, 0.1)
test_medium = _stratified_subset(test_set, 0.1)

## Ensure the number of total examples is the same
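The three full splits should add up to `len(data)`. If the totals are off by a few examples, that is most likely rounding of the fractional split sizes and should be fine. The mini subsets are 1% samples of each split, so their combined size is only expected to be roughly 1% of the dataset.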

In [10]:
# Left: combined size of the three full splits; right: size of the cleaned dataset.
sum(len(split) for split in (train_set, val_set, test_set)), len(data)

(5513, 5513)

In [11]:
# Left: combined size of the three mini subsets; right: size of the cleaned dataset.
sum(len(subset) for subset in (train_mini, val_mini, test_mini)), len(data)

(55, 5513)

## Check class balance

In [12]:
# Fraction of training requests that received a pizza.
len(train_set.loc[train_set["requester_received_pizza"]]) / len(train_set)

0.24977320834593286

In [13]:
# Fraction of validation requests that received a pizza.
len(val_set.loc[val_set["requester_received_pizza"]]) / len(val_set)

0.24932003626473254

In [14]:
# Fraction of test requests that received a pizza.
len(test_set.loc[test_set["requester_received_pizza"]]) / len(test_set)

0.24932003626473254

In [15]:
# repr() of one training request's text, so escape sequences and surrounding
# quotes are visible.
repr(train_set.loc["t3_1niqoy", "request_text_edit_aware"])

'"I work for a section of my university that cleans windows. They have been trying to shut us down for a while since we don\'t bring in money just cost it. We had a meeting with our boss and she Informed us that we will not have jobs after this week. We where told the reason why was no room in the budget. Well I don\'t have room in my budget to not get a pay check anymore. Thank you for helping and thank you for listening. "'

# Export as csvs

In [16]:
# Write every split to DATA_DIR as CSV, labelling the index column "request_id".
if EXPORT:
    # (file name, frame) pairs, listed in the order the files are written.
    for _csv_name, _split in (
        ("train_all.csv", train_set),
        ("val_all.csv", val_set),
        ("test_all.csv", test_set),
        ("train_medium.csv", train_medium),
        ("val_medium.csv", val_medium),
        ("test_medium.csv", test_medium),
        ("train_mini.csv", train_mini),
        ("val_mini.csv", val_mini),
        ("test_mini.csv", test_mini),
    ):
        _split.to_csv(os.path.join(DATA_DIR, _csv_name), index_label="request_id")