In [2]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import model_selection

import codecs
import json
import os

In [3]:
# Run-time switches and shared constants for this notebook.
IMPORT = True  # when true, the dataset JSON is loaded from DATA_DIR
EXPORT = True  # when true, every split is written to DATA_DIR as CSV
DISPLAY = True  # NOTE(review): not referenced in the visible cells - confirm it is still needed
VERBOSE = False  # when true, the load cell prints dataset diagnostics
DATA_DIR = "data"  # directory holding the input JSON and receiving the exported CSVs
SEED = 221  # random_state passed to every train_test_split call

In [4]:
def read_dataset(path):
    """Load a UTF-8 encoded JSON file and return the decoded object.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file. The callers in this
        notebook index the result as a list of per-request dicts.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Built-in open() with an explicit encoding supersedes codecs.open(),
    # and json.load() parses straight from the file handle.
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

# Load the raw request list; optionally print a short summary of it.
# NOTE(review): when IMPORT is false `dataset` is never defined here, so the
# cells that build the DataFrame from it will fail on a fresh kernel.
if IMPORT:
    path = os.path.join(DATA_DIR, 'pizza_request_dataset/pizza_request_dataset.json')
    dataset = read_dataset(path)

    if VERBOSE:
        print('The dataset contains %d samples.' % len(dataset))
        print('Available attributes: ', sorted(dataset[0].keys()))
        print('First post:')
        print(json.dumps(dataset[0], sort_keys=True, indent=2))

        # Share of requests whose requester received a pizza, as a percentage.
        successes = [r['requester_received_pizza'] for r in dataset]
        success_rate = 100.0 * sum(successes) / len(successes)
        print('The average success rate is: %.2f%%' % success_rate)

In [5]:
# Build the frame and key it by request_id in a single assignment;
# verify_integrity makes set_index raise if any request_id is duplicated.
data = pd.DataFrame(dataset).set_index("request_id", verify_integrity=True)
data.head()

Unnamed: 0_level_0,giver_username_if_known,in_test_set,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
request_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
t3_w5491,,False,2,6,False,7,"I'm not in College, or a starving artist or an...","I'm not in College, or a starving artist or an...","[Request] Ontario, Canada - On my 3rd of 5 day...",14.416875,...,True,"[AdviceAnimals, WTF, funny, gaming, movies, te...",32,212,48,610,shroom,RitalinYourMemory,1341605000.0,1341601000.0
t3_qysgy,,False,2,6,True,8,Hello! It's been a hard 2 months with money an...,Hello! It's been a hard 2 months with money an...,"[REQUEST] Southern Arizona, Tucson Hungry Family",11.95706,...,True,"[aww, pics]",5,21,13,57,shroom,blubirdhvn,1331868000.0,1331865000.0
t3_if0ed,,False,1,4,True,1,I'm sure there are needier people on this subr...,I'm sure there are needier people on this subr...,[Request] Pizza for a broke college student,454.388461,...,True,"[AskReddit, DoesAnybodyElse, IAmA, Marijuana, ...",1359,2110,2423,3456,shroom,taterpot,1309622000.0,1309619000.0
t3_jr3w1,,False,2,13,False,3,I've been unemployed going on three months now...,I've been unemployed going on three months now...,[Request] Unemployed and Sick of Rice (Suffolk...,141.715625,...,True,"[AdviceAnimals, Art, AskReddit, BookCollecting...",1205,4889,2403,8245,shroom,or_me_bender,1314060000.0,1314056000.0
t3_1d18tc,,False,1,4,1.36685e+09,5,I ran out of money on my meal card a while bac...,I ran out of money on my meal card a while bac...,"[Request] Ohio USA, broke student musician in ...",161.699155,...,True,"[TrueAtheism, atheism, funny]",81,86,225,232,shroom,m_chamberlin,1366840000.0,1366836000.0


# Make splits while maintaining class balance

In [6]:
# Intended train / validation / test fractions.
# NOTE(review): the split cells below hard-code test_size=0.2 and test_size=0.25
# instead of reading this list, so editing it alone does not change the splits.
splits = [0.6, 0.2, 0.2]

In [7]:
# Hold out 20% of all requests as the test set, stratified on the outcome label.
train_and_val, test_set = model_selection.train_test_split(
    data,
    test_size=0.2,
    stratify=data["requester_received_pizza"],
    random_state=SEED,
)

In [8]:
# Carve the validation set out of the remaining rows: 25% of train_and_val,
# again stratified on the outcome label.
train_set, val_set = model_selection.train_test_split(
    train_and_val,
    test_size=0.25,
    stratify=train_and_val["requester_received_pizza"],
    random_state=SEED,
)

In [14]:
MINI_FRACTION = 0.01    # share of each full split kept in the *_mini subsets
MEDIUM_FRACTION = 0.1   # share of each full split kept in the *_medium subsets


def stratified_subsample(frame, fraction, label="requester_received_pizza"):
    """Return a stratified random subset of ``frame``.

    Args:
        frame: DataFrame to sample from; must contain the ``label`` column.
        fraction: Passed to ``train_test_split`` as ``train_size``.
        label: Column used for stratification.

    Returns:
        The "train" part produced by ``train_test_split``; the remainder
        is discarded. Sampling is seeded with the notebook-level ``SEED``.
    """
    subset, _rest = sklearn.model_selection.train_test_split(
        frame, train_size=fraction, stratify=frame[label], random_state=SEED)
    return subset


# Small subsets of every split for quick experiments.
train_mini = stratified_subsample(train_set, MINI_FRACTION)
val_mini = stratified_subsample(val_set, MINI_FRACTION)
test_mini = stratified_subsample(test_set, MINI_FRACTION)

train_medium = stratified_subsample(train_set, MEDIUM_FRACTION)
val_medium = stratified_subsample(val_set, MEDIUM_FRACTION)
test_medium = stratified_subsample(test_set, MEDIUM_FRACTION)

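## Ensure the number of total examples is the same
The three full splits should add up to exactly `len(data)`. The mini splits are 1% subsamples of each full split, so their combined size is only about 1% of `len(data)`; any small shortfall from exactly 1% comes from rounding each subset size down and should be fine.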

In [15]:
# The three full splits together should account for every row of `data`.
sum(len(part) for part in (train_set, val_set, test_set)), len(data)

(5671, 5671)

In [16]:
# The mini splits are drawn with train_size=0.01, so their total is expected
# to be roughly 1% of len(data), not equal to it.
sum(len(part) for part in (train_mini, val_mini, test_mini)), len(data)

(56, 5671)

## Check class balance

In [17]:
# Fraction of training requests that received a pizza.
len(train_set.loc[train_set["requester_received_pizza"]]) / len(train_set)

0.24632569077013522

In [18]:
# Fraction of validation requests that received a pizza.
len(val_set.loc[val_set["requester_received_pizza"]]) / len(val_set)

0.24603174603174602

In [19]:
# Fraction of test requests that received a pizza.
len(test_set.loc[test_set["requester_received_pizza"]]) / len(test_set)

0.24669603524229075

# Export as csvs

In [20]:
# Write every split to DATA_DIR as CSV, one file per split.
# NOTE(review): index=False omits the request_id index from the files, so the
# exported rows carry no request identifier - confirm that is intended.
if EXPORT:
    exports = {
        "train_all.csv": train_set,
        "val_all.csv": val_set,
        "test_all.csv": test_set,
        "train_medium.csv": train_medium,
        "val_medium.csv": val_medium,
        "test_medium.csv": test_medium,
        "train_mini.csv": train_mini,
        "val_mini.csv": val_mini,
        "test_mini.csv": test_mini,
    }
    for csv_name, split_frame in exports.items():
        split_frame.to_csv(os.path.join(DATA_DIR, csv_name), index=False)