In [1]:
import json
import pandas as pd
import numpy as np
import re
from auxiliary_functions import print_raop_request

# Parse the raw JSON inside a context manager so the file handle is closed
# as soon as parsing is done (a bare open() call leaves it to the garbage
# collector). JSON is read as UTF-8 rather than the locale's default encoding.
with open('../Data/data.json', encoding='utf-8') as data_file:
    raw_records = json.load(data_file)

# Flatten the parsed records into a tabular DataFrame.
raw_data = pd.json_normalize(raw_records)

# Quick data quality control

In [2]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
raw_data.describe(include='all')

Unnamed: 0,giver_username_if_known,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
count,4040.0,4040.0,4040.0,4040,4040,4040.0,4040.0,4040.0,4040,4040.0,...,4040,4040,4040.0,4040.0,4040.0,4040.0,994,4040,4040.0,4040.0
unique,250.0,,,301,4040,,3936.0,3932.0,4025,,...,2,2987,,,,,2,4040,,
top,,,,False,t3_15nuxv,,,,[REQUEST],,...,False,[],,,,,shroom,madkiro101,,
freq,3753.0,,,3395,1,,104.0,104.0,5,,...,3046,729,,,,,935,1,,
mean,,2.424505,6.180446,,,2.87104,,,,254.586579,...,,,1160.07995,2720.342079,3743.236,7788.069,,,1342829000.0,1342826000.0
std,,3.023101,10.74632,,,4.723339,,,,303.27573,...,,,3718.365515,6264.378878,25838.16,39167.41,,,23330570.0,23329890.0
min,,0.0,0.0,,,0.0,,,,0.0,...,,,-173.0,-173.0,0.0,0.0,,,1297723000.0,1297723000.0
25%,,1.0,2.0,,,0.0,,,,3.473168,...,,,3.0,22.0,9.0,52.0,,,1320469000.0,1320466000.0
50%,,2.0,4.0,,,1.0,,,,157.06717,...,,,174.5,708.0,351.0,1283.5,,,1342565000.0,1342561000.0
75%,,3.0,7.0,,,4.0,,,,390.092653,...,,,1163.75,3304.0,2303.75,6829.0,,,1364618000.0,1364614000.0


In [3]:
# Request ids are expected to be unique: report how many repeat an earlier row.
n_duplicated_ids = raw_data["request_id"].duplicated().sum()
print(f"There is {n_duplicated_ids} duplicated request ids")

# For each free-text column, list the values that repeat an earlier row.
# duplicated() leaves the first occurrence unflagged, so every count shown is
# the number of extra copies, not the total number of occurrences.
for heading, column in (
    ("\n Duplicated request texts", "request_text"),
    ("\n Duplicated request titles", "request_title"),
):
    values = raw_data[column]
    print(heading)
    print(values[values.duplicated()].value_counts())

There is 0 duplicated request ids

 Duplicated request texts
                         103
Please and thank you!      1
Name: request_text, dtype: int64

 Duplicated request titles
[REQUEST]     4
Request       3
request       2
[Request]     2
[Request]     2
[REQUEST]     1
[request]     1
Name: request_title, dtype: int64


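The dataset is very clean: no clear duplicates and almost no missing data (only `requester_user_flair` is mostly empty, and about a hundred requests have an empty text). The only cleaning operations needed are:
* Reformatting the "N/A" placeholder in the `giver_username_if_known` column as a proper missing value
* Converting the unix timestamps to datetimes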

In [4]:
def clean_data(raw_data):
    """Return a cleaned copy of the raw requests; the input is not modified.

    Two operations are applied:
    * the "N/A" placeholder in ``giver_username_if_known`` becomes NaN;
    * ``unix_timestamp_of_request`` and ``unix_timestamp_of_request_utc``
      are converted from seconds since the epoch to datetimes.
    """
    cleaned_data = raw_data.copy()

    # Turn the textual "N/A" placeholder into a proper missing value.
    giver = cleaned_data["giver_username_if_known"]
    cleaned_data["giver_username_if_known"] = giver.mask(giver == "N/A", np.nan)

    # Both timestamp columns are expressed in seconds.
    for timestamp_column in ("unix_timestamp_of_request", "unix_timestamp_of_request_utc"):
        cleaned_data[timestamp_column] = pd.to_datetime(cleaned_data[timestamp_column], unit='s')

    return cleaned_data

# Apply the cleaning steps; clean_data works on a copy, so raw_data is left untouched.
cleaned_data = clean_data(raw_data)

# Enriching data with basic features

In [5]:
def enrich_data(clean_data):
    """Return a copy of the cleaned requests with basic engineered features.

    Parameters
    ----------
    clean_data : pandas.DataFrame
        Cleaned requests. ``unix_timestamp_of_request`` must already be a
        datetime column, ``request_text``, ``request_text_edit_aware`` and
        ``request_title`` must hold strings, and
        ``requester_subreddits_at_request`` must hold lists of strings.
        Note: the parameter name shadows the ``clean_data`` function inside
        this body; it is kept unchanged so keyword callers keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with these columns added:
        ``request_year``, ``request_month``, ``request_day``,
        ``request_hour``, ``request_weekday``, ``request_length``,
        ``request_edit_aware_length``, ``request_title_length``,
        ``requester_upvotes_at_request``, ``requester_downvotes_at_request``,
        ``has_picture``, ``has_link`` and
        ``requester_subreddits_at_request_text``.
    """
    enriched_data = clean_data.copy()

    # Calendar features derived from the request timestamp.
    request_time = enriched_data['unix_timestamp_of_request'].dt
    enriched_data['request_year'] = request_time.year
    enriched_data['request_month'] = request_time.month
    enriched_data['request_day'] = request_time.day
    enriched_data['request_hour'] = request_time.hour
    enriched_data['request_weekday'] = request_time.weekday

    # Number of words in each narrative column. str.split() without an
    # argument splits on any run of whitespace, so an empty text counts as
    # 0 words, doubled spaces do not create empty tokens, and words separated
    # by newlines are counted separately (split(" ") got all three wrong).
    for length_column, text_column in (
        ('request_length', 'request_text'),
        ('request_edit_aware_length', 'request_text_edit_aware'),
        ('request_title_length', 'request_title'),
    ):
        enriched_data[length_column] = enriched_data[text_column].apply(
            lambda text: len(text.split())
        )

    # Recover the separate upvote and downvote counts at request time from
    # their sum and difference; they are easier to interpret than the
    # combined columns that are available.
    vote_balance = enriched_data['requester_upvotes_minus_downvotes_at_request']
    vote_total = enriched_data['requester_upvotes_plus_downvotes_at_request']
    enriched_data['requester_upvotes_at_request'] = (vote_total + vote_balance) / 2
    enriched_data['requester_downvotes_at_request'] = (vote_total - vote_balance) / 2

    # Has a picture or has a link. According to the original article
    # https://cs.stanford.edu/~althoff/raop-dataset/altruistic_requests_icwsm.pdf,
    # backing up the request with a picture is a good way of increasing the
    # chance of success. Both flags are plain substring searches on the
    # edit-aware text.
    picture_pattern = re.compile(r'(imgur)|(jpg)')
    link_pattern = re.compile(r'(www)|(http)')
    edit_aware_text = enriched_data['request_text_edit_aware']
    enriched_data['has_picture'] = edit_aware_text.apply(
        lambda text: bool(picture_pattern.search(text))
    )
    enriched_data['has_link'] = edit_aware_text.apply(
        lambda text: bool(link_pattern.search(text))
    )

    # Subreddits at request is a list of subreddit names. For ease of use,
    # it is also stored as a single space-separated string.
    enriched_data['requester_subreddits_at_request_text'] = (
        enriched_data['requester_subreddits_at_request'].apply(" ".join)
    )
    return enriched_data

# Build the feature-enriched frame from the cleaned data; enrich_data returns a new copy.
enriched_data = enrich_data(cleaned_data)

# Save cleaned and enriched data

In [6]:
# Persist the enriched frame to disk as a pickle file.
# NOTE: pickle files should only ever be loaded back from trusted locations.
enriched_data.to_pickle("../Data/enriched_data.pkl")

# Have a look at some requests

Look at the most popular (most upvoted) requests that received a pizza

In [7]:
# Rank requests from most to least upvoted. The sorted frame is bound back to
# `enriched_data` so that the following cells keep seeing the ranked order.
enriched_data = enriched_data.sort_values('number_of_upvotes_of_request_at_retrieval', ascending=False)

# Most upvoted request among those that received a pizza.
successful_requests = enriched_data[enriched_data['requester_received_pizza']]
roap_request = successful_requests.iloc[0]
print_raop_request(roap_request)

Requester got pizza : True

##########
request title
##########
[Request]Vancouver, BC, Canada Father of 5, wife just got out of surgery, we were suddenly cut off from employment insurance.

##########
request text edit aware
##########
The government screwed up and now we have to wait over a month for them to refile and reestablish my claim.  There is no way to expedite this at all, in spite of the fact that it is their mistake.  We have 2 girls (9 and 7) and 3 boys (5,3, 2months).

My wife had to be taken by ambulance to the hospital last week for emergency gall bladder removal surgery and we are feeling a bit beat on at the moment.  This would be a humungous pick-us-up.

I am happy to provide any verification you need.  Thanks in advance.

*

##########
request text
##########
The government screwed up and now we have to wait over a month for them to refile and reestablish my claim.  There is no way to expedite this at all, in spite of the fact that it is their mistake.  We have 2 g

In [8]:
# Second request among those that received a pizza; with `enriched_data`
# sorted by upvotes in descending order this is the second most upvoted one.
successful_requests = enriched_data[enriched_data['requester_received_pizza']]
roap_request = successful_requests.iloc[1]
print_raop_request(roap_request)

Requester got pizza : True

##########
request title
##########
[REQUEST] No sob story, it's just my birthday tomorrow and I really like pizza :D

##########
request text edit aware
##########
My husband has to work 7-5 and class from 530-930 on my bday, so I just wanna order some pizza and read a book to occupy myself tomorrow. Anyone wanna hook it up with a pizza? 

##########
request text
##########
My husband has to work 7-5 and class from 530-930 on my bday, so I just wanna order some pizza and read a book to occupy myself tomorrow. Anyone wanna hook it up with a pizza? 


In [9]:
# Example of an unsuccessful request: the last row among the requests that
# did not receive a pizza. With `enriched_data` sorted by upvotes in
# descending order, this is one of the least upvoted unsuccessful requests.
unsuccessful_requests = enriched_data[~enriched_data['requester_received_pizza']]
roap_request = unsuccessful_requests.iloc[-1]
print_raop_request(roap_request)

Requester got pizza : False

##########
request title
##########
[request] NY USA Would love to help me through a stressful week

##########
request text edit aware
##########
My mother lives in the area effected by the giant little bear fire in New mexico and is waiting to hear if she has a place to live or is coming back to New York to live with me. My father lost his job several month ago and is also likely about to move in with me. These two have been divorced for 20+ years and as much as i love them I have no idea if I can handle that. So I'm sitting here stressed out already wondering if I have enough to survive this economic hellstorm myself, let alone save my parents. I would love a pizza right now.

##########
request text
##########
My mother lives in the area effected by the giant little bear fire in New mexico and is waiting to hear if she has a place to live or is coming back to New York to live with me. My father lost his job several month ago and is also likely about to 