# Data Exploration and Cleaning

First, we will concentrate on the data we have. We want to see what types of features are present and how the data is distributed.

In [1]:
import json
import pandas as pd
import seaborn as sns

# Apply seaborn's default theme and make every figure 20x12 inches by default.
sns.set(rc={"figure.figsize": (20, 12)})

### Loading Data

In [2]:
datafile_path = "../data/01_raw/pizza_data.json"

# JSON is UTF-8 by specification; pass the encoding explicitly so the read does
# not depend on the platform's default locale encoding (e.g. cp1252 on Windows).
with open(datafile_path, encoding="utf-8") as f:
    data = json.load(f)

# Presumably `data` is a list of request records (one dict per row) -- the
# resulting frame has one row per request and one column per field.
df = pd.DataFrame(data)

### Data Exploration

In [3]:
# (number of rows, number of columns) of the raw frame.
df.shape

(4040, 32)

In [4]:
# Preview the first five rows to get a feel for the fields and their values.
df.head()

Unnamed: 0,giver_username_if_known,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
0,,0,1,False,t3_l25d7,0,Hi I am in need of food for my 4 children we a...,Hi I am in need of food for my 4 children we a...,Request Colorado Springs Help Us Please,0.0,...,False,[],0,1,0,1,,nickylvst,1317853000.0,1317849000.0
1,,2,5,False,t3_rcb83,0,I spent the last money I had on gas today. Im ...,I spent the last money I had on gas today. Im ...,"[Request] California, No cash and I could use ...",501.1111,...,False,"[AskReddit, Eve, IAmA, MontereyBay, RandomKind...",34,4258,116,11168,,fohacidal,1332652000.0,1332649000.0
2,,0,3,False,t3_lpu5j,0,My girlfriend decided it would be a good idea ...,My girlfriend decided it would be a good idea ...,"[Request] Hungry couple in Dundee, Scotland wo...",0.0,...,False,[],0,3,0,3,,jacquibatman7,1319650000.0,1319646000.0
3,,0,1,True,t3_mxvj3,4,"It's cold, I'n hungry, and to be completely ho...","It's cold, I'n hungry, and to be completely ho...","[Request] In Canada (Ontario), just got home f...",6.518438,...,False,"[AskReddit, DJs, IAmA, Random_Acts_Of_Pizza]",54,59,76,81,,4on_the_floor,1322855000.0,1322855000.0
4,,6,6,False,t3_1i6486,5,hey guys:\n I love this sub. I think it's grea...,hey guys:\n I love this sub. I think it's grea...,[Request] Old friend coming to visit. Would LO...,162.063252,...,False,"[GayBrosWeightLoss, RandomActsOfCookies, Rando...",1121,1225,1733,1887,,Futuredogwalker,1373658000.0,1373654000.0


In [5]:
# Per-column dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4040 entries, 0 to 4039
Data columns (total 32 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   giver_username_if_known                               4040 non-null   object 
 1   number_of_downvotes_of_request_at_retrieval           4040 non-null   int64  
 2   number_of_upvotes_of_request_at_retrieval             4040 non-null   int64  
 3   post_was_edited                                       4040 non-null   object 
 4   request_id                                            4040 non-null   object 
 5   request_number_of_comments_at_retrieval               4040 non-null   int64  
 6   request_text                                          4040 non-null   object 
 7   request_text_edit_aware                               4040 non-null   object 
 8   request_title                                         4040

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,request_number_of_comments_at_retrieval,requester_account_age_in_days_at_request,requester_account_age_in_days_at_retrieval,requester_days_since_first_post_on_raop_at_request,requester_days_since_first_post_on_raop_at_retrieval,requester_number_of_comments_at_request,requester_number_of_comments_at_retrieval,requester_number_of_comments_in_raop_at_request,...,requester_number_of_posts_at_retrieval,requester_number_of_posts_on_raop_at_request,requester_number_of_posts_on_raop_at_retrieval,requester_number_of_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,unix_timestamp_of_request,unix_timestamp_of_request_utc
count,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,...,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0,4040.0
mean,2.424505,6.180446,2.87104,254.586579,757.69272,16.417034,518.993205,115.098267,289.425743,0.64505,...,41.151733,0.063614,1.239109,18.076733,1160.07995,2720.342079,3743.236,7788.069,1342829000.0,1342826000.0
std,3.023101,10.74632,4.723339,303.27573,333.035728,70.651428,267.872623,193.318968,357.416133,3.413813,...,80.798543,0.325773,0.603083,21.736465,3718.365515,6264.378878,25838.16,39167.41,23330570.0,23329890.0
min,0.0,0.0,0.0,0.0,45.291562,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-173.0,-173.0,0.0,0.0,1297723000.0,1297723000.0
25%,1.0,2.0,0.0,3.473168,522.248455,0.0,279.009051,0.0,8.0,0.0,...,2.0,0.0,1.0,1.0,3.0,22.0,9.0,52.0,1320469000.0,1320466000.0
50%,2.0,4.0,1.0,157.06717,753.270874,0.0,528.781939,24.0,114.0,0.0,...,13.0,0.0,1.0,11.0,174.5,708.0,351.0,1283.5,1342565000.0,1342561000.0
75%,3.0,7.0,4.0,390.092653,900.349838,0.0,776.22667,140.25,479.0,0.0,...,46.0,0.0,1.0,27.0,1163.75,3304.0,2303.75,6829.0,1364618000.0,1364614000.0
max,47.0,345.0,61.0,2809.750787,2879.276319,785.457685,1025.407593,994.0,1000.0,88.0,...,999.0,5.0,9.0,186.0,155010.0,223708.0,1286864.0,2046482.0,1381552000.0,1381523000.0


In [7]:
# Full list of column names (the head() preview elides the middle columns).
df.columns

Index(['giver_username_if_known',
       'number_of_downvotes_of_request_at_retrieval',
       'number_of_upvotes_of_request_at_retrieval', 'post_was_edited',
       'request_id', 'request_number_of_comments_at_retrieval', 'request_text',
       'request_text_edit_aware', 'request_title',
       'requester_account_age_in_days_at_request',
       'requester_account_age_in_days_at_retrieval',
       'requester_days_since_first_post_on_raop_at_request',
       'requester_days_since_first_post_on_raop_at_retrieval',
       'requester_number_of_comments_at_request',
       'requester_number_of_comments_at_retrieval',
       'requester_number_of_comments_in_raop_at_request',
       'requester_number_of_comments_in_raop_at_retrieval',
       'requester_number_of_posts_at_request',
       'requester_number_of_posts_at_retrieval',
       'requester_number_of_posts_on_raop_at_request',
       'requester_number_of_posts_on_raop_at_retrieval',
       'requester_number_of_subreddits_at_request', 'r

In [8]:
# Number of null (None/NaN) entries in each column. Placeholder strings are not
# counted: `giver_username_if_known` reports 0 here although most of its values
# are the string "N/A".
df.isna().sum()

giver_username_if_known                                    0
number_of_downvotes_of_request_at_retrieval                0
number_of_upvotes_of_request_at_retrieval                  0
post_was_edited                                            0
request_id                                                 0
request_number_of_comments_at_retrieval                    0
request_text                                               0
request_text_edit_aware                                    0
request_title                                              0
requester_account_age_in_days_at_request                   0
requester_account_age_in_days_at_retrieval                 0
requester_days_since_first_post_on_raop_at_request         0
requester_days_since_first_post_on_raop_at_retrieval       0
requester_number_of_comments_at_request                    0
requester_number_of_comments_at_retrieval                  0
requester_number_of_comments_in_raop_at_request            0
requester_number_of_comm

We will drop the `requester_user_flair` column because most of its values are missing.

We drop `request_id` because it is not a useful feature: it is probably just a random sequence of numbers and letters.

In [9]:
# Frequency of each giver username, most common first.
giver_usernames = df["giver_username_if_known"]
giver_usernames.value_counts()

N/A             3753
jetboyterp         5
johngalt1337       4
m2nu               4
mr_jeep            4
                ... 
airmanhand         1
mfbyrne            1
jski5711           1
nygalz             1
hrnmyd             1
Name: giver_username_if_known, Length: 250, dtype: int64

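The `giver_username_if_known` column is also mostly missing: 3753 of the 4040 rows hold the placeholder string `N/A` (which is why `isnull()` reports no missing values for it), so we will drop it.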

Then, we will drop every `retrieval` column, because these values don't impact the decision of giving or not giving a pizza.
We don't want to poison the training data with information we won't have in production, in a real case.

In [10]:
# Compare the two request timestamp columns row by row, then tally how many
# rows match (True) and how many differ (False).
request_ts = df["unix_timestamp_of_request"]
request_ts_utc = df["unix_timestamp_of_request_utc"]
timestamps_equality = request_ts.eq(request_ts_utc)
timestamps_equality.value_counts()

False    3105
True      935
dtype: int64

As we can see, the two timestamp columns are not always equal. We will keep both for now, but we could also keep only the `utc` one so that all the data shares the same time base.

Finally we will keep only the `request_text_edit_aware` column and drop the `request_text` column, because we want to keep the last version of the request text.

In [11]:
# Select every `*_at_retrieval` column by suffix instead of listing them by
# hand, so that none can be overlooked (e.g.
# `requester_number_of_comments_at_retrieval`). These values are not available
# when a request is made and would leak into training.
retrieval_columns = [col for col in df.columns if col.endswith("_at_retrieval")]

# Remaining columns that carry no usable signal for the model.
other_columns = [
    "giver_username_if_known",  # almost always the placeholder "N/A"
    "request_id",               # identifier, not a feature
    "request_text",             # request_text_edit_aware is kept instead
    "requester_user_flair",     # mostly missing
]

df = df.drop(columns=retrieval_columns + other_columns)

### Saving cleaned data

Let's save the cleaned data in a new file.

In [12]:
# Persist the cleaned frame as a JSON array with one object per row.
cleaned_data_path = "../data/02_intermediate/cleaned_data.json"
df.to_json(cleaned_data_path, orient="records")