In [1]:
import json
import pandas as pd

In [2]:
# Location of the cleaned text data, relative to the notebook.
datafile_path = "../data/03_primary/cleaned_text_data.json"

# JSON text is UTF-8; pass the encoding explicitly so the read does not
# depend on the platform's locale default (which is not UTF-8 everywhere).
with open(datafile_path, encoding="utf-8") as f:
    data = json.load(f)

# Build the working DataFrame from the parsed JSON payload.
df = pd.DataFrame(data)

In [3]:
# Show the dtype pandas inferred for every column of the loaded frame.
df.dtypes

post_was_edited                                        object
request_text_edit_aware                                object
request_title                                          object
requester_account_age_in_days_at_request              float64
requester_days_since_first_post_on_raop_at_request    float64
requester_number_of_comments_at_request                 int64
requester_number_of_comments_at_retrieval               int64
requester_number_of_comments_in_raop_at_request         int64
requester_number_of_posts_at_request                    int64
requester_number_of_posts_on_raop_at_request            int64
requester_number_of_subreddits_at_request               int64
requester_received_pizza                                 bool
requester_subreddits_at_request                        object
requester_upvotes_minus_downvotes_at_request            int64
requester_upvotes_plus_downvotes_at_request             int64
requester_username                                     object
unix_tim

## Data Preprocessing

Let's start by converting all boolean values to 0 and 1 where 0 is False and 1 is True.

#### post_was_edited

In [4]:
# Tally the distinct raw values of the edit flag before any cleaning.
edited_raw_counts = df["post_was_edited"].value_counts()
edited_raw_counts

False           3395
True             346
1377877925.0       1
1347564988.0       1
1349478371.0       1
                ... 
1354396381.0       1
1375391768.0       1
1374109637.0       1
1350765213.0       1
1369770892.0       1
Name: post_was_edited, Length: 301, dtype: int64

As we can see, some rows contain a timestamp instead of a boolean value. We will assume that a timestamp means the post was edited, so those rows are treated as True.

In [5]:
def _edited_to_bool(value):
    """Normalise one raw ``post_was_edited`` entry to a boolean.

    * ``bool`` values are returned unchanged.
    * ``float`` values (edit timestamps) always mean "edited" -> ``True``.
    * ``int`` values are treated as edited when non-zero, so an
      integer-encoded timestamp becomes ``True`` instead of leaking its raw
      epoch value into the column; 0/1 keep their meaning, which also keeps
      this cell safe to re-run after the column has been cast to int.
    * Anything else is returned untouched so that the ``astype(int)`` below
      still fails loudly on unexpected content.
    """
    # bool must be tested first: it is a subclass of int.
    if isinstance(value, bool):
        return value
    if isinstance(value, float):
        return True
    if isinstance(value, int):
        return value != 0
    return value


df["post_was_edited"] = df["post_was_edited"].apply(_edited_to_bool)
df["post_was_edited"] = df["post_was_edited"].astype(int)
df["post_was_edited"].value_counts()

0    3395
1     645
Name: post_was_edited, dtype: int64

#### requester_received_pizza

In [6]:
# Class balance of the target column while it is still boolean.
pizza_raw_counts = df["requester_received_pizza"].value_counts()
pizza_raw_counts

False    3046
True      994
Name: requester_received_pizza, dtype: int64

In [7]:
# Cast the boolean target to integers (False -> 0, True -> 1), store it back,
# and display the resulting class counts.
pizza_flag = df["requester_received_pizza"].astype(int)
df["requester_received_pizza"] = pizza_flag
pizza_flag.value_counts()

0    3046
1     994
Name: requester_received_pizza, dtype: int64

#### unix_timestamp_of_request_utc

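Now we can convert both timestamp columns (`unix_timestamp_of_request_utc` and `unix_timestamp_of_request`), which hold Unix timestamps in seconds, to datetime objects.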

In [8]:
# Both request-time columns are parsed the same way: the stored numbers are
# interpreted as seconds (unit="s") and replaced by datetime values.
for ts_col in ("unix_timestamp_of_request_utc", "unix_timestamp_of_request"):
    df[ts_col] = pd.to_datetime(df[ts_col], unit="s")

# Print the dtypes of the two converted columns on one line.
print(df.dtypes["unix_timestamp_of_request_utc"], df.dtypes["unix_timestamp_of_request"])

datetime64[ns] datetime64[ns]


#### requester_subreddits_at_request

This last column contains either a list of subreddits or, directly, an integer. We will replace each list with its length (the number of subreddits it contains) so that the whole column is numeric, which makes the data more manageable and easier to work with.

In [9]:
# Tally the raw values of the subreddit column before it is reduced to counts.
# NOTE(review): the displayed values are lists; counting list-valued cells
# may behave differently across pandas versions — confirm on the version in use.
df["requester_subreddits_at_request"].value_counts()

[]                                                                                                                                                                                                                                                                                                                                729
[Random_Acts_Of_Pizza]                                                                                                                                                                                                                                                                                                            243
[AskReddit]                                                                                                                                                                                                                                                                                                                        20
[AskReddit, Random_Act

In [10]:
def _subreddit_count(entry):
    """Return the length of ``entry`` when it is a list, otherwise ``entry`` unchanged."""
    if isinstance(entry, list):
        return len(entry)
    return entry


# Replace every list in the column by its length; non-list values are kept.
df["requester_subreddits_at_request"] = df["requester_subreddits_at_request"].apply(_subreddit_count)
df["requester_subreddits_at_request"].value_counts()

0      729
1      362
2      128
3      119
6      107
      ... 
149      1
113      1
99       1
83       1
103      1
Name: requester_subreddits_at_request, Length: 128, dtype: int64

## Saving data

In [11]:
# Preview the first five rows of the preprocessed frame before saving it.
df.head(n=5)

Unnamed: 0,post_was_edited,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,requester_days_since_first_post_on_raop_at_request,requester_number_of_comments_at_request,requester_number_of_comments_at_retrieval,requester_number_of_comments_in_raop_at_request,requester_number_of_posts_at_request,requester_number_of_posts_on_raop_at_request,requester_number_of_subreddits_at_request,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_plus_downvotes_at_request,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc,cleaned_text
0,0,Hi I am in need of food for my 4 children we a...,Request Colorado Springs Help Us Please,0.0,0.0,0,0,0,0,0,0,0,0,0,0,nickylvst,2011-10-05 22:10:07,2011-10-05 21:10:07,hi i am in need of food for my 4 children we a...
1,0,I spent the last money I had on gas today. Im ...,"[Request] California, No cash and I could use ...",501.1111,0.0,0,1000,0,15,0,12,0,12,34,116,fohacidal,2012-03-25 05:13:44,2012-03-25 04:13:44,i spent the last money i had on gas today. im ...
2,0,My girlfriend decided it would be a good idea ...,"[Request] Hungry couple in Dundee, Scotland wo...",0.0,0.0,0,0,0,0,0,0,0,0,0,0,jacquibatman7,2011-10-26 17:28:14,2011-10-26 16:28:14,my girlfriend decided it would be a good idea ...
3,1,"It's cold, I'n hungry, and to be completely ho...","[Request] In Canada (Ontario), just got home f...",6.518438,0.0,36,41,0,1,0,4,0,4,54,76,4on_the_floor,2011-12-02 19:50:34,2011-12-02 19:50:34,"it's cold, i'n hungry, and to be completely ho..."
4,0,hey guys:\n I love this sub. I think it's grea...,[Request] Old friend coming to visit. Would LO...,162.063252,101.606505,140,178,2,14,0,11,0,11,1121,1733,Futuredogwalker,2013-07-12 19:34:51,2013-07-12 18:34:51,hey guys: i love this sub. i think it's great....


In [12]:
# Write the preprocessed frame to a new JSON file as a list of row records.
# NOTE(review): the two request-time columns hold datetimes at this point;
# with pandas' default to_json settings they are presumably written as epoch
# milliseconds — confirm that downstream readers expect that.
output_path = "../data/04_feature/preprocessed_data.json"
df.to_json(output_path, orient="records")