# AITA Data Loading
This notebook is dedicated to loading data into the tables I have already created in GCP BigQuery.


In [4]:
import sys
import importlib
import pandas as pd
import numpy as np
from pathlib import Path
from dotenv import load_dotenv
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

In [5]:
# Add the parent directory to the module search path (presumably so the local
# `praw_instance` module imported later can be found -- confirm against the repo
# layout). Guarded so that re-running this cell does not keep appending
# duplicate ".." entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

# load environment variables for praw to work later
# (kept as the last expression so load_dotenv's result is shown as the cell output)
load_dotenv(dotenv_path=Path("./settings.env"))

True

In [6]:
# Create the BigQuery client with default arguments (no project or credentials are passed here)
client = bigquery.Client()

In [7]:
# Project and dataset names; they are combined into the table ids and
# PROJ_NAME is also passed as project_id when pushing with pandas_gbq.
PROJ_NAME = "bonion"
DATASET_NAME = "AITA_dataset"

In [8]:
# Table ids in the form <project>.<dataset>.<table>; the project/dataset pair
# is shared by all three tables, so it is unpacked into each format call.
_dataset_path = (PROJ_NAME, DATASET_NAME)
post_table_id = "{}.{}.post_table".format(*_dataset_path)
comment_table_id = "{}.{}.comment_table".format(*_dataset_path)
reply_table_id = "{}.{}.reply_table".format(*_dataset_path)

## Collect Data
Using my PrawInstance object, I will collect the top posts from the subreddit and store this data in Google BigQuery.

In [9]:
# Import the local helper module, then reload it so that edits made to the
# module file are picked up without restarting the kernel.
# NOTE: this import lives here rather than in the top imports cell because it
# relies on the sys.path change made in the setup cell.
import praw_instance
importlib.reload(praw_instance)

<module 'praw_instance' from '/home/cstainsby/class/dataProj/bonion/src/backend_api_service/praw_instance.py'>

In [10]:
# Name of the subreddit the posts are pulled from
subreddit_name = "amitheasshole"

In [11]:
# Instantiate the PrawInstance helper; it is passed to the praw_instance
# functions used in the cells below.
praw_inst = praw_instance.PrawInstance()

### Store Top 1000 Posts

In [12]:
# Fetch posts from the subreddit with a requested limit of 10000.
# NOTE(review): the variable name says "top" but the helper called is
# get_hot_by_subreddit; the recorded run returned only 655 posts for this
# limit -- confirm which listing and how many posts are actually intended.
top_10000_posts = praw_instance.get_hot_by_subreddit(praw_inst, subreddit_name, limit=10000)

In [13]:
# Number of posts actually returned by the fetch above
len(top_10000_posts)

655

In [14]:
# Build the post table that is later pushed to GCP:
# convert the fetched posts into a DataFrame via the praw_instance helper
post_table_df = praw_instance.post_dict_to_df(top_10000_posts)

# preview the first 10 rows
post_table_df.head(10)

Unnamed: 0,reddit_post_id,post_title,post_self_text,upvotes,num_responses
0,13coioe,AITA for telling my ILs I will not be changing...,"The bakery was originally my parents bakery, m...",8687,1758
1,13ct0vf,AITA for cancelling a vacation because my wife...,My wife(44F) makes quite a bit less than me(45...,5062,1361
2,13cvj7n,AITA for telling my kids not to bother visitin...,My ex and I have 2 kids together (12f and 10f)...,2504,2209
3,13cmddr,AITA for not reminding my girlfriend to come g...,"\nThis past weekend, my (21F) girlfriend (22 F...",4707,1336
4,13cz7zc,AITA for telling my kids that I don't care if ...,Throwaway\n\nI have three kids the youngest is...,964,650
5,13cufh7,AITA for telling my wife her parents cannot sl...,We went out of town a few weeks ago for a wedd...,1326,540
6,13crhty,AITA for leaving my sisters wedding?,"For a little backstory, I (24F) and my sister ...",1311,290
7,13cp0k5,AITA for being a picky eater and calling my ex...,I (30f) am a picky eater. Prestory: when I was...,1716,301
8,13cgu71,AITA for insisting my pregnant sister switch b...,Throwaway coz I don’t want it connected to my ...,5745,1356
9,13cli1w,"AITA for ""forcing my daughter to waste food""",This is stupid but the daughter in question to...,1878,597


In [15]:
# cast table columns to type
post_table_df["reddit_post_id"] = post_table_df["reddit_post_id"].astype(str)
post_table_df["post_title"] = post_table_df["post_title"].astype(str)
post_table_df["post_self_text"] = post_table_df["post_self_text"].astype(str)
post_table_df["upvotes"] = post_table_df["upvotes"].astype(int)
post_table_df["num_responses"] = post_table_df["num_responses"].astype(int)
post_table_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 655 entries, 0 to 654
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reddit_post_id  655 non-null    object
 1   post_title      655 non-null    object
 2   post_self_text  655 non-null    object
 3   upvotes         655 non-null    int64 
 4   num_responses   655 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 25.7+ KB


In [28]:
# quick to-csv fallback in case you don't want to push to the cloud
# post_table_df.to_csv("./test_post.csv", index=False)

#### Push Data

In [16]:
import pandas_gbq

In [18]:
# Upload the post rows to the post table. if_exists="append" adds the rows to
# whatever the table already holds, so re-running this cell uploads them again.
pandas_gbq.to_gbq(post_table_df, post_table_id, project_id=PROJ_NAME, if_exists="append")

100%|██████████| 1/1 [00:00<00:00, 10280.16it/s]


### Get Top Comments and Replies for the top posts

In [21]:
# Keep only the post-id column, as a one-column DataFrame, then show how many
# ids there are and preview the first four.
post_table_ids_df = post_table_df["reddit_post_id"].to_frame()
print(len(post_table_ids_df))
post_table_ids_df.head(4)

655


Unnamed: 0,reddit_post_id
0,13coioe
1,13ct0vf
2,13cvj7n
3,13cmddr


To grab the comments and replies for each of the posts, I will use a helper function which can grab n comments based on `comment_limit` and m replies to each of those comments based on `reply_limit`.

I have also included variables
- start_index
- stop_index

These allow you to specify the range of items you want to process (which lets you batch to a degree). By stopping prematurely with `stop_index`, you can push your changes and order will be preserved.

In [28]:
# Collects one comment/reply dict per processed post, in post order.
comment_and_reply_dict_list = []

# Batch window over the index of post_table_ids_df; both bounds are inclusive.
# so far I have pulled indices 0 thru 50
start_index = 51
stop_index = 655

# How many comments to request per post, and replies per comment.
comment_limit = 10
reply_limit = 0

# Denominator shown in the progress lines.
batch_size = stop_index - start_index

print("Pulling {} comments and {} replies for each comment:".format(comment_limit, reply_limit))
for row_index, reddit_post_id in post_table_ids_df["reddit_post_id"].items():
    # Skip every row that falls outside the requested window.
    if not (start_index <= row_index <= stop_index):
        continue

    print("\tOn index {} of {}".format(row_index - start_index, batch_size))
    comment_and_reply_dict_list.append(
        praw_instance.get_top_comments_and_top_replies_by_post_id(
            praw_inst,
            reddit_post_id,
            comment_limit=comment_limit,
            reply_limit=reply_limit,
        )
    )

Pulling 10 comments and 0 replies for each comment:
	On index 0 of 604
	On index 1 of 604
	On index 2 of 604
	On index 3 of 604
	On index 4 of 604
	On index 5 of 604
	On index 6 of 604
	On index 7 of 604
	On index 8 of 604
	On index 9 of 604
	On index 10 of 604
	On index 11 of 604
	On index 12 of 604
	On index 13 of 604
	On index 14 of 604
	On index 15 of 604
	On index 16 of 604
	On index 17 of 604
	On index 18 of 604
	On index 19 of 604
	On index 20 of 604
	On index 21 of 604
	On index 22 of 604
	On index 23 of 604
	On index 24 of 604
	On index 25 of 604
	On index 26 of 604
	On index 27 of 604
	On index 28 of 604
	On index 29 of 604
	On index 30 of 604
	On index 31 of 604
	On index 32 of 604
	On index 33 of 604
	On index 34 of 604
	On index 35 of 604
	On index 36 of 604
	On index 37 of 604
	On index 38 of 604
	On index 39 of 604
	On index 40 of 604
	On index 41 of 604
	On index 42 of 604
	On index 43 of 604
	On index 44 of 604
	On index 45 of 604
	On index 46 of 604
	On index 47 of 60

In [29]:
# convert the data to a dataframe
full_comment_list, full_reply_list = [], []
for post_i, comment_and_replies_dict in enumerate(comment_and_reply_dict_list):
    comment_df_at_post_i, reply_df_at_post_i =  praw_instance.comment_and_reply_dict_to_df(comment_and_replies_dict)

    # store df's in an intermediate list before combining into a df
    full_comment_list.append(comment_df_at_post_i)
    full_reply_list.append(reply_df_at_post_i)

comment_table_df = pd.concat(full_comment_list, axis=0, ignore_index=True)
reply_table_df = pd.concat(full_reply_list, axis=0, ignore_index=True)

In [30]:
# Cast every comment-table column to its intended dtype in one astype call:
# the two id columns and the text column become str, the vote count becomes int.
comment_table_df = comment_table_df.astype(
    {
        "comment_id": str,
        "parent_id": str,
        "content": str,
        "upvotes": int,
    }
)

print("Comment table length", len(comment_table_df))
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
comment_table_df.info()
comment_table_df.head()

Comment table length 5054
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5054 entries, 0 to 5053
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   comment_id  5054 non-null   object
 1   parent_id   5054 non-null   object
 2   content     5054 non-null   object
 3   upvotes     5054 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 158.1+ KB
None


Unnamed: 0,comment_id,parent_id,content,upvotes
0,jjhwdnw,t3_13d0djc,NTA - If the girl was too big to fit the dress...,40
1,jjhwj8b,t3_13d0djc,"NTA, if she's a different size than the dress,...",19
2,jji8wml,t3_13d0djc,INFO:\nHow did you say this? \n\nIt’s one thin...,13
3,jjhwwtl,t3_13d0djc,NTA - tell your SIL to take it up with the law...,7
4,jji1dk4,t3_13d0djc,"NTA, if someone is too big for clothing they’l...",6


In [31]:
# Cast every reply-table column to its intended dtype in one astype call:
# the two id columns and the text column become str, the vote count becomes int.
reply_table_df = reply_table_df.astype(
    {
        "reply_id": str,
        "parent_id": str,
        "content": str,
        "upvotes": int,
    }
)

print("Reply table length", len(reply_table_df))
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
reply_table_df.info()
reply_table_df.head()

Reply table length 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   reply_id   0 non-null      object
 1   parent_id  0 non-null      object
 2   content    0 non-null      object
 3   upvotes    0 non-null      int64 
dtypes: int64(1), object(3)
memory usage: 124.0+ bytes
None


Unnamed: 0,reply_id,parent_id,content,upvotes


### Push Comments and Replies 

In [32]:
# Upload the comment and reply rows to their tables. if_exists="append" adds
# to the existing rows, so re-running this cell uploads the same rows again.
# NOTE(review): reply_table_df had 0 rows in the recorded run (reply_limit was 0)
# -- confirm that pushing an empty frame is intended.
pandas_gbq.to_gbq(comment_table_df, comment_table_id, project_id=PROJ_NAME, if_exists="append")
pandas_gbq.to_gbq(reply_table_df, reply_table_id, project_id=PROJ_NAME, if_exists="append")

100%|██████████| 1/1 [00:00<00:00, 5833.52it/s]
