# Am I the Asshole EDA
This notebook is dedicated to exploring how to craft responses to posts in the subreddit r/AmITheAsshole.

In [None]:
import os 
import sys
import importlib
import pandas as pd
import numpy as np
from pathlib import Path
from dotenv import load_dotenv
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

In [None]:
# Add ".." (relative to the working directory) to the module search path.
sys.path.append("..")

# Load environment variables now; praw needs them later.
settings_env_path = Path("../settings.env")
load_dotenv(dotenv_path=settings_env_path)

In [None]:
# Shared BigQuery client used by the cells below. No arguments are passed,
# so the project and credentials are whatever the client library resolves
# by default.
client = bigquery.Client()

In [None]:
# helper function to check if table exists
def gcp_table_exists(client: bigquery.Client, table_id: str):
    """Report whether ``table_id`` can be fetched through ``client``.

    Args:
        client: BigQuery client used to issue the lookup.
        table_id: ID of the table to look up.

    Returns:
        True when ``client.get_table`` succeeds, False when it raises
        ``NotFound``. Any other exception propagates to the caller.
    """
    try:
        client.get_table(table_id)  # Make an API request.
    except NotFound:
        return False
    else:
        return True

## Create Dataset
Using Google BigQuery, create the dataset for the data that is about to be loaded. The dataset will be called **AITA_dataset**.

In [None]:
# Project and dataset names used to build the dataset and table IDs below.
PROJ_NAME = "bonion"
DATASET_NAME = "AITA_dataset"

#### Get all dataset IDs
To prevent overwriting pre-existing datasets, get all dataset IDs to check before any creation step.

In [None]:
# Get all datasets in the project the dataset will be created in.
# PROJ_NAME is listed explicitly (rather than the client's default project)
# so this existence check looks in the same project that the creation step
# uses when it builds the dataset ID from PROJ_NAME.
project = PROJ_NAME
datasets = list(client.list_datasets(project=project))  # Make an API request.
dataset_ids = [dataset.dataset_id for dataset in datasets]

if datasets:
    print("Datasets in project {}:".format(project))
    for dataset_id in dataset_ids:
        print("\t{}".format(dataset_id))
else:
    print("{} project does not contain any datasets.".format(project))

In [None]:
# create dataset
# Only create the dataset when the earlier dataset listing did not find it.
if DATASET_NAME not in dataset_ids:
    dataset = bigquery.Dataset("{}.{}".format(PROJ_NAME, DATASET_NAME))
    dataset.location = "US"

    # send dataset to API for completion
    initial_post_dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.

    # Report the project recorded on the created dataset; the client's
    # default project can differ from PROJ_NAME.
    print(
        "Created dataset {}.{}".format(
            initial_post_dataset.project, initial_post_dataset.dataset_id
        )
    )

## Post Table
The initial post for each comment needs to be stored so that the content which prompted the reply can be referenced.

In [None]:
# table id definitions
# Each ID is fully qualified as <PROJ_NAME>.<DATASET_NAME>.<table name>.
post_table_id = "{}.{}.post_table".format(PROJ_NAME, DATASET_NAME)
post_reply_table_id = "{}.{}.post_reply_table".format(PROJ_NAME, DATASET_NAME)
post_reply_top_children_table_id = "{}.{}.post_reply_top_children".format(PROJ_NAME, DATASET_NAME)

#### Get all Table IDs
To prevent overwriting pre-existing tables, get all table IDs to check before any creation step.

In [None]:
# List tables through the project-qualified dataset ID so the lookup targets
# the same project as the table IDs (which are built from PROJ_NAME) instead
# of whatever project the client defaults to.
qualified_dataset_id = "{}.{}".format(PROJ_NAME, DATASET_NAME)
tables = list(client.list_tables(qualified_dataset_id))  # Make an API request.
table_ids = [table.table_id for table in tables]

print("Tables contained in '{}':".format(DATASET_NAME))
for table in tables:
    print("{}.{}.{}".format(table.project, table.dataset_id, table.table_id))

In [None]:
# Schema of the post table: every column is REQUIRED.
post_table_schema = [
    bigquery.SchemaField(field_name, field_type, mode="REQUIRED")
    for field_name, field_type in (
        ("reddit_post_id", "STRING"),
        ("post_title", "STRING"),
        ("post_self_text", "STRING"),
        ("upvotes", "INTEGER"),
        ("num_responses", "INTEGER"),
    )
]

# Create the table only when it is not already present.
post_table_missing = not gcp_table_exists(client, table_id=post_table_id)
if post_table_missing:
    post_table = bigquery.Table(post_table_id, schema=post_table_schema)
    post_table.description = """
        A table which holds popular posts from the subreddit r/AITA
    """

    table = client.create_table(post_table)
    created_message = "Created table {}.{}.{}".format(
        table.project, table.dataset_id, table.table_id
    )
    print(created_message)

## Post Reply Table
This table holds all of the responses for each of the posts in the post table.

In [None]:
# Schema of the post reply table: every column is REQUIRED.
post_reply_table_schema = [
    bigquery.SchemaField(field_name, field_type, mode="REQUIRED")
    for field_name, field_type in (
        ("reddit_post_id", "STRING"),
        ("comment_id", "STRING"),
        ("comment_contents", "STRING"),
        ("upvotes", "INTEGER"),
        ("num_responses", "INTEGER"),
    )
]

# Create the table only when it is not already present.
post_reply_table_missing = not gcp_table_exists(client, table_id=post_reply_table_id)
if post_reply_table_missing:
    post_reply_table = bigquery.Table(post_reply_table_id, schema=post_reply_table_schema)
    post_reply_table.description = """
        A table which holds the most popular replys to saved posts from the subreddit r/AITA
    """

    table = client.create_table(post_reply_table)
    created_message = "Created table {}.{}.{}".format(
        table.project, table.dataset_id, table.table_id
    )
    print(created_message)

## Post Reply Top Children
This table will hold replies to the initial post reply. It will also be used to gauge the performance of a reply. Because there can be an unbounded number of replies to any given post or reply, each post reply ID will be limited to some number of replies. These replies will likely be selected using the same "quality" metrics as the original post.

In [None]:
# Schema of the reply-children table: every column is REQUIRED.
post_reply_top_children_table_schema = [
    bigquery.SchemaField(field_name, field_type, mode="REQUIRED")
    for field_name, field_type in (
        ("parent_comment_id", "STRING"),
        ("comment_id", "STRING"),
        ("comment_contents", "STRING"),
        ("upvotes", "INTEGER"),
        ("num_responses", "INTEGER"),
    )
]

# Create the table only when it is not already present.
top_children_table_missing = not gcp_table_exists(
    client, table_id=post_reply_top_children_table_id
)
if top_children_table_missing:
    post_reply_top_children_table = bigquery.Table(
        post_reply_top_children_table_id,
        schema=post_reply_top_children_table_schema,
    )
    post_reply_top_children_table.description = """
        This table will hold replys to the initial post reply. 
        This will be used for also dictating the performance of a reply. 
        Because there can be infinite replies to any given post or reply, 
        each post reply id will be limited to some number of replies. 
        These replies will likley be based off of the same metrics as 
        the original post for "quality". 
    """

    table = client.create_table(post_reply_top_children_table)
    created_message = "Created table {}.{}.{}".format(
        table.project, table.dataset_id, table.table_id
    )
    print(created_message)

## Collect Data
Using my PrawInstance object, I will collect the top posts from the subreddit and store this data in Google BigQuery.

In [None]:
# Import the project-local praw_instance module, then reload it so this
# session re-executes the module's current source.
import praw_instance
importlib.reload(praw_instance)

In [None]:
# Subreddit name passed to the praw_instance helpers below.
subreddit_name = "amitheasshole"

In [None]:
# Instantiate the project's PrawInstance; it is passed as the first argument
# to the praw_instance helper functions in later cells.
praw_inst = praw_instance.PrawInstance()

### Store Top 1000 Posts

In [None]:
# Call get_top_by_subreddit for the subreddit with limit=1000.
# NOTE(review): the variable name says 10000 but the call requests
# limit=1000 (the section heading also says "Top 1000"). The name is kept
# because a later cell references it.
top_10000_posts = praw_instance.get_top_by_subreddit(praw_inst, subreddit_name, limit=1000)

In [None]:
# store the top 1000 posts into gcp
# Convert the collected posts into a DataFrame via post_dict_to_df; this
# DataFrame is what gets uploaded to the post table later.
post_table_df = praw_instance.post_dict_to_df(top_10000_posts)

# Preview the first 10 rows.
post_table_df.head(10)

In [None]:
# Cast each column to the Python type matching its field in the post table
# schema (STRING -> str, INTEGER -> int) before upload.
post_table_column_types = {
    "reddit_post_id": str,
    "post_title": str,
    "post_self_text": str,
    "upvotes": int,
    "num_responses": int,
}
for column_name, column_type in post_table_column_types.items():
    post_table_df[column_name] = post_table_df[column_name].astype(column_type)

# Print the resulting column dtypes and non-null counts.
post_table_df.info()

#### Push Data

In [None]:
import pandas_gbq

In [None]:
# Append the rows to the existing post table. The table is created with an
# explicit schema earlier in this notebook, and to_gbq's default
# if_exists="fail" raises when the destination table already exists.
# NOTE: re-running this cell appends the same rows again.
pandas_gbq.to_gbq(
    post_table_df,
    post_table_id,
    project_id=PROJ_NAME,
    if_exists="append",
)

### Get Top Comments and Replies for the top posts

In [None]:
# get the top post ids

# Select only the post IDs from the post table.
query = """
    SELECT reddit_post_id
    FROM {}
""".format(post_table_id)

# Call pandas_gbq.read_gbq directly: pandas.read_gbq is deprecated (pandas
# 2.2+) in favour of it, and pandas_gbq is already imported by this notebook.
post_table_ids_df = pandas_gbq.read_gbq(query, project_id=PROJ_NAME)

# Preview the first 4 rows.
post_table_ids_df.head(4)

In [None]:
# Collect one result per stored post ID, in the order of post_table_ids_df.
comment_and_reply_dict_list = []
fetch_comments_and_replies = praw_instance.get_top_comments_and_top_replies_by_post_id

for reddit_post_id in post_table_ids_df["reddit_post_id"]:
    comment_and_reply_dict = fetch_comments_and_replies(
        praw_inst,
        reddit_post_id,
        comment_limit=10,
        reply_limit=5,
    )
    comment_and_reply_dict_list.append(comment_and_reply_dict)