# Am I the Asshole EDA
This file will be dedicated to explore how to craft responses to posts in the subreddit r/AmITheAsshole. 

In [1]:
import os 
import sys
import importlib
import pandas as pd
import numpy as np
from pathlib import Path
from dotenv import load_dotenv
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

In [2]:
sys.path.append("..")

# load enviornment variables for praw to work later
load_dotenv(dotenv_path=Path("../settings.env"))

True

In [27]:
client = bigquery.Client()

In [4]:
# helper function to check if table exists
def gcp_table_exists(client: bigquery.Client, table_id: str):
    try:
        client.get_table(table_id)  # Make an API request.
        return True
    except NotFound:
        return False

## Create Dataset
Using Google Bigquery, create datasets for the data that is about to be loaded. The dataset will be called **AITA_dataset**

In [5]:
PROJ_NAME = "bonion"
DATASET_NAME = "AITA_dataset"

#### Get all dataset ID's
To prevent overwriting pre-existing datasets, get all dataset id's to check before any creation step.

In [6]:
# Get all datasets 
datasets = list(client.list_datasets())  # Make an API request.
dataset_ids = [dataset.dataset_id for dataset in datasets]
project = client.project

if datasets:
    print("Datasets in project {}:".format(project))
    for dataset in datasets:
        print("\t{}".format(dataset.dataset_id))
else:
    print("{} project does not contain any datasets.".format(project))

Datasets in project bonion:
	AITA_dataset


In [7]:
# create dataset
if DATASET_NAME not in dataset_ids:
    dataset = bigquery.Dataset("{}.{}".format(PROJ_NAME, DATASET_NAME))
    dataset.location = "US"

    # send dataset to API for completion
    initial_post_dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.
    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

## Post Table
The initial post for each of the comments will need to be stored to reference its content which dictated the reply.

In [8]:
# table id definitions
post_table_id = "{}.{}.post_table".format(PROJ_NAME, DATASET_NAME)
post_reply_table_id = "{}.{}.post_reply_table".format(PROJ_NAME, DATASET_NAME)
post_reply_top_children_table_id = "{}.{}.post_reply_top_children".format(PROJ_NAME, DATASET_NAME)

#### Get all Table ID's
To prevent overwriting pre-existing tables, get all table id's to check before any creation step.

In [9]:
tables = list(client.list_tables(DATASET_NAME))  # Make an API request.
table_ids = [table.table_id for table in tables]

print("Tables contained in '{}':".format(DATASET_NAME))
for table in tables:
    print("{}.{}.{}".format(table.project, table.dataset_id, table.table_id))

Tables contained in 'AITA_dataset':
bonion.AITA_dataset.post_reply_table
bonion.AITA_dataset.post_reply_top_children
bonion.AITA_dataset.post_table


In [10]:
post_table_schema = [
    bigquery.SchemaField("reddit_post_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("post_title", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("post_self_text", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("upvotes", "INTEGER", mode="REQUIRED"),
    bigquery.SchemaField("num_responses", "INTEGER", mode="REQUIRED")
]

if not gcp_table_exists(client, table_id=post_table_id): 
    post_table = bigquery.Table(post_table_id, schema=post_table_schema)
    post_table.description = """
        A table which holds popular posts from the subreddit r/AITA
    """

    table = client.create_table(post_table)
    print(
        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
    )

## Post Reply Table
This table holds all of the responses for each of the posts in the post table.

In [11]:
post_reply_table_schema = [
    bigquery.SchemaField("reddit_post_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("comment_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("comment_contents", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("upvotes", "INTEGER", mode="REQUIRED"),
    bigquery.SchemaField("num_responses", "INTEGER", mode="REQUIRED")
]

if not gcp_table_exists(client, table_id=post_reply_table_id): 
    post_reply_table = bigquery.Table(post_reply_table_id, schema=post_reply_table_schema)
    post_reply_table.description = """
        A table which holds the most popular replys to saved posts from the subreddit r/AITA
    """

    table = client.create_table(post_reply_table)
    print(
        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
    )

## Post Reply Top Children
This table will hold replys to the initial post reply. This will be used for also dictating the performance of a reply. Because there can be infinite replies to any given post or reply, each post reply id will be limited to some number of replies. These replies will likley be based off of the same metrics as the original post for "quality". 

In [12]:
post_reply_top_children_table_schema = [
    bigquery.SchemaField("parent_comment_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("comment_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("comment_contents", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("upvotes", "INTEGER", mode="REQUIRED"),
    bigquery.SchemaField("num_responses", "INTEGER", mode="REQUIRED")
]   

if not gcp_table_exists(client, table_id=post_reply_top_children_table_id): 
    post_reply_top_children_table = bigquery.Table(post_reply_top_children_table_id, schema=post_reply_top_children_table_schema)
    post_reply_top_children_table.description = """
        This table will hold replys to the initial post reply. 
        This will be used for also dictating the performance of a reply. 
        Because there can be infinite replies to any given post or reply, 
        each post reply id will be limited to some number of replies. 
        These replies will likley be based off of the same metrics as 
        the original post for "quality". 
    """

    table = client.create_table(post_reply_top_children_table)
    print(
        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
    )

## Collect Data
Using my PrawInstance Object, I will be collecting the top posts from the subreddit. I will be storing this data in google bigquery.

In [13]:
import praw_instance
importlib.reload(praw_instance)

<module 'praw_instance' from '/home/cstainsby/class/dataProj/bonion/src/notebooks/../praw_instance.py'>

In [14]:
subreddit_name = "amitheasshole"

In [15]:
praw_inst = praw_instance.PrawInstance()

In [16]:
top_10000_posts = praw_instance.get_top_by_subreddit(praw_inst, subreddit_name, limit=10000)

In [17]:
# store the top 1000 posts into gcp
# convert data into json rows 
post_table_df = praw_instance.post_dict_to_df(top_10000_posts)

post_table_df.head(10)

full data shape (998, 5)


Unnamed: 0,reddit_post_id,post_title,post_self_text,upvotes,num_responses
0,ocx94s,AITA for telling my wife the lock on my daught...,My brother in-law (Sammy) lost his home shortl...,81021,5279
1,d6xoro,META: This sub is moving towards a value syste...,I’ve enjoyed reading and posting on this sub f...,80910,6190
2,azvko1,"UPDATE, AITA for despising my mentally handica...","I'm back like I said I would be,. My [original...",72780,1985
3,gr8bp3,AITA For suing my girlfriend after she had my ...,I'll try to keep this short. I had a [1967 Imp...,70814,2757
4,x2k5kv,AITA for bringing my SIL’s wallet to the resta...,Edit: update on profile\n\nMy (f28) SIL “Amy” ...,69793,3822
5,cjetsa,UPDATE: AITA for wanting to go to the funeral ...,I want to sincerely thank everyone who comment...,67572,2
6,e5k3z2,AITA for pretending to get fired when customer...,I am a high schooler with a weekend job at a c...,63528,3621
7,zvmflw,"AITA for bringing up my brother's ""premature"" ...",I am a nurse practitioner and I am the primary...,59171,3370
8,flan73,UPDATE: WIBTA if I took over planning my own f...,"Hello, everyone. First of all, thank you all f...",58489,2
9,dhfeg9,AITA for making a dad joke?,"Note. My step-daughter, Madeline, was about a ...",56954,1995


In [28]:
# job_config = bigquery.LoadJobConfig(schema=post_table_schema)
errors = client.insert_rows_from_dataframe(
    post_table_id,
    post_table_df
)

ValueError: Could not determine schema for table 'bonion.AITA_dataset.post_table'. Call client.get_table() or pass in a list of schema fields to the selected_fields argument.