# Create GCP Tables For AITA Data
**NOTE:** Do not run this file again

This is exclusively for getting my tables up and running.

In [1]:
import sys
import importlib
import pandas as pd
import numpy as np
from pathlib import Path
from dotenv import load_dotenv
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

In [2]:
# add the parent directory (relative to the working directory) to the import search path
sys.path.append("..")

# load environment variables for praw to work later
# (read from settings.env, resolved relative to the current working directory)
load_dotenv(dotenv_path=Path("./settings.env"))

True

In [3]:
# helper function to check if table exists
def gcp_table_exists(client: bigquery.Client, table_id: str):
    """Report whether ``table_id`` can be fetched through ``client``.

    Args:
        client: BigQuery client used to issue the lookup.
        table_id: Identifier of the table to look up, passed unchanged to
            ``client.get_table``.

    Returns:
        ``True`` when the lookup succeeds, ``False`` when it raises
        ``NotFound``. Any other exception propagates to the caller.
    """
    try:
        # One API round-trip; the returned table object itself is not needed.
        client.get_table(table_id)
    except NotFound:
        return False
    else:
        return True

## Create Dataset
Using Google BigQuery, create the dataset for the data that is about to be loaded. The dataset will be called **AITA_dataset**.

In [4]:
# project component used when building the fully-qualified dataset and table ids
PROJ_NAME = "bonion"
# name of the BigQuery dataset that the tables in this notebook live in
DATASET_NAME = "AITA_dataset"

In [5]:
# BigQuery client shared by every API request in this notebook
# NOTE(review): no project or credentials are passed here, so they presumably
# come from the environment - confirm
client = bigquery.Client()

In [7]:
# table id definitions
# each id has the form <PROJ_NAME>.<DATASET_NAME>.<table name>
post_table_id = "{}.{}.post_table".format(PROJ_NAME, DATASET_NAME)
comment_table_id = "{}.{}.comment_table".format(PROJ_NAME, DATASET_NAME)
reply_table_id = "{}.{}.reply_table".format(PROJ_NAME, DATASET_NAME)

#### Get all dataset IDs
To prevent overwriting pre-existing datasets, get all dataset IDs to check before any creation step.

In [6]:
# Snapshot the datasets visible to the client so the creation step can check
# for an existing dataset before creating anything.
project = client.project
datasets = list(client.list_datasets())  # Make an API request.
dataset_ids = [ds.dataset_id for ds in datasets]

# Report what was found (or that nothing was found).
if not datasets:
    print("{} project does not contain any datasets.".format(project))
else:
    print("Datasets in project {}:".format(project))
    for dataset in datasets:
        print("\t{}".format(dataset.dataset_id))

Datasets in project bonion:
	AITA_dataset


#### Get all Table IDs
To prevent overwriting pre-existing tables, get all table IDs to check before any creation step.

In [8]:
# List the tables that already exist in the dataset.
# The listing is lazy, so the API request (and any NotFound error) happens
# while list() consumes the iterator - hence the try wraps the list() call.
try:
    tables = list(client.list_tables(DATASET_NAME))  # Make an API request.
except NotFound:
    # The dataset has not been created yet (e.g. first run against a fresh
    # project), so there are no tables to report.
    tables = []
table_ids = [table.table_id for table in tables]

print("Tables contained in '{}':".format(DATASET_NAME))
for table in tables:
    print("{}.{}.{}".format(table.project, table.dataset_id, table.table_id))

Tables contained in 'AITA_dataset':
bonion.AITA_dataset.comment_table
bonion.AITA_dataset.post_table
bonion.AITA_dataset.reply_table


#### Create Dataset if it doesn't exist

In [None]:
# create dataset
# Only runs when the dataset name was not present in the earlier listing.
if DATASET_NAME not in dataset_ids:
    dataset = bigquery.Dataset("{}.{}".format(PROJ_NAME, DATASET_NAME))
    dataset.location = "US"

    # send dataset to API for completion
    initial_post_dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.
    # Report the project/id from the dataset the API returned: the dataset is
    # created under PROJ_NAME, which need not equal the client's default project.
    print(
        "Created dataset {}.{}".format(
            initial_post_dataset.project, initial_post_dataset.dataset_id
        )
    )

## Post Table
The original post behind each comment needs to be stored so that the content which prompted the reply can be referenced.

In [None]:
# Schema of the post table. Every column is REQUIRED, so the fields are built
# from (column name, column type) pairs.
post_table_schema = [
    bigquery.SchemaField(column, column_type, mode="REQUIRED")
    for column, column_type in (
        ("reddit_post_id", "STRING"),
        ("post_title", "STRING"),
        ("post_self_text", "STRING"),
        ("upvotes", "INTEGER"),
        ("num_responses", "INTEGER"),
    )
]

# Create the table only when the lookup reports that it is missing.
if not gcp_table_exists(client, post_table_id):
    post_table = bigquery.Table(post_table_id, schema=post_table_schema)
    post_table.description = """
        A table which holds popular posts from the subreddit r/AITA
    """

    # Make an API request; the returned table is used for the confirmation message.
    table = client.create_table(post_table)
    print("Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id))

## Comment Table
This table holds all of the responses for each of the posts in the post table.

In [None]:
# Schema of the comment table. Every column is REQUIRED, so the fields are
# built from (column name, column type) pairs.
comment_table_schema = [
    bigquery.SchemaField(column, column_type, mode="REQUIRED")
    for column, column_type in (
        ("comment_id", "STRING"),
        ("parent_id", "STRING"),
        ("content", "STRING"),
        ("upvotes", "INTEGER"),
    )
]

# Create the table only when the lookup reports that it is missing.
if not gcp_table_exists(client, comment_table_id):
    post_reply_table = bigquery.Table(comment_table_id, schema=comment_table_schema)
    post_reply_table.description = """
        A table which holds the most popular replys to saved posts from the subreddit r/AITA
    """

    # Make an API request; the returned table is used for the confirmation message.
    table = client.create_table(post_reply_table)
    print("Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id))

## Reply Table
This table will hold replies to the initial post reply. It will also be used to help determine how well a reply performed. Because there can be arbitrarily many replies to any given post or reply, each post reply id will be limited to some number of replies. These replies will likely be selected using the same "quality" metrics as the original post.

In [None]:
# Schema of the reply table. Every column is REQUIRED, so the fields are built
# from (column name, column type) pairs.
reply_table_schema = [
    bigquery.SchemaField(column, column_type, mode="REQUIRED")
    for column, column_type in (
        ("reply_id", "STRING"),
        ("parent_id", "STRING"),
        ("content", "STRING"),
        ("upvotes", "INTEGER"),
    )
]

# Create the table only when the lookup reports that it is missing.
if not gcp_table_exists(client, reply_table_id):
    post_reply_top_children_table = bigquery.Table(reply_table_id, schema=reply_table_schema)
    post_reply_top_children_table.description = """
        This table will hold replys to the initial post reply. 
        This will be used for also dictating the performance of a reply. 
        Because there can be infinite replies to any given post or reply, 
        each post reply id will be limited to some number of replies. 
        These replies will likley be based off of the same metrics as 
        the original post for "quality". 
    """

    # Make an API request; the returned table is used for the confirmation message.
    table = client.create_table(post_reply_top_children_table)
    print("Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id))