In [17]:
import json
import os
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
from dotenv import load_dotenv
from sqlalchemy import create_engine

#### Check if data files have been extracted

In theory, this notebook should be run after `EDA.ipynb`, where we perform our initial exploratory data analysis.
In that notebook, we unzip the compressed data file into two `.ndjson` files, one for submission data and one for comment data.

In [3]:
# Both extracted files live side by side in ../data/extracted_data.
comments_data_path, submissions_data_path = (
    os.path.join("..", "data", "extracted_data", file_name)
    for file_name in ("comments_data.ndjson", "submissions_data.ndjson")
)

# Stop here if either file has not been extracted yet.
assert all(
    os.path.isfile(path) is True
    for path in (comments_data_path, submissions_data_path)
)

In [9]:
def _write_batch(db_engine, records: list, table_name: str, selected_columns: list[str]) -> None:
    """Append one batch of parsed JSON records to ``table_name``."""
    df = pd.DataFrame(records)
    for col in selected_columns:
        # NOTE: some json entries may not contain the selected column;
        # we fill that column with NA so every batch has the same columns
        if col not in df.columns:
            df[col] = pd.NA

    # only keep the relevant columns, in the requested order
    df[selected_columns].to_sql(table_name, db_engine, if_exists='append', index=False)


def upload_to_db(db_engine, data_file: str, table_name: str, selected_columns: list[str], chunk_size: int = 1000):
    """
    Upload a newline-delimited JSON file to a database table in batches.

    The file is read line by line; every ``chunk_size`` parsed records are
    converted to a DataFrame restricted to ``selected_columns`` and appended
    to ``table_name`` with ``DataFrame.to_sql``. Blank lines are skipped, and
    the records left over after the last full batch are written as well.

    Parameters
    ----------
    db_engine
        Connectable handed to ``DataFrame.to_sql`` as its ``con`` argument.
    data_file : str
        Path to an existing ``.ndjson`` file (one JSON object per line).
    table_name : str
        Name of the destination table; rows are appended to it.
    selected_columns : list[str]
        Columns to upload. A column absent from a batch is filled with NA.
    chunk_size : int, default 1000
        Maximum number of records written per ``to_sql`` call.

    Raises
    ------
    AssertionError
        If ``data_file`` does not exist or does not end with ``.ndjson``.
    ValueError
        If ``chunk_size`` is smaller than 1.
    """

    assert os.path.isfile(data_file) is True
    assert data_file.endswith('.ndjson') is True
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be at least 1, got {chunk_size}')

    it = 0
    batch = []
    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(data_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()

            # a blank line is not the end of the file, just skip it
            if not line:
                continue

            batch.append(json.loads(line))

            if len(batch) == chunk_size:
                it += 1
                _write_batch(db_engine, batch, table_name, selected_columns)
                print(f'iteration #{it}: data written to {table_name} successfully.')
                batch = []

    # flush the records left over after the last full batch
    if batch:
        it += 1
        _write_batch(db_engine, batch, table_name, selected_columns)
        print(f'iteration #{it}: data written to {table_name} successfully.')

#### Create database engine

In [10]:
# Read the .env file one directory above the working directory; the DB_*
# variables used below are looked up with os.getenv after this call.
dotenv_path = '../.env'
load_dotenv(dotenv_path)

def get_db_url(db_name):
    """
    Build the SQLAlchemy URL for a PostgreSQL database accessed via psycopg2.

    Credentials and location are read from the environment variables
    ``DB_USERNAME``, ``DB_PASSWORD``, ``DB_HOST`` and ``DB_PORT``.

    Parameters
    ----------
    db_name : str
        Name of the database to connect to.

    Returns
    -------
    str
        URL of the form ``postgresql+psycopg2://user:password@host:port/db_name``.

    Raises
    ------
    ValueError
        If any of the required environment variables is not set.
    """
    required = ('DB_USERNAME', 'DB_PASSWORD', 'DB_HOST', 'DB_PORT')
    settings = {name: os.getenv(name) for name in required}

    # fail early instead of silently embedding the text "None" in the URL
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise ValueError(f"missing environment variable(s): {', '.join(missing)}")

    db_str = 'postgresql+psycopg2://{}:{}@{}:{}/{}'
    return db_str.format(
        # characters such as '@', ':' or '/' in the credentials would otherwise
        # be parsed as URL delimiters, so percent-encode them
        quote_plus(settings['DB_USERNAME']),
        quote_plus(settings['DB_PASSWORD']),
        settings['DB_HOST'],
        settings['DB_PORT'],
        db_name
    )

db_url = get_db_url('reddit')
db_engine = create_engine(db_url)

# create_engine() does not connect by itself, so open (and immediately
# release) one connection here: wrong credentials or an unreachable host then
# fail in this cell instead of in the middle of an upload.
with db_engine.connect():
    pass

#### Upload data to database

Comments:

In [None]:
# Comment attributes kept for the database, one per line.
# The selection comes from the EDA we did on the Reddit dataset.
relevant_comment_columns = [
    "id",
    "archived",
    "author",
    "author_created_utc",
    "author_fullname",
    "body",
    "controversiality",
    "created_utc",
    "downs",
    "locked",
    "name",
    "num_reports",
    "parent_id",
    "permalink",
    "retrieved_on",
    "score",
    "subreddit",
    "subreddit_id",
    "subreddit_name_prefixed",
    "subreddit_type",
    "total_awards_received",
    "updated_on",
    "ups",
]

upload_to_db(
    db_engine,
    comments_data_path,
    'comments_test',
    relevant_comment_columns,
)

Submissions:

In [None]:
# Submission attributes kept for the database.
# The selection is the result of the EDA we did on the Reddit dataset.
relevant_submission_columns = [
    "id", 
    "archived", 
    "author", 
    "author_created_utc", 
    "author_fullname", 
    "created_utc", 
    "downs", 
    "is_self", 
    "locked", 
    "name", 
    "num_comments", 
    "num_crossposts", 
    "num_reports", 
    "permalink", 
    "score", 
    "selftext", 
    "spoiler", 
    "subreddit", 
    "subreddit_id", 
    "subreddit_name_prefixed", 
    "subreddit_subscribers", 
    "subreddit_type", 
    "title", 
    "total_awards_received", 
    "ups", 
    "upvote_ratio", 
    "url"
]

# Append the selected submission columns to the `submissions_test` table.
upload_to_db(db_engine, submissions_data_path, 'submissions_test', relevant_submission_columns)

### Normalize Reddit dataset

#### `Subreddit`
From inspection of the two data tables `submissions` and `comments` and their attributes, it appears that the dataset is fairly denormalized.
For instance, entries in both `submissions` and `comments` have the attributes `subreddit`, `subreddit_id`, `subreddit_name_prefixed` and `subreddit_type`. Entries in `submissions` also contain `subreddit_subscribers`, the number of subscribers to the subreddit, which might be useful for tracking the number of subscribers to a subreddit over time. `subreddit_name_prefixed` is the prefixed subreddit name, which can easily be constructed from `subreddit` in the format `r/[subreddit]`; it is therefore not very relevant and can be excluded.


From these observations, `subreddit` can be converted into its own entity set, which makes it easier to retrieve information about subreddits. This makes sense because our dataset only contains posts and comments from 13 different subreddits.

In the code below, we further explore the subreddit information stored in both `submissions` and `comments` and create a separate table for `subreddit`.

In [19]:
# Subreddit-related attributes found in the uploaded data.
# NOTE: in this notebook `subreddit_subscribers` is only selected for
# submissions; it is not in the list of comment columns.
subreddit_columns = [
    "subreddit", 
    "subreddit_id", 
    "subreddit_name_prefixed", 
    "subreddit_subscribers", 
    "subreddit_type" 
]

#### Connect to database using `psycopg2`

In [20]:
# Open a psycopg2 connection to the `reddit` database, using the same DB_*
# environment variables that get_db_url() reads.
conn = psycopg2.connect(
    dbname='reddit',
    user=os.getenv('DB_USERNAME'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    port=os.getenv('DB_PORT')
)

# Cursor on that connection, kept in `cur` for the cells that follow.
cur = conn.cursor()

Query subreddits from `submissions` and `comments`, union them to find all subreddits, and store the result in a new `subreddit` table.
The attributes of `subreddit` include `subreddit_name`, `subreddit_id`, `subreddit_name_prefixed` and `subreddit_type`.

In [None]:
# TODO: unfinished query - the statement only contains the SELECT keyword so
# far and is not executed anywhere in the visible cells.
sql_cmd = '''

SELECT 


'''

#### Users