In [1]:
import os
import json
from dotenv import load_dotenv

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

#### Check if data files have been extracted

This notebook is meant to be run after `EDA.ipynb`, where we perform our initial exploratory data analysis.
In that notebook, we unzip the compressed data file into two `.ndjson` files: one for submission data and one for comment data.

In [2]:
# Both extracted files live side by side in ../data/extracted_data.
extracted_data_dir = os.path.join("..", "data", "extracted_data")
comments_data_path = os.path.join(extracted_data_dir, "comments_data.ndjson")
submissions_data_path = os.path.join(extracted_data_dir, "submissions_data.ndjson")

# Fail fast if either extracted file is missing.
for required_file in (comments_data_path, submissions_data_path):
    assert os.path.isfile(required_file)

In [9]:
def _write_batch(db_engine, table_name: str, selected_columns: list[str], records: list[dict], batch_no: int) -> None:
    """Append one batch of parsed JSON records to `table_name`, keeping only `selected_columns`."""
    df = pd.DataFrame(records)
    # only select relevant columns
    for col in selected_columns:
        # NOTE: some json entries may not contain the selected column,
        # we fill that column with NA
        if col not in df.columns:
            df[col] = pd.NA

    df = df[selected_columns]
    df.to_sql(table_name, db_engine, if_exists='append', index=False)
    print(f'iteration #{batch_no}: data written to {table_name} successfully.')


def upload_to_db(db_engine, data_file: str, table_name: str, selected_columns: list[str], chunk_size: int = 1000):
    """
    Stream an `.ndjson` file into a database table, `chunk_size` records at a time.

    Each non-blank line of `data_file` is parsed as one JSON object. Records are
    collected into batches, reduced to `selected_columns` (columns absent from a
    batch are filled with NA) and appended to `table_name`.

    The final batch is written even when it holds fewer than `chunk_size`
    records, and blank lines are skipped instead of being treated as end of file.

    Rows are always appended (`if_exists='append'`), so calling this twice on the
    same file inserts every row twice.

    Args:
        db_engine: connection/engine object handed to `DataFrame.to_sql`.
        data_file: path to an existing file whose name ends with `.ndjson`.
        table_name: name of the destination table.
        selected_columns: columns to keep, in the order they are written.
        chunk_size: maximum number of records per write (default 1000).

    Raises:
        AssertionError: if `data_file` does not exist, is not a `.ndjson` file,
            or `chunk_size` is not positive.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """

    assert os.path.isfile(data_file) is True, f'data file not found: {data_file}'
    assert data_file.endswith('.ndjson') is True, f'expected a .ndjson file, got: {data_file}'
    assert chunk_size > 0, 'chunk_size must be a positive integer'

    batch = []
    it = 1
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(data_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()

            # a blank line carries no record; keep reading until the real end of file
            if not line:
                continue

            batch.append(json.loads(line))

            if len(batch) >= chunk_size:
                _write_batch(db_engine, table_name, selected_columns, batch, it)
                it += 1
                batch = []

    # flush the trailing partial batch (fewer than chunk_size records)
    if batch:
        _write_batch(db_engine, table_name, selected_columns, batch, it)

#### Create database engine

In [5]:
# Path of the .env file, one directory above the current working directory.
# Presumably it supplies the DB_USERNAME / DB_PASSWORD / DB_HOST / DB_PORT values
# that this notebook reads through os.getenv -- confirm against the actual .env file.
dotenv_path = '../.env'
load_dotenv(dotenv_path)

def get_db_url(db_name):
    """
    Build the `postgresql+psycopg2` connection URL for `db_name`.

    Username, password, host and port are read from the environment variables
    DB_USERNAME, DB_PASSWORD, DB_HOST and DB_PORT.

    The username and password are percent-encoded so that characters such as
    `@`, `:` or `/` cannot be mistaken for URL delimiters.

    Args:
        db_name: name of the database to connect to.

    Returns:
        The connection URL as a string.

    Raises:
        ValueError: if any of the four environment variables is unset. Without
            this check the literal text "None" would be embedded in the URL.
    """
    from urllib.parse import quote

    env_names = ('DB_USERNAME', 'DB_PASSWORD', 'DB_HOST', 'DB_PORT')
    settings = {name: os.getenv(name) for name in env_names}

    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise ValueError(
            'missing database environment variable(s): ' + ', '.join(missing)
        )

    db_str = 'postgresql+psycopg2://{}:{}@{}:{}/{}'
    return db_str.format(
        # safe='' so that '/' is escaped as well
        quote(settings['DB_USERNAME'], safe=''),
        quote(settings['DB_PASSWORD'], safe=''),
        settings['DB_HOST'],
        settings['DB_PORT'],
        db_name
    )

# Build the connection URL for the 'reddit' database and create the engine from it.
# `db_engine` is the object later passed to upload_to_db.
db_url = get_db_url('reddit')
db_engine = create_engine(db_url)

# TODO: ensure db_engine is successfully created

#### Upload data to database

Comments:

In [None]:
# Comment attributes worth persisting, as identified during the EDA of the
# Reddit dataset. The list order is the column order handed to upload_to_db.
relevant_comment_columns = [
    "id",
    "archived",
    "author",
    "author_created_utc",
    "author_fullname",
    "body",
    "controversiality",
    "created_utc",
    "downs",
    "locked",
    "name",
    "num_reports",
    "parent_id",
    "permalink",
    "retrieved_on",
    "score",
    "subreddit",
    "subreddit_id",
    "subreddit_name_prefixed",
    "subreddit_type",
    "total_awards_received",
    "updated_on",
    "ups",
]

upload_to_db(
    db_engine,
    comments_data_path,
    'comments',
    relevant_comment_columns,
)

Submissions:

In [None]:
# Submission attributes worth persisting, as identified during the EDA of the
# Reddit dataset. The list order is the column order handed to upload_to_db.
relevant_submission_columns = [
    "id", "archived", "author", "author_created_utc", "author_fullname",
    "created_utc", "downs", "is_self", "locked", "name",
    "num_comments", "num_crossposts", "num_reports", "permalink",
    "score", "selftext", "spoiler",
    "subreddit", "subreddit_id", "subreddit_name_prefixed",
    "subreddit_subscribers", "subreddit_type",
    "title", "total_awards_received", "ups", "upvote_ratio", "url",
]

upload_to_db(
    db_engine,
    submissions_data_path,
    'submissions',
    relevant_submission_columns,
)

### Normalize Reddit dataset

#### `Subreddit`
From inspection of the two data tables `submissions` and `comments` and their attributes, it appears that the dataset is fairly denormalized.
For instance, entries in both `submissions` and `comments` have the attributes `subreddit`, `subreddit_id`, `subreddit_name_prefixed`, and `subreddit_type`. Entries in `submissions` also contain `subreddit_subscribers`, the number of subscribers to the subreddit, which might be useful for tracking a subreddit's subscriber count over time.


From these observations, `subreddit` can be regarded as a separate entity set, which makes retrieving information about subreddits easier. This makes sense because our dataset only contains posts and comments from 13 different subreddits.

In the code below, we further explore the subreddit information stored in both `submissions` and `comments` and create a separate table for `subreddit`.

In [6]:
# Attributes that describe a subreddit rather than an individual post or comment.
subreddit_columns = [
    "subreddit", "subreddit_id", "subreddit_name_prefixed",
    "subreddit_subscribers", "subreddit_type",
]

Connect to database using `psycopg2`:

In [44]:
# Collect the connection settings first: the database name is fixed, the
# credentials and address come from the environment.
connection_settings = {
    'dbname': 'reddit',
    'user': os.getenv('DB_USERNAME'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST'),
    'port': os.getenv('DB_PORT'),
}

# Open the connection and the cursor used by the SQL cells of this notebook.
conn = psycopg2.connect(**connection_settings)
cur = conn.cursor()

The following SQL query retrieves, for each subreddit, the most recent submission in which all subreddit data is available. This row represents the latest "subreddit data fetch" and therefore provides the most up-to-date information about that subreddit.

In [30]:
# For every subreddit, keep only the row with the highest created_utc among the
# submissions that carry both subreddit_name_prefixed and subreddit_type
# (DISTINCT ON keeps the first row of each group under the ORDER BY).
sql_query = '''

SELECT DISTINCT ON (subreddit) created_utc, subreddit, subreddit_id, subreddit_name_prefixed, subreddit_type
FROM submissions
WHERE subreddit_name_prefixed IS NOT NULL 
    AND subreddit_type IS NOT NULL
ORDER BY subreddit, created_utc DESC

'''

cur.execute(sql_query)
results = cur.fetchall()
# Print one tuple per subreddit.
for row in results:
    print(row)

(1704059586, 'AcademicPsychology', 't5_2sluh', 'r/AcademicPsychology', 'public')
(1704065506, 'ArtificialInteligence', 't5_3crzr', 'r/ArtificialInteligence', 'public')
(1704057226, 'ChatGPTCoding', 't5_7ipnaj', 'r/ChatGPTCoding', 'public')
(1704057460, 'climatechange', 't5_2rawx', 'r/climatechange', 'public')
(1704042913, 'cogsci', 't5_2qh0k', 'r/cogsci', 'public')
(1703999262, 'edtech', 't5_2r5zc', 'r/edtech', 'public')
(1703980527, 'fintech', 't5_2u7f1', 'r/fintech', 'public')
(1704043506, 'ImaginaryTechnology', 't5_2tf7t', 'r/ImaginaryTechnology', 'public')
(1704066515, 'NLP', 't5_2qqpg', 'r/NLP', 'public')
(1703866977, 'stocks', 't5_2qjfk', 'r/stocks', 'public')
(1704061306, 'StocksAndTrading', 't5_2wwow', 'r/StocksAndTrading', 'public')
(1704042061, 'stockstobuytoday', 't5_3hczbd', 'r/stockstobuytoday', 'public')


Since this query works on both `submissions` and `comments`, which have the same subreddit data attributes, we will combine their results and select the most recent entries. This will create a table called `subreddits` that contains the latest subreddit data.

In [41]:
# Build the `subreddits` table: pool the subreddit attributes found in both
# `submissions` and `comments`, then keep, for every subreddit, the row with the
# most recent created_utc (stored as last_updated_utc).
# NULLS LAST is required because PostgreSQL sorts NULLs first under DESC, which
# would otherwise let a row without a timestamp win the DISTINCT ON.
sql_query = '''

CREATE TABLE IF NOT EXISTS subreddits AS
WITH submissions_comments AS (
    SELECT created_utc, subreddit, subreddit_id, subreddit_name_prefixed, subreddit_type
    FROM submissions
    WHERE subreddit_name_prefixed IS NOT NULL
      AND subreddit_type IS NOT NULL

    UNION ALL

    SELECT created_utc, subreddit, subreddit_id, subreddit_name_prefixed, subreddit_type
    FROM comments
    WHERE subreddit_name_prefixed IS NOT NULL
      AND subreddit_type IS NOT NULL
)

SELECT DISTINCT ON (subreddit) created_utc as last_updated_utc, subreddit, subreddit_id, subreddit_name_prefixed, subreddit_type
FROM submissions_comments
ORDER BY subreddit, created_utc DESC NULLS LAST;

'''

cur.execute(sql_query)
# psycopg2 runs statements inside an implicit transaction unless autocommit is
# enabled; commit so the new table survives beyond this connection.
conn.commit()

##### Query `subreddits` table

In [42]:
# Read back every row of the `subreddits` table to inspect its contents.
sql_query = '''
SELECT * FROM subreddits;
'''

cur.execute(sql_query)
results = cur.fetchall()
# Print one tuple per row.
for row in results:
    print(row)

(1704062760, 'AcademicPsychology', 't5_2sluh', 'r/AcademicPsychology', 'public')
(1704067109, 'ArtificialInteligence', 't5_3crzr', 'r/ArtificialInteligence', 'public')
(1704065858, 'ChatGPTCoding', 't5_7ipnaj', 'r/ChatGPTCoding', 'public')
(1704067180, 'climatechange', 't5_2rawx', 'r/climatechange', 'public')
(1704056457, 'cogsci', 't5_2qh0k', 'r/cogsci', 'public')
(1704061327, 'edtech', 't5_2r5zc', 'r/edtech', 'public')
(1704022970, 'fintech', 't5_2u7f1', 'r/fintech', 'public')
(1704055582, 'ImaginaryTechnology', 't5_2tf7t', 'r/ImaginaryTechnology', 'public')
(1704066515, 'NLP', 't5_2qqpg', 'r/NLP', 'public')
(1703866977, 'stocks', 't5_2qjfk', 'r/stocks', 'public')
(1704062761, 'StocksAndTrading', 't5_2wwow', 'r/StocksAndTrading', 'public')
(1704042061, 'stockstobuytoday', 't5_3hczbd', 'r/stockstobuytoday', 'public')


#### `User`

In [45]:
# Build the `users` table: one row per distinct author seen in `submissions` or
# `comments`, preferring the row with the highest known author_created_utc.
# NULLS LAST is required because PostgreSQL sorts NULLs first under DESC, which
# would otherwise make DISTINCT ON pick a row without a creation time even when
# another row for the same author has one.
# The former COALESCE(author_created_utc, NULL) was a no-op and has been dropped.
sql_query = '''

CREATE TABLE IF NOT EXISTS users AS
WITH submissions_comments AS (
    SELECT author, author_created_utc, author_fullname
    FROM submissions

    UNION ALL

    SELECT author, author_created_utc, author_fullname
    FROM comments
)
SELECT DISTINCT ON (author)
    author,
    author_created_utc,
    author_fullname
FROM submissions_comments
ORDER BY author, author_created_utc DESC NULLS LAST;

'''

cur.execute(sql_query)
# psycopg2 runs statements inside an implicit transaction unless autocommit is
# enabled; commit so the new table survives beyond this connection.
conn.commit()