In [None]:
# %pip install pandas
# %pip install python-dotenv
# %pip install psycopg2-binary
# %pip install SQLAlchemy

In [27]:
import zipfile
import os

# Locations of the two NDJSON files that the extraction step writes under
# ../data/extracted_data.
comments_data_path, submissions_data_path = (
    os.path.join("..", "data", "extracted_data", f"{dataset}_data.ndjson")
    for dataset in ("comments", "submissions")
)


## Extract the Data

In [1]:
def extract_zip(zip_path, extract_to):
    """
    Unpack every member of a ZIP archive into a target directory.

    The target directory is created first (including parents) if it does not
    exist yet. Failures are reported with a printed message instead of being
    raised, so the function always returns None.

    Args:
        zip_path (str): Path to the ZIP file.
        extract_to (str): Directory to extract the contents to.
    """
    try:
        # Create the destination up front; this happens even if opening the
        # archive fails afterwards.
        os.makedirs(extract_to, exist_ok=True)

        with zipfile.ZipFile(zip_path, mode="r") as archive:
            archive.extractall(path=extract_to)
            print(f"Successfully extracted {zip_path} to {extract_to}")
    except (FileNotFoundError, zipfile.BadZipFile) as known_error:
        # The two expected failure modes each get their own message.
        if isinstance(known_error, FileNotFoundError):
            print(f"Error: The file {zip_path} was not found.")
        else:
            print(f"Error: The file {zip_path} is not a valid ZIP archive.")
    except Exception as e:
        # Anything else is reported, not re-raised.
        print(f"An unexpected error occurred: {e}")

# Example usage
# Unpack the Reddit archive into ../data/extracted_data.
output_folder = os.path.join("..", "data", "extracted_data")  # destination directory; adjust if needed
zip_file_path = os.path.join("..", "data", "reddit_data.zip")  # archive to unpack; adjust if needed

extract_zip(zip_path=zip_file_path, extract_to=output_folder)

Successfully extracted ..\data\reddit_data.zip to ..\data\extracted_data


## Explore the Data

Since both datasets are extremely large, we will sample a smaller chunk to see what the JSON looks like across a few random samples.

In [4]:
import json
import pandas as pd
import random

def sample_ndjson(file_path, sample_size=1000, seed=None):
    """
    Randomly sample records from an NDJSON file without loading it all into memory.

    The file is streamed once and a uniform random sample is maintained with
    reservoir sampling, so at most ``sample_size`` lines are held in memory at
    any time. Blank lines are skipped *before* sampling, so they never reduce
    the number of rows returned. Rows come back in reservoir order, which is
    not the order in which they appear in the file.

    Args:
        file_path (str): Path to the NDJSON file. It is read as UTF-8.
        sample_size (int): Maximum number of records to sample. If the file
            holds fewer non-blank lines than this, all of them are returned.
        seed (int, optional): Seed for a private random generator, making the
            sample reproducible. When None, the module-level ``random``
            generator is used, so an earlier ``random.seed(...)`` still applies.

    Returns:
        pd.DataFrame: A DataFrame containing the sampled data, one row per
        sampled JSON record.

    Raises:
        ValueError: If ``sample_size`` is negative.
        json.JSONDecodeError: If a sampled line is not valid JSON.
    """
    if sample_size < 0:
        raise ValueError("sample_size must be non-negative")

    # Fall back to the module-level generator so global seeding keeps working.
    rng = random if seed is None else random.Random(seed)

    reservoir = []
    seen = 0  # number of non-blank lines read so far
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            seen += 1
            if len(reservoir) < sample_size:
                reservoir.append(line)
            else:
                # Keep the new line with probability sample_size / seen by
                # overwriting a uniformly chosen slot.
                slot = rng.randrange(seen)
                if slot < sample_size:
                    reservoir[slot] = line

    # Only the sampled lines are parsed, not the whole file.
    sampled_data = [json.loads(line) for line in reservoir]
    return pd.DataFrame(sampled_data)

# Example Usage
# file_path = "path/to/large_file.ndjson"
# sampled_df = sample_ndjson(file_path, sample_size=1000)
# print(sampled_df.head())

## Comments dataset

### Analyze the Comments dataset

#### Sample & explore the dataset

In [5]:
# Seed the module-level RNG before sampling so the sample, and every
# percentage reported from it below, is the same after Restart Kernel -> Run All.
# The value 42 is arbitrary; any fixed integer works.
random.seed(42)
df_comments_sample = sample_ndjson(comments_data_path, sample_size=10000)

In [6]:
df_comments_sample.head()

Unnamed: 0,archived,author,author_flair_css_class,author_flair_template_id,author_flair_text,body,can_gild,can_mod_post,controversiality,created_utc,...,steward_reports,retrieved_utc,_meta,rte_mode,author_cakeday,media_metadata,body_html,editable,body_sha1,nest_level
0,False,Broheimanous,,,,To Boldly Go Where No Van Has Gone Before\n\nM...,True,False,0,1523401592.0,...,,,,,,,,,,
1,False,rosemonkey12,,,,Your right! I didn’t mean offense I’m sorry I ...,True,False,0,1689553580.0,...,,,,,,,,,,
2,False,[deleted],,,,[deleted],True,,0,1655331354.0,...,,,,,,,,,,
3,,Napalm32,,,,True Greta is doing what nobody else was able ...,True,False,0,1570843944.0,...,[],,,,,,,,,
4,False,4shadowedbm,,,,Breathing smoke all summer (used to be just a ...,False,False,0,1696053240.0,...,,,,,,,,,,


In [7]:
df_comments_sample.columns

Index(['archived', 'author', 'author_flair_css_class',
       'author_flair_template_id', 'author_flair_text', 'body', 'can_gild',
       'can_mod_post', 'controversiality', 'created_utc', 'distinguished',
       'edited', 'gilded', 'id', 'is_submitter', 'link_id', 'no_follow',
       'parent_id', 'permalink', 'removal_reason', 'retrieved_on', 'score',
       'score_hidden', 'send_replies', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_type', 'all_awardings', 'approved_at_utc', 'approved_by',
       'associated_award', 'author_flair_background_color',
       'author_flair_richtext', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_is_blocked', 'author_patreon_flair',
       'author_premium', 'awarders', 'banned_at_utc', 'banned_by', 'collapsed',
       'collapsed_because_crowd_control', 'collapsed_reason',
       'collapsed_reason_code', 'comment_type', 'created', 'downs', 'gildings',
       'likes', 'locked', 'mod_note', 'mod_reason_by', '

#### Missing values
There are many missing values in this dataset. Given the huge number of fields, we can try constructing `missing_value_df` using the same technique as in Project 4 to omit some columns that consist mostly of `NaN`.

In [8]:
# Share of missing values per column, as a percentage of the sampled rows.
# The index is reset so rows are numbered in the same order as the columns
# of df_comments_sample.
missing_value_df = pd.DataFrame(
    {
        'column_name': df_comments_sample.columns,
        'percent_missing': 100 * df_comments_sample.isna().sum() / len(df_comments_sample),
    }
).reset_index(drop=True)
missing_value_df

Unnamed: 0,column_name,percent_missing
0,archived,26.81
1,author,0.00
2,author_flair_css_class,99.36
3,author_flair_template_id,99.35
4,author_flair_text,97.85
...,...,...
77,media_metadata,99.95
78,body_html,99.90
79,editable,99.99
80,body_sha1,99.97


In [9]:
# Cut-off (in percent): columns missing in more than this share of rows are
# treated as too sparse.
threshold = 75

# Sparse columns, most-missing first.
over_threshold_missing = (
    missing_value_df
    .query('percent_missing > @threshold')
    .sort_values(by='percent_missing', ascending=False)
)
display(over_threshold_missing)
print(f'Number of columns with more than {threshold}% missing values:', over_threshold_missing.shape[0])

Unnamed: 0,column_name,percent_missing
19,removal_reason,100.0
47,comment_type,100.0
31,associated_award,100.0
30,approved_by,100.0
29,approved_at_utc,100.0
51,likes,100.0
44,collapsed_because_crowd_control,100.0
42,banned_by,100.0
41,banned_at_utc,100.0
63,top_awarded_type,100.0


Number of columns with more than 75% missing values: 33


In [10]:
# Only include columns that have at most threshold% missing values.
# Columns are picked by name, not by position: this cell reassigns
# df_comments_sample, so positional indices taken from missing_value_df would
# point at the wrong columns (or out of range) if the cell were run a second
# time. Selecting by name makes a re-run return the same frame.
df_comments_sample = df_comments_sample.loc[
    :,
    missing_value_df.loc[missing_value_df['percent_missing'] <= threshold, 'column_name'].tolist(),
]
df_comments_sample

Unnamed: 0,archived,author,body,can_gild,can_mod_post,controversiality,created_utc,edited,gilded,id,...,replies,report_reasons,saved,subreddit_name_prefixed,total_awards_received,treatment_tags,updated_on,ups,user_reports,author_created_utc
0,False,Broheimanous,To Boldly Go Where No Van Has Gone Before\n\nM...,True,False,0,1523401592,False,0,dx5kzta,...,,,,,,,,,,
1,False,rosemonkey12,Your right! I didn’t mean offense I’m sorry I ...,True,False,0,1689553580.0,False,0,js9cfho,...,,[],False,r/ArtificialInteligence,0.0,[],1.689554e+09,1.0,[],
2,False,[deleted],[deleted],True,,0,1655331354,False,0,icie5kz,...,,,,r/climatechange,0.0,[],,,,
3,,Napalm32,True Greta is doing what nobody else was able ...,True,False,0,1570843944,False,0,f3dq9vl,...,,,,r/climatechange,0.0,,,,,1.415130e+09
4,False,4shadowedbm,Breathing smoke all summer (used to be just a ...,False,False,0,1696053240.0,False,0,k2tunn7,...,,[],False,r/climatechange,0.0,[],1.696053e+09,1.0,[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,ReadersAreRedditors,The GPT can actively make third party calls to...,False,False,0,1700510419.0,False,0,ka24us0,...,,[],False,r/ChatGPTCoding,0.0,[],1.700510e+09,1.0,[],
9996,False,jonmon6691,[This will always be how I remember 2020](http...,True,,0,1662391709,False,0,in700fm,...,,,,r/ImaginaryTechnology,0.0,[],,,,1.282982e+09
9997,False,EllisWyatt1,this is stupid.,True,,0,1661106173,False,0,il808a1,...,,,,r/fintech,0.0,[],,,,1.569246e+09
9998,False,jjaym2,Saw ads for Chat IQ a we app that's supposed t...,True,False,0,1683013979,False,0,jijde40,...,,[],False,r/ChatGPTCoding,0.0,[],1.691639e+09,1.0,[],


In [11]:
df_comments_sample.columns

Index(['archived', 'author', 'body', 'can_gild', 'can_mod_post',
       'controversiality', 'created_utc', 'edited', 'gilded', 'id',
       'is_submitter', 'link_id', 'no_follow', 'parent_id', 'permalink',
       'retrieved_on', 'score', 'score_hidden', 'send_replies', 'stickied',
       'subreddit', 'subreddit_id', 'subreddit_type', 'all_awardings',
       'author_flair_richtext', 'author_flair_type', 'author_fullname',
       'author_is_blocked', 'author_patreon_flair', 'author_premium',
       'awarders', 'collapsed', 'created', 'downs', 'gildings', 'locked',
       'mod_reports', 'name', 'num_reports', 'replies', 'report_reasons',
       'saved', 'subreddit_name_prefixed', 'total_awards_received',
       'treatment_tags', 'updated_on', 'ups', 'user_reports',
       'author_created_utc'],
      dtype='object')

#### Fields Select
From the remaining columns, tentatively choose the most relevant columns (18 for now) to save to the database. We can add or remove columns down the line if necessary.

In [12]:
# Columns tentatively kept for the database. The order of this list is the
# column order of the `comments` frame that is built from it.
relevent_comment_columns = [
    'author', 'author_created_utc',
    'body', 'created_utc', 'edited',
    'id', 'locked', 'parent_id', 'permalink',
    'retrieved_on', 'score',
    'subreddit', 'subreddit_id', 'subreddit_name_prefixed', 'subreddit_type',
    'archived', 'downs', 'ups',
]

print('Number of relevant columns:', len(relevent_comment_columns))

Number of relevant columns: 18


In [13]:
missing_value_df[missing_value_df['column_name'].isin(relevent_comment_columns)]

Unnamed: 0,column_name,percent_missing
0,archived,26.81
1,author,0.0
5,body,0.0
9,created_utc,0.0
11,edited,0.0
13,id,0.0
17,parent_id,0.0
18,permalink,19.98
20,retrieved_on,6.37
21,score,0.0


#### Construct new dataframe keeping only relevant columns

In [14]:
# Narrow the sample down to the hand-picked columns, in the order listed.
comments = df_comments_sample.loc[:, relevent_comment_columns]
comments

Unnamed: 0,author,author_created_utc,body,created_utc,edited,id,locked,parent_id,permalink,retrieved_on,score,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,archived,downs,ups
0,Broheimanous,,To Boldly Go Where No Van Has Gone Before\n\nM...,1523401592,False,dx5kzta,,t3_8b6kp3,/r/ImaginaryTechnology/comments/8b6kp3/to_bold...,1.526187e+09,5,ImaginaryTechnology,t5_2tf7t,,public,False,,
1,rosemonkey12,,Your right! I didn’t mean offense I’m sorry I ...,1689553580.0,False,js9cfho,False,t1_js95xao,/r/ArtificialInteligence/comments/151cau6/un_w...,1.689554e+09,1,ArtificialInteligence,t5_3crzr,r/ArtificialInteligence,public,False,0.0,1.0
2,[deleted],,[deleted],1655331354,False,icie5kz,False,t1_icidz8e,/r/climatechange/comments/vd1711/im_currently_...,1.656904e+09,2,climatechange,t5_2rawx,r/climatechange,public,False,,
3,Napalm32,1.415130e+09,True Greta is doing what nobody else was able ...,1570843944,False,f3dq9vl,False,t1_f3dno66,/r/climatechange/comments/dglp03/is_climate_ch...,1.578846e+09,1,climatechange,t5_2rawx,r/climatechange,public,,,
4,4shadowedbm,,Breathing smoke all summer (used to be just a ...,1696053240.0,False,k2tunn7,False,t3_16vqhl0,/r/climatechange/comments/16vqhl0/when_will_be...,1.696053e+09,1,climatechange,t5_2rawx,r/climatechange,public,False,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ReadersAreRedditors,,The GPT can actively make third party calls to...,1700510419.0,False,ka24us0,False,t1_ka23prh,/r/ChatGPTCoding/comments/17zqtfu/ive_built_a_...,1.700510e+09,1,ChatGPTCoding,t5_7ipnaj,r/ChatGPTCoding,public,False,0.0,1.0
9996,jonmon6691,1.282982e+09,[This will always be how I remember 2020](http...,1662391709,False,in700fm,False,t1_in6h9td,/r/ImaginaryTechnology/comments/x6cvnm/we_are_...,1.665105e+09,30,ImaginaryTechnology,t5_2tf7t,r/ImaginaryTechnology,public,False,,
9997,EllisWyatt1,1.569246e+09,this is stupid.,1661106173,False,il808a1,False,t1_il7y1ju,/r/fintech/comments/wu2imv/what_min_monthly_re...,1.662636e+09,2,fintech,t5_2u7f1,r/fintech,public,False,,
9998,jjaym2,,Saw ads for Chat IQ a we app that's supposed t...,1683013979,False,jijde40,False,t3_134yuzu,/r/ChatGPTCoding/comments/134yuzu/can_i_just_p...,1.687745e+09,1,ChatGPTCoding,t5_7ipnaj,r/ChatGPTCoding,public,False,0.0,1.0


#### Type exploration

In [15]:
comments.dtypes

author                      object
author_created_utc         float64
body                        object
created_utc                 object
edited                      object
id                          object
locked                      object
parent_id                   object
permalink                   object
retrieved_on               float64
score                        int64
subreddit                   object
subreddit_id                object
subreddit_name_prefixed     object
subreddit_type              object
archived                    object
downs                      float64
ups                        float64
dtype: object

For most columns, the column type can be inferred from the column name and values. However, for the `edited` column this is a little more tricky.

In [16]:
comments.groupby('edited').size()

edited
False           9519
True              85
1339298667         1
1341945690         1
1357881158         1
                ... 
1703006535.0       1
1703088347.0       1
1703140876.0       1
1703300251.0       1
1703543515.0       1
Length: 398, dtype: int64

Most of the values in the `edited` column are boolean. However, there are also a handful that are not (they look like Unix timestamps). Without the metadata of the dataset, interpreting this may be difficult. Therefore, we will also omit this column for now.

In [17]:
# Drop the ambiguous `edited` column. errors='ignore' makes the cell safe to
# re-run: once `edited` is gone, a second run is a no-op instead of a KeyError.
comments = comments.drop(columns='edited', errors='ignore')
comments

Unnamed: 0,author,author_created_utc,body,created_utc,id,locked,parent_id,permalink,retrieved_on,score,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,archived,downs,ups
0,Broheimanous,,To Boldly Go Where No Van Has Gone Before\n\nM...,1523401592,dx5kzta,,t3_8b6kp3,/r/ImaginaryTechnology/comments/8b6kp3/to_bold...,1.526187e+09,5,ImaginaryTechnology,t5_2tf7t,,public,False,,
1,rosemonkey12,,Your right! I didn’t mean offense I’m sorry I ...,1689553580.0,js9cfho,False,t1_js95xao,/r/ArtificialInteligence/comments/151cau6/un_w...,1.689554e+09,1,ArtificialInteligence,t5_3crzr,r/ArtificialInteligence,public,False,0.0,1.0
2,[deleted],,[deleted],1655331354,icie5kz,False,t1_icidz8e,/r/climatechange/comments/vd1711/im_currently_...,1.656904e+09,2,climatechange,t5_2rawx,r/climatechange,public,False,,
3,Napalm32,1.415130e+09,True Greta is doing what nobody else was able ...,1570843944,f3dq9vl,False,t1_f3dno66,/r/climatechange/comments/dglp03/is_climate_ch...,1.578846e+09,1,climatechange,t5_2rawx,r/climatechange,public,,,
4,4shadowedbm,,Breathing smoke all summer (used to be just a ...,1696053240.0,k2tunn7,False,t3_16vqhl0,/r/climatechange/comments/16vqhl0/when_will_be...,1.696053e+09,1,climatechange,t5_2rawx,r/climatechange,public,False,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ReadersAreRedditors,,The GPT can actively make third party calls to...,1700510419.0,ka24us0,False,t1_ka23prh,/r/ChatGPTCoding/comments/17zqtfu/ive_built_a_...,1.700510e+09,1,ChatGPTCoding,t5_7ipnaj,r/ChatGPTCoding,public,False,0.0,1.0
9996,jonmon6691,1.282982e+09,[This will always be how I remember 2020](http...,1662391709,in700fm,False,t1_in6h9td,/r/ImaginaryTechnology/comments/x6cvnm/we_are_...,1.665105e+09,30,ImaginaryTechnology,t5_2tf7t,r/ImaginaryTechnology,public,False,,
9997,EllisWyatt1,1.569246e+09,this is stupid.,1661106173,il808a1,False,t1_il7y1ju,/r/fintech/comments/wu2imv/what_min_monthly_re...,1.662636e+09,2,fintech,t5_2u7f1,r/fintech,public,False,,
9998,jjaym2,,Saw ads for Chat IQ a we app that's supposed t...,1683013979,jijde40,False,t3_134yuzu,/r/ChatGPTCoding/comments/134yuzu/can_i_just_p...,1.687745e+09,1,ChatGPTCoding,t5_7ipnaj,r/ChatGPTCoding,public,False,0.0,1.0


Now that we understand the `comments` dataset better, we can start building the `comments` table with the selected columns as attributes. Note that the schema below also lists `updated_on`, which is not among the columns selected above.

#### Schema of the `comments` table

```sql
-- Table for the Reddit comment fields selected during the exploration above.
-- NOTE(review): no PRIMARY KEY or UNIQUE constraint is declared on `id` -- confirm whether that is intended.
-- NOTE(review): the *_utc / *_on values in the sample look like Unix epoch seconds -- TODO confirm.
CREATE TABLE IF NOT EXISTS comments
(
    id                          VARCHAR(255)     NOT NULL,
    archived                    BOOLEAN,
    author                      VARCHAR(255),
    author_created_utc          BIGINT,
    body                        TEXT,
    created_utc                 BIGINT,
    downs                       INT,
    locked                      BOOLEAN,
    parent_id                   VARCHAR(255),
    permalink                   VARCHAR(255),
    retrieved_on                BIGINT,
    score                       INT,
    subreddit                   VARCHAR(255),
    subreddit_id                VARCHAR(255),
    subreddit_name_prefixed     VARCHAR(255),
    subreddit_type              VARCHAR(255),
    -- NOTE(review): `updated_on` is not in the notebook's selected column list -- reconcile.
    updated_on                  BIGINT,
    ups                         INT
);
```

## Sample the submissions dataset

In [18]:
# Draw a 1,000-row random sample from the submissions file and preview it.
df_submissions_sample = sample_ndjson(file_path=submissions_data_path, sample_size=1000)
df_submissions_sample.head(5)

Unnamed: 0,all_awardings,allow_live_comments,archived,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,...,is_gallery,media_metadata,poll_data,crosspost_parent,crosspost_parent_list,post_categories,_meta,previous_selftext,previous_visits,author_cakeday
0,[],False,False,scatterbraimedddd,1606842000.0,,,[],,,...,,,,,,,,,,
1,,,False,TheGuru12,,,,,,,...,,,,,,,,,,
2,[],False,False,fintechinshorts,1546230000.0,,,[],,,...,,,,,,,,,,
3,,,False,TheGuru12,,,,,,,...,,,,,,,,,,
4,,,False,TheGuru12,,,,,,,...,,,,,,,,,,


In [19]:
df_submissions_sample.columns

Index(['all_awardings', 'allow_live_comments', 'archived', 'author',
       'author_created_utc', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext',
       'author_flair_template_id', 'author_flair_text',
       ...
       'is_gallery', 'media_metadata', 'poll_data', 'crosspost_parent',
       'crosspost_parent_list', 'post_categories', '_meta',
       'previous_selftext', 'previous_visits', 'author_cakeday'],
      dtype='object', length=130)

In [20]:
# `archived` values that are anything other than False (i.e. True or missing).
filtered_archived = df_submissions_sample.loc[df_submissions_sample['archived'] != False, 'archived']
# `author_is_blocked` values that are True.
filtered_author_is_blocked = df_submissions_sample.loc[df_submissions_sample['author_is_blocked'] == True, 'author_is_blocked']
# Only the last expression of a cell is rendered, so the two Series above are
# kept for ad-hoc inspection but are not displayed here.

# Percentage of missing values per column, computed in one vectorized pass
# (same technique as for the comments sample). NOTE: this reuses the name
# `missing_value_df`, replacing the comments version built earlier.
missing_value_df = pd.DataFrame({
    'column_name': df_submissions_sample.columns,
    'percent_missing': (df_submissions_sample.isnull().sum() / len(df_submissions_sample) * 100).to_numpy(),
})
missing_value_df.sort_values(by='percent_missing', ascending=True)

Unnamed: 0,column_name,percent_missing
3,author,0.0
28,id,0.0
23,edited,0.0
19,created_utc,0.0
60,score,0.0
...,...,...
100,view_count,100.0
16,category,100.0
20,discussion_type,100.0
125,post_categories,100.0


In [21]:
# List every submissions column whose name begins with "subreddit".
for col in df_submissions_sample.columns:
    if not col.startswith('subreddit'):
        continue
    print(col)

subreddit
subreddit_id
subreddit_subscribers
subreddit_type
subreddit_name_prefixed


In [22]:
# Number of sampled submissions per subreddit.
grouped = (
    df_submissions_sample
    .loc[:, ['subreddit', 'subreddit_id', 'subreddit_subscribers']]
    .groupby(by='subreddit')
)
grouped.size()

subreddit
AcademicPsychology        21
ArtificialInteligence     91
ChatGPTCoding              8
ImaginaryTechnology       28
NLP                        3
StocksAndTrading          24
StocksInFocus            321
climatechange             45
cogsci                    19
edtech                    10
fintech                   26
stocks                   388
stockstobuytoday          16
dtype: int64

In [23]:
df_submissions_sample[['subreddit_name_prefixed']].groupby('subreddit_name_prefixed').size()

subreddit_name_prefixed
r/AcademicPsychology         8
r/ArtificialInteligence     70
r/ChatGPTCoding              8
r/ImaginaryTechnology       12
r/NLP                        1
r/StocksAndTrading          13
r/climatechange             31
r/cogsci                    13
r/edtech                     3
r/fintech                    7
r/stocks                   179
r/stockstobuytoday          16
dtype: int64

In [28]:
df_submissions_sample['author_fullname']

0       t2_94dr1pt3
1               NaN
2       t2_2w5r7ai4
3               NaN
4               NaN
           ...     
995     t2_1aswnkzc
996     t2_6prdem2z
997     t2_vhxeaazt
998    t2_eulrfgjz4
999     t2_6la7kogw
Name: author_fullname, Length: 1000, dtype: object

In [49]:
# Count comments per distinct non-null `replies` value.
# NOTE(review): the saved output under this cell is labelled `controversiality`,
# not `replies`, so it appears to be stale -- re-run the cell to refresh it.
df_comments_sample.loc[df_comments_sample['replies'].notna()].groupby('replies').size()

controversiality
0    9890
1     110
dtype: int64

In [59]:
df_comments_sample.groupby('edited').size()

edited
False           9519
True              85
1339298667         1
1341945690         1
1357881158         1
                ... 
1703006535.0       1
1703088347.0       1
1703140876.0       1
1703300251.0       1
1703543515.0       1
Length: 398, dtype: int64

In [39]:
df_submissions_sample['selftext']

0      Shopify is an interesting one. It seems like a...
1                                                       
2                                                       
3                                                       
4                                                       
                             ...                        
995    When reviewing sample portfolios, I noticed th...
996                                            [removed]
997     \n\nMoon Equity Holdings Corp. (“MONI” or “th...
998    The stock price for Tesla jumped up close to 2...
999                                                     
Name: selftext, Length: 1000, dtype: object