In [1]:
import zipfile
import os

# Both NDJSON dumps sit side by side in the extraction output directory.
_extracted_dir = os.path.join("..", "data", "extracted_data")
comments_data_path = os.path.join(_extracted_dir, "comments_data.ndjson")
submissions_data_path = os.path.join(_extracted_dir, "submissions_data.ndjson")


## Extract the Data

In [1]:
def extract_zip(zip_path, extract_to):
    """
    Unpack every member of a ZIP archive into a target directory.

    The target directory is created first if it does not exist. Failures are
    reported on stdout instead of being raised, so the function always
    returns None.

    Args:
        zip_path (str): Location of the ZIP archive to read.
        extract_to (str): Directory that receives the extracted contents.
    """
    try:
        # Create the destination (and any missing parents) before unpacking
        os.makedirs(extract_to, exist_ok=True)

        with zipfile.ZipFile(zip_path, mode='r') as archive:
            archive.extractall(path=extract_to)
            print(f"Successfully extracted {zip_path} to {extract_to}")
    except Exception as err:
        # Pick the message that matches the failure; anything else is "unexpected"
        if isinstance(err, FileNotFoundError):
            message = f"Error: The file {zip_path} was not found."
        elif isinstance(err, zipfile.BadZipFile):
            message = f"Error: The file {zip_path} is not a valid ZIP archive."
        else:
            message = f"An unexpected error occurred: {err}"
        print(message)

# Unpack the raw Reddit archive into ../data/extracted_data
# (adjust `data_dir` if your archive lives somewhere else)
data_dir = os.path.join("..", "data")
zip_file_path = os.path.join(data_dir, "reddit_data.zip")
output_folder = os.path.join(data_dir, "extracted_data")

extract_zip(zip_file_path, output_folder)

Successfully extracted ..\data\reddit_data.zip to ..\data\extracted_data


## Explore the Data

Since both datasets are extremely large, we will sample a smaller chunk of each to see what the JSON looks like across a few random records.

In [2]:
import json
import pandas as pd
import random

def sample_ndjson(file_path, sample_size=1000, seed=None):
    """
    Randomly sample records from an NDJSON file in a single streaming pass.

    The file is read line by line and at most `sample_size` raw lines are held
    in memory at any time (reservoir sampling), so this also works on files
    that are too large to load in full. Blank lines are skipped before
    sampling, so the result holds exactly `min(sample_size, number of
    non-blank lines)` records.

    Args:
        file_path (str): Path to the NDJSON file (decoded as UTF-8).
        sample_size (int): Maximum number of records to sample.
        seed (int, optional): If given, a private random generator seeded with
            this value is used, making the sample reproducible. If None, the
            global `random` module state is used.

    Returns:
        pd.DataFrame: A DataFrame containing the sampled records, in random
        order. Empty if the file has no records or `sample_size` is 0.

    Raises:
        ValueError: If `sample_size` is negative.
        json.JSONDecodeError: If a sampled line is not valid JSON.
    """
    if sample_size < 0:
        raise ValueError("sample_size must be non-negative")

    # A private generator keeps a seeded call from disturbing global random state.
    rng = random.Random(seed) if seed is not None else random

    reservoir = []
    seen = 0
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            seen += 1
            if len(reservoir) < sample_size:
                reservoir.append(line)
            else:
                # Keep the new line with probability sample_size / seen,
                # evicting a uniformly chosen earlier pick.
                slot = rng.randrange(seen)
                if slot < sample_size:
                    reservoir[slot] = line

    # The reservoir is biased towards file order; shuffle so row order is random too.
    rng.shuffle(reservoir)

    # Parse only the lines that made it into the sample.
    return pd.DataFrame([json.loads(line) for line in reservoir])

# Example Usage
# file_path = "path/to/large_file.ndjson"
# sampled_df = sample_ndjson(file_path, sample_size=1000)
# print(sampled_df.head())

## Comments dataset

### Analyze the Comments dataset

#### Sample & explore the dataset

In [3]:
# Seed the global RNG so the sample, and every output derived from it, is reproducible
# under Restart & Run All.
random.seed(42)
df_comments_sample = sample_ndjson(comments_data_path, sample_size=10000)

In [4]:
df_comments_sample.head()

Unnamed: 0,all_awardings,archived,associated_award,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,...,quarantined,rte_mode,steward_reports,retrieved_utc,editable,media_metadata,author_cakeday,body_sha1,nest_level,body_html
0,[],False,,Lugubrious_Lothario,1637543000.0,,,[],,,...,,,,,,,,,,
1,[],False,,AutoModerator,,,,[],,,...,,,,,,,,,,
2,,True,,[deleted],,,,,,,...,,,,,,,,,,
3,,True,,WiretapStudios,,,,,,,...,,,,,,,,,,
4,[],False,,[deleted],,,,,,,...,,,,,,,,,,


In [19]:
df_comments_sample.columns

Index(['all_awardings', 'approved_at_utc', 'approved_by', 'archived',
       'associated_award', 'author', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext',
       'author_flair_template_id', 'author_flair_text',
       'author_flair_text_color', 'author_flair_type', 'author_fullname',
       'author_is_blocked', 'author_patreon_flair', 'author_premium',
       'awarders', 'banned_at_utc', 'banned_by', 'body', 'can_gild',
       'can_mod_post', 'collapsed', 'collapsed_because_crowd_control',
       'collapsed_reason', 'collapsed_reason_code', 'comment_type',
       'controversiality', 'created', 'created_utc', 'distinguished', 'downs',
       'edited', 'gilded', 'gildings', 'id', 'is_submitter', 'likes',
       'link_id', 'locked', 'mod_note', 'mod_reason_by', 'mod_reason_title',
       'mod_reports', 'name', 'no_follow', 'num_reports', 'parent_id',
       'permalink', 'removal_reason', 'replies', 'report_reasons',
       'retrieved_on', 'saved'

#### Missing values
There are many missing values in this dataset. Given the huge number of fields, we can try constructing `missing_value_df` using the same technique as in Project 4 to omit some columns that consist mostly of `NaN`.

In [20]:
# Share of null entries per column, in percent, with one row per column of the sample
null_counts = df_comments_sample.isnull().sum()
missing_value_df = pd.DataFrame({
    'column_name': df_comments_sample.columns,
    'percent_missing': 100 * null_counts / len(df_comments_sample),
}).reset_index(drop=True)  # replace the column-name index with 0..n-1 positions
missing_value_df

Unnamed: 0,column_name,percent_missing
0,all_awardings,26.35
1,approved_at_utc,100.00
2,approved_by,100.00
3,archived,24.53
4,associated_award,100.00
...,...,...
78,media_metadata,99.80
79,body_html,99.88
80,expression_asset_data,99.99
81,body_sha1,99.96


In [23]:
# Cut-off in percent: columns missing more often than this are candidates for removal
threshold = 75

# Rank the offending columns from most to least sparse
is_too_sparse = missing_value_df['percent_missing'].gt(threshold)
over_threshold_missing = (
    missing_value_df.loc[is_too_sparse]
    .sort_values(by='percent_missing', ascending=False)
)
display(over_threshold_missing)
print(f'Number of columns with more than {threshold}% missing values:', len(over_threshold_missing))

Unnamed: 0,column_name,percent_missing
1,approved_at_utc,100.0
2,approved_by,100.0
4,associated_award,100.0
19,banned_by,100.0
18,banned_at_utc,100.0
42,mod_reason_by,100.0
38,likes,100.0
41,mod_note,100.0
27,comment_type,100.0
24,collapsed_because_crowd_control,100.0


Number of columns with more than 75% missing values: 34


In [24]:
# Only keep columns whose share of missing values is at or below the threshold.
# Columns are selected by *name* rather than by position: this cell rebinds
# `df_comments_sample`, so positional indices taken from `missing_value_df` would
# point at the wrong columns (or out of bounds) if the cell were executed twice.
kept_columns = missing_value_df.loc[missing_value_df['percent_missing'] <= threshold, 'column_name']
df_comments_sample = df_comments_sample.loc[:, kept_columns.tolist()]
df_comments_sample

Unnamed: 0,all_awardings,archived,author,author_flair_richtext,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,author_premium,awarders,...,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,total_awards_received,treatment_tags,updated_on,ups,user_reports,author_created_utc
0,[],False,El_Kalku,[],text,t2_6gk5ry84,False,False,False,[],...,ImaginaryTechnology,t5_2tf7t,r/ImaginaryTechnology,public,0.0,[],1.693044e+09,1.0,[],
1,[],False,Groundbreaking-Egg13,[],text,t2_5vm4hgbg,,False,False,,...,ArtificialInteligence,t5_3crzr,r/ArtificialInteligence,public,0.0,[],,,,1.596724e+09
2,[],False,helliot98,[],text,t2_hff1s,,False,False,,...,cogsci,t5_2qh0k,r/cogsci,public,0.0,[],,,,1.405513e+09
3,,True,bannana,,,,,,,,...,cogsci,t5_2qh0k,,,,,,13.0,,
4,[],False,schmobin88,[],text,t2_4909hth9,False,False,False,[],...,ArtificialInteligence,t5_3crzr,r/ArtificialInteligence,public,0.0,[],1.690149e+09,2.0,[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,[],,CJP_UX,[],text,t2_16taiq,,False,,[],...,AcademicPsychology,t5_2sluh,r/AcademicPsychology,public,0.0,,,,,1.491404e+09
9996,,False,One_Giant_Nostril,[],text,t2_4adwv,,,,,...,ImaginaryTechnology,t5_2tf7t,r/ImaginaryTechnology,public,,,,,,1.283044e+09
9997,[],False,No-Bridge-7124,[],text,t2_tj6apeoj,False,False,False,[],...,NLP,t5_2qqpg,r/NLP,public,0.0,[],1.691971e+09,1.0,[],
9998,[],,mbmuenster,[],text,t2_111r8rk,,False,False,[],...,NLP,t5_2qqpg,r/NLP,public,0.0,[],,,,


In [25]:
df_comments_sample.columns

Index(['all_awardings', 'archived', 'author', 'author_flair_richtext',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'body',
       'can_gild', 'can_mod_post', 'collapsed', 'controversiality', 'created',
       'created_utc', 'downs', 'edited', 'gilded', 'gildings', 'id',
       'is_submitter', 'link_id', 'locked', 'mod_reports', 'name', 'no_follow',
       'num_reports', 'parent_id', 'permalink', 'replies', 'report_reasons',
       'retrieved_on', 'saved', 'score', 'score_hidden', 'send_replies',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_name_prefixed',
       'subreddit_type', 'total_awards_received', 'treatment_tags',
       'updated_on', 'ups', 'user_reports', 'author_created_utc'],
      dtype='object')

#### Fields Select
From the remaining columns, tentatively choose the 18 most relevant columns to save to the database. We can add or remove columns down the line if necessary.

In [38]:
# Tentative set of comment fields to persist. The identifier's spelling ("relevent")
# is left as-is because later cells reference this exact name.
relevent_comment_columns = [
    'author',
    'author_created_utc',
    'body',
    'created_utc',
    'edited',
    'id',
    'locked',
    'parent_id',
    'permalink',
    'retrieved_on',
    'score',
    'subreddit',
    'subreddit_id',
    'subreddit_name_prefixed',
    'subreddit_type',
    'archived',
    'downs',
    'ups'
]

# Sanity check: how many fields were selected
print('Number of relevant columns:', len(relevent_comment_columns))

Number of relevant columns: 18


In [39]:
missing_value_df[missing_value_df['column_name'].isin(relevent_comment_columns)]

Unnamed: 0,column_name,percent_missing
3,archived,24.53
5,author,0.0
20,body,0.0
30,created_utc,0.0
32,downs,55.72
33,edited,0.0
36,id,0.0
40,locked,26.31
48,parent_id,0.0
49,permalink,19.59


#### Construct new dataframe keeping only relevant columns

In [40]:
# Narrow the sample down to the hand-picked columns only
comments = df_comments_sample.loc[:, relevent_comment_columns]
comments

Unnamed: 0,author,author_created_utc,body,created_utc,edited,id,locked,parent_id,permalink,retrieved_on,score,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,archived,downs,ups
0,El_Kalku,,Thanks!!!,1686664935.0,False,jnztbiy,False,t1_jnzijlk,/r/ImaginaryTechnology/comments/14867ry/ariel_...,1.686665e+09,1,ImaginaryTechnology,t5_2tf7t,r/ImaginaryTechnology,public,False,0.0,1.0
1,Groundbreaking-Egg13,1.596724e+09,Artbreeder? The way you speak makes me think i...,1668517861,False,iwgd97b,False,t1_iwg7cwf,/r/ArtificialInteligence/comments/yvqei5/help/...,1.671124e+09,1,ArtificialInteligence,t5_3crzr,r/ArtificialInteligence,public,False,,
2,helliot98,1.405513e+09,I have wanted to do this program here since i ...,1637163978,False,hkzz6z0,False,t3_qw1zj6,/r/cogsci/comments/qw1zj6/online_cognitive_sci...,1.645993e+09,4,cogsci,t5_2qh0k,r/cogsci,public,False,,
3,bannana,,"Wow two pages of:\n\n'Well, maybe it's bad but...",1234478798,True,c07mvo3,,t3_7wz2l,,1.425923e+09,13,cogsci,t5_2qh0k,,,True,0.0,13.0
4,schmobin88,,I looked at the conversation. I found it inte...,1680584704.0,False,jevtgoi,False,t3_12b3at0,/r/ArtificialInteligence/comments/12b3at0/i_ga...,1.687395e+09,2,ArtificialInteligence,t5_3crzr,r/ArtificialInteligence,public,False,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,CJP_UX,1.491404e+09,No requests for personal diagnosis,1565628037,False,ewow67c,False,t3_cpdnpm,/r/AcademicPsychology/comments/cpdnpm/how_to_g...,1.574023e+09,1,AcademicPsychology,t5_2sluh,r/AcademicPsychology,public,,,
9996,One_Giant_Nostril,1.283044e+09,"Sidebar: **Requests:** For payable requests, t...",1532277909,False,e2u6o8y,,t3_90yviq,/r/ImaginaryTechnology/comments/90yviq/request...,1.536376e+09,2,ImaginaryTechnology,t5_2tf7t,r/ImaginaryTechnology,public,False,,
9997,No-Bridge-7124,,The spinning is a trip. Why does the mind do t...,1683855114.0,False,jjtgikh,False,t1_jhyj8b3,/r/NLP/comments/1304pqe/i_wonder_who_knows_rhe...,1.687798e+09,1,NLP,t5_2qqpg,r/NLP,public,False,0.0,1.0
9998,mbmuenster,,of course ;-),1599235971,False,g40331g,False,t1_g4026ak,/r/NLP/comments/imigzf/looking_for_a_good_prim...,1.611689e+09,1,NLP,t5_2qqpg,r/NLP,public,,,


#### Type exploration

In [42]:
comments.dtypes

author                      object
author_created_utc         float64
body                        object
created_utc                 object
edited                      object
id                          object
locked                      object
parent_id                   object
permalink                   object
retrieved_on               float64
score                        int64
subreddit                   object
subreddit_id                object
subreddit_name_prefixed     object
subreddit_type              object
archived                    object
downs                      float64
ups                        float64
dtype: object

For most columns, the column type can be inferred from the column name and values. However, for the `edited` column this is a little trickier.

In [44]:
comments.groupby('edited').size()

edited
False           9521
True              84
1345273484         1
1347467183         1
1356071602         1
                ... 
1703199437.0       1
1703365142.0       1
1703539546.0       1
1703804523.0       1
1703807356.0       1
Length: 397, dtype: int64

Most values in the `edited` column are boolean. However, some are not: they are large numbers that look like Unix timestamps. Without the dataset's metadata, interpreting them reliably is difficult, so we will omit this column for now.

In [46]:
# `errors='ignore'` keeps this cell safe to re-run: `comments` is rebound here, so a
# second execution would otherwise raise KeyError because 'edited' is already gone.
comments = comments.drop(columns='edited', errors='ignore')
comments

Unnamed: 0,author,author_created_utc,body,created_utc,id,locked,parent_id,permalink,retrieved_on,score,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,archived,downs,ups
0,El_Kalku,,Thanks!!!,1686664935.0,jnztbiy,False,t1_jnzijlk,/r/ImaginaryTechnology/comments/14867ry/ariel_...,1.686665e+09,1,ImaginaryTechnology,t5_2tf7t,r/ImaginaryTechnology,public,False,0.0,1.0
1,Groundbreaking-Egg13,1.596724e+09,Artbreeder? The way you speak makes me think i...,1668517861,iwgd97b,False,t1_iwg7cwf,/r/ArtificialInteligence/comments/yvqei5/help/...,1.671124e+09,1,ArtificialInteligence,t5_3crzr,r/ArtificialInteligence,public,False,,
2,helliot98,1.405513e+09,I have wanted to do this program here since i ...,1637163978,hkzz6z0,False,t3_qw1zj6,/r/cogsci/comments/qw1zj6/online_cognitive_sci...,1.645993e+09,4,cogsci,t5_2qh0k,r/cogsci,public,False,,
3,bannana,,"Wow two pages of:\n\n'Well, maybe it's bad but...",1234478798,c07mvo3,,t3_7wz2l,,1.425923e+09,13,cogsci,t5_2qh0k,,,True,0.0,13.0
4,schmobin88,,I looked at the conversation. I found it inte...,1680584704.0,jevtgoi,False,t3_12b3at0,/r/ArtificialInteligence/comments/12b3at0/i_ga...,1.687395e+09,2,ArtificialInteligence,t5_3crzr,r/ArtificialInteligence,public,False,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,CJP_UX,1.491404e+09,No requests for personal diagnosis,1565628037,ewow67c,False,t3_cpdnpm,/r/AcademicPsychology/comments/cpdnpm/how_to_g...,1.574023e+09,1,AcademicPsychology,t5_2sluh,r/AcademicPsychology,public,,,
9996,One_Giant_Nostril,1.283044e+09,"Sidebar: **Requests:** For payable requests, t...",1532277909,e2u6o8y,,t3_90yviq,/r/ImaginaryTechnology/comments/90yviq/request...,1.536376e+09,2,ImaginaryTechnology,t5_2tf7t,r/ImaginaryTechnology,public,False,,
9997,No-Bridge-7124,,The spinning is a trip. Why does the mind do t...,1683855114.0,jjtgikh,False,t1_jhyj8b3,/r/NLP/comments/1304pqe/i_wonder_who_knows_rhe...,1.687798e+09,1,NLP,t5_2qqpg,r/NLP,public,False,0.0,1.0
9998,mbmuenster,,of course ;-),1599235971,g40331g,False,t1_g4026ak,/r/NLP/comments/imigzf/looking_for_a_good_prim...,1.611689e+09,1,NLP,t5_2qqpg,r/NLP,public,,,


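Now that we understand the `comments` dataset better, we can start building the `comments` table with the selected columns as attributes. Note that the schema below also lists `updated_on`, which is not among the columns selected above; either add it to the selection or drop it from the schema so the two stay in sync.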

#### Schema of `Comments` database

```sql
CREATE TABLE IF NOT EXISTS comments
(
    id                          VARCHAR(255)     NOT NULL,
    archived                    BOOLEAN,
    author                      VARCHAR(255),
    author_created_utc          BIGINT,
    body                        TEXT,
    created_utc                 BIGINT,
    downs                       INT,
    locked                      BOOLEAN,
    parent_id                   VARCHAR(255),
    permalink                   VARCHAR(255),
    retrieved_on                BIGINT,
    score                       INT,
    subreddit                   VARCHAR(255),
    subreddit_id                VARCHAR(255),
    subreddit_name_prefixed     VARCHAR(255),
    subreddit_type              VARCHAR(255),
    updated_on                  BIGINT,
    ups                         INT
);
```

## Sample the submissions dataset

In [5]:
# Seed the global RNG so the submissions sample is reproducible under Restart & Run All.
random.seed(42)
df_submissions_sample = sample_ndjson(submissions_data_path, sample_size=1000)
df_submissions_sample.head()

Unnamed: 0,all_awardings,allow_live_comments,archived,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,...,rte_mode,post_categories,selftext_html,media_metadata,crosspost_parent,crosspost_parent_list,gallery_data,is_gallery,poll_data,author_cakeday
0,[],False,True,RepresentativePriz,1580095000.0,,,[],,,...,,,,,,,,,,
1,[],True,False,c-opacetic,1607296000.0,,,[],,,...,,,,,,,,,,
2,[],False,False,IkoraReyddit,1538521000.0,,,[],,,...,,,,,,,,,,
3,,,False,kaezermusik,,,,,,,...,,,,,,,,,,
4,[],False,False,h3xadecimal2,1505675000.0,,,[],,,...,,,,,,,,,,


In [6]:
df_submissions_sample.columns

Index(['all_awardings', 'allow_live_comments', 'archived', 'author',
       'author_created_utc', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext',
       'author_flair_template_id', 'author_flair_text',
       ...
       'rte_mode', 'post_categories', 'selftext_html', 'media_metadata',
       'crosspost_parent', 'crosspost_parent_list', 'gallery_data',
       'is_gallery', 'poll_data', 'author_cakeday'],
      dtype='object', length=128)

In [7]:
# Rows whose `archived` flag is anything other than False (i.e. True or missing).
filtered_archived = df_submissions_sample.loc[df_submissions_sample['archived'].ne(False), 'archived']
# Rows where the author is flagged as blocked.
filtered_author_is_blocked = df_submissions_sample.loc[df_submissions_sample['author_is_blocked'].eq(True), 'author_is_blocked']
# Only a cell's last expression is rendered automatically, so show these explicitly.
display(filtered_archived)
display(filtered_author_is_blocked)

# Share of null entries per column, in percent, computed in one vectorised pass.
# NOTE: this rebinds `missing_value_df`; from here on it describes the submissions
# sample, not the comments sample.
missing_value_df = (
    df_submissions_sample.isnull().mean().mul(100)
    .rename_axis('column_name')
    .reset_index(name='percent_missing')
)
missing_value_df.sort_values(by='percent_missing', ascending=True)

Unnamed: 0,column_name,percent_missing
3,author,0.0
28,id,0.0
23,edited,0.0
19,created_utc,0.0
60,score,0.0
...,...,...
96,banned_at_utc,100.0
119,post_categories,100.0
117,from,100.0
116,from_kind,100.0


In [8]:
# List every submission column whose name starts with "subreddit"
subreddit_columns = [name for name in df_submissions_sample.columns if name.startswith('subreddit')]
for name in subreddit_columns:
    print(name)

subreddit
subreddit_id
subreddit_subscribers
subreddit_type
subreddit_name_prefixed


In [12]:
# Number of sampled submissions per subreddit
subreddit_fields = ['subreddit', 'subreddit_id', 'subreddit_subscribers']
grouped = df_submissions_sample.loc[:, subreddit_fields].groupby(by='subreddit')
grouped.size()

subreddit
AcademicPsychology        24
ArtificialInteligence     80
ChatGPTCoding              2
ImaginaryTechnology       22
NLP                        1
StocksAndTrading          38
StocksInFocus            320
climatechange             31
cogsci                    20
edtech                    12
fintech                   23
stocks                   411
stockstobuytoday          16
dtype: int64

In [16]:
# Same count keyed on the prefixed name, for comparison with the `subreddit` counts
prefixed_name_frame = df_submissions_sample.loc[:, ['subreddit_name_prefixed']]
prefixed_name_frame.groupby(by='subreddit_name_prefixed').size()

subreddit_name_prefixed
r/AcademicPsychology        13
r/ArtificialInteligence     60
r/ChatGPTCoding              2
r/ImaginaryTechnology       10
r/StocksAndTrading          23
r/climatechange             15
r/cogsci                     9
r/edtech                     4
r/fintech                   12
r/stocks                   190
r/stockstobuytoday          16
dtype: int64