In [2]:
import zipfile
import os

# Both NDJSON dumps live in the same extracted-data directory, addressed
# relative to the notebook's working directory.
_extracted_dir = os.path.join("..", "data", "extracted_data")
comments_data_path = os.path.join(_extracted_dir, "comments_data.ndjson")
submissions_data_path = os.path.join(_extracted_dir, "submissions_data.ndjson")


## Extract the Data

In [1]:


def extract_zip(zip_path, extract_to):
    """
    Unpack every member of a ZIP archive into a target directory.

    The target directory is created first if it does not already exist.
    Failures are reported on stdout instead of being raised, so the
    function returns ``None`` whether or not the extraction succeeded.

    Args:
        zip_path (str): Location of the ZIP archive to read.
        extract_to (str): Directory that receives the extracted contents.
    """
    try:
        # Create the destination up front; an existing directory is fine.
        os.makedirs(extract_to, exist_ok=True)

        with zipfile.ZipFile(zip_path, 'r') as archive:
            archive.extractall(extract_to)
            print(f"Successfully extracted {zip_path} to {extract_to}")
    except Exception as exc:
        # Map the failure onto the matching user-facing message; the most
        # specific cases are checked first, anything else is reported as-is.
        if isinstance(exc, FileNotFoundError):
            print(f"Error: The file {zip_path} was not found.")
        elif isinstance(exc, zipfile.BadZipFile):
            print(f"Error: The file {zip_path} is not a valid ZIP archive.")
        else:
            print(f"An unexpected error occurred: {exc}")

# Unpack the raw Reddit archive; adjust these paths to match your own layout.
data_dir = os.path.join("..", "data")
zip_file_path = os.path.join(data_dir, "reddit_data.zip")  # archive to read
output_folder = os.path.join(data_dir, "extracted_data")  # destination directory

extract_zip(zip_file_path, output_folder)

Successfully extracted ..\data\reddit_data.zip to ..\data\extracted_data


## Explore the Data

Since both datasets are extremely large, we will sample a smaller chunk to see what the JSON looks like across a few random samples.

In [4]:
import json
import pandas as pd
import random

def sample_ndjson(file_path, sample_size=1000, seed=None):
    """
    Randomly sample records from an NDJSON file without loading it all into memory.

    The file is streamed once and a uniform random sample of its non-blank
    lines is kept with reservoir sampling, so memory use is bounded by
    ``sample_size`` rather than by the size of the file. Only the sampled
    lines are parsed as JSON.

    Args:
        file_path (str): Path to the NDJSON file (decoded as UTF-8).
        sample_size (int): Maximum number of records to sample. If the file
            holds fewer non-blank lines, every record is returned.
        seed (int, optional): Seed for a private random generator, giving a
            reproducible sample. When None, the global ``random`` module
            state is used.

    Returns:
        pd.DataFrame: A DataFrame with one row per sampled record, in random order.

    Raises:
        ValueError: If ``sample_size`` is negative.
        json.JSONDecodeError: If a sampled line is not valid JSON.
    """
    if sample_size < 0:
        raise ValueError("sample_size must be non-negative")

    # A private generator keeps a seeded run from disturbing global state;
    # without a seed, fall back to the module-level generator.
    rng = random if seed is None else random.Random(seed)

    reservoir = []
    seen = 0
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot be relied on to decode JSON text.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines before sampling so they never use up a slot.
            if not line.strip():
                continue
            seen += 1
            if len(reservoir) < sample_size:
                reservoir.append(line)
            else:
                # Reservoir sampling: the newest line replaces a random slot
                # with probability sample_size / seen.
                slot = rng.randrange(seen)
                if slot < sample_size:
                    reservoir[slot] = line

    # Reservoir slots keep file order for early lines; shuffle so the rows
    # come back in random order.
    rng.shuffle(reservoir)
    return pd.DataFrame([json.loads(line) for line in reservoir])

# Example Usage
# file_path = "path/to/large_file.ndjson"
# sampled_df = sample_ndjson(file_path, sample_size=1000)
# print(sampled_df.head())

## Sample the comments dataset

In [5]:
# Randomly sample lines (sample_size=1000) from the comments NDJSON file into a DataFrame.
df_comments_sample = sample_ndjson(comments_data_path, sample_size=1000)

In [6]:
# Preview the first rows of the comments sample to inspect the available columns.
df_comments_sample.head()

Unnamed: 0,all_awardings,archived,associated_award,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_template_id,author_flair_text,author_flair_text_color,...,num_reports,replies,report_reasons,saved,updated_on,user_reports,rte_mode,steward_reports,media_metadata,author_cakeday
0,[],False,,[deleted],,,,,,dark,...,,,,,,,,,,
1,,False,,facepalm-germany,,,,,,,...,,,,,,,,,,
2,,True,,[deleted],,,,,,,...,,,,,,,,,,
3,[],False,,LakeSun,1542210000.0,,,,,,...,,,,,,,,,,
4,[],,,_humanFromSpace,,,,,,,...,,,,,,,,,,


## Sample the submissions dataset

In [7]:
# Randomly sample lines (sample_size=1000) from the submissions NDJSON file,
# then preview the first rows to inspect the available columns.
df_submissions_sample = sample_ndjson(submissions_data_path, sample_size=1000)
df_submissions_sample.head()

Unnamed: 0,archived,author,author_flair_css_class,author_flair_text,brand_safe,contest_mode,created_utc,distinguished,domain,edited,...,media_metadata,post_categories,rte_mode,_meta,updated_on,poll_data,previous_selftext,author_cakeday,gallery_data,is_gallery
0,False,usaperfectgirls,,,False,False,1511823773,,ovocollege.com,False,...,,,,,,,,,,
1,False,Primary_Map4140,,,,False,1612193408,,self.stocks,False,...,,,,,,,,,,
2,False,Leroy--Brown,,,,,1466785856,,self.stocks,False,...,,,,,,,,,,
3,False,cowaterdog73,,,,False,1620532739,,self.climatechange,False,...,,,,,,,,,,
4,False,h3xadecimal2,,,,False,1618518850,,self.ArtificialInteligence,False,...,,,,,,,,,,
