In [105]:
import csv
import os
import random
import re

import numpy as np
import pandas as pd


In [106]:
# Step 1: simulate scrambling the original data
# TODO: Could maybe scramble more?

def scramble_data(df: pd.DataFrame, seed=None) -> str:
    """Serialise ``df`` into a deliberately messy pipe-delimited text blob.

    Every row is written as ten ``|``-separated fields in the order
    type, id, subreddit.id, subreddit.name, subreddit.nsfw, created_utc,
    sentiment, permalink, body, score, followed by a newline.  Along the way:

    * ``type`` is replaced by a boolean (``True`` when it equals ``"comment"``);
    * literal ``|`` characters in the body become ``__PIPE__`` so they cannot
      be mistaken for delimiters;
    * about 30% of the records get an extra newline inserted at their
      midpoint, wherever that happens to fall.

    Args:
        df: Frame containing the ten columns listed above.
        seed: Optional seed for the record-splitting randomness.  When given,
            a private ``random.Random(seed)`` is used so the output is
            reproducible; when ``None`` (the default) the module-level
            ``random`` state is used.

    Returns:
        The scrambled text as a single string.
    """
    # Either the shared module-level generator or a private, seeded one.
    rng = random if seed is None else random.Random(seed)
    scrambled_lines = []

    for _, row in df.iterrows():
        # Some scramble logic here: the type label is stored as a boolean.
        is_comment = (row['type'] == "comment")

        # pandas loads an empty CSV cell as NaN (a float), which has no
        # .replace(); treat a missing body as empty text instead of crashing.
        raw_body = row['body']
        body_text = '' if pd.isna(raw_body) else str(raw_body)
        # Replace '|' in body to avoid conflicts with new delimiter
        body = body_text.replace('|', '__PIPE__')

        # Create a "scrambled" line with the "|" as delimiter and moving some parts around
        row_str = (
            f"{is_comment}|{row['id']}|{row['subreddit.id']}|{row['subreddit.name']}"
            f"|{row['subreddit.nsfw']}|{row['created_utc']}|{row['sentiment']}"
            f"|{row['permalink']}|{body}|{row['score']}\n"
        )

        # Randomly break the row into two parts
        if rng.random() > 0.7:  # 30% chance to split a row into two parts
            split_point = len(row_str) // 2
            row_str = row_str[:split_point] + '\n' + row_str[split_point:]

        scrambled_lines.append(row_str)

    return "".join(scrambled_lines)

In [107]:
# Define a function to clean body text by removing both single and double quotes from the beginning and end
def clean_body_text(body):
    """Normalise a body field recovered from the scrambled text.

    Three passes are applied in order:

    1. every ``__PIPE__`` placeholder is turned back into ``|``;
    2. one quote character (single or double) is removed from the start of the
       text, together with any whitespace/commas in front of it, and one from
       the end, together with any whitespace/commas behind it;
    3. every run of whitespace (including newlines) collapses to one space and
       the result is trimmed.

    Returns the cleaned string.
    """
    with_pipes = body.replace('__PIPE__', '|')
    without_edge_quotes = re.sub(r"^[\s,]*['\"]|['\"][\s,]*$", '', with_pipes)
    single_spaced = re.sub(r'\s+', ' ', without_edge_quotes)
    return single_spaced.strip()

In [108]:
# Step 2: cleaning the scrambled data
# TODO: This should be done in SPARK (Lecturer did something to the body field here for body text in emails)

def clean_data(scrambled_data: str):
    """Reassemble and parse the pipe-delimited text produced by the scramble step.

    The text is processed in two passes:

    1. Physical lines are stripped, blank ones dropped, and the rest glued
       together (separated by a single space) until the accumulated text holds
       the nine ``|`` delimiters of a ten-field record.
    2. Each reassembled record is split on ``|`` and mapped back to named
       columns.  The boolean-looking type and nsfw fields become the labels
       ``comment``/``post`` and ``nsfw``/``not_nsfw``, the body goes through
       ``clean_body_text`` and ``created_utc`` (epoch seconds) is rendered as
       ``YYYY-MM-DD HH:MM:SS``.

    Records that do not have exactly ten fields, or whose ``created_utc`` is
    not a usable integer timestamp, are reported instead of parsed.

    Args:
        scrambled_data: Raw text; a record may span several physical lines.

    Returns:
        ``(DataFrame, errors)``.  The frame always has the ten output columns,
        even when no record parsed.  ``errors`` is a list of dicts with the
        keys ``line_index`` (index of the reassembled record),
        ``fields_found``, ``original_line`` and ``reason``.
    """
    expected_fields = 10
    columns = [
        'type', 'id', 'subreddit_id', 'subreddit_name', 'subreddit_nsfw',
        'created_utc', 'sentiment', 'permalink', 'body', 'score',
    ]

    # Step 1: Rejoin broken lines
    cleaned_lines = []
    current_line = ""

    for line in scrambled_data.split('\n'):
        fragment = line.strip()
        if not fragment:  # Ignore empty lines
            continue
        current_line += fragment + " "  # Append with a space to avoid issues
        # `>=` rather than `==`: a fragment carrying stray delimiters can jump
        # straight past nine, and an equality test would then never fire again,
        # swallowing every following record into one giant line.
        if current_line.count('|') >= expected_fields - 1:
            cleaned_lines.append(current_line.strip())
            current_line = ""

    # Handle any remaining line that wasn't appended
    if current_line:
        cleaned_lines.append(current_line.strip())

    # Step 2: Parse each cleaned line back into columns
    parsed_data = []
    error_data = []  # List to hold error information

    for i, line in enumerate(cleaned_lines):
        raw_fields = line.split('|')

        # Anything other than exactly ten fields cannot be mapped to columns
        # reliably, so it is reported rather than guessed at.
        if len(raw_fields) != expected_fields:
            error_data.append({
                'line_index': i,
                'fields_found': raw_fields,
                'original_line': line,
                'reason': f"expected {expected_fields} fields, found {len(raw_fields)}",
            })
            continue

        # Re-joining fragments inserts a space at the break; when the break sat
        # next to a delimiter that space would otherwise defeat the exact
        # "True" comparisons below.
        fields = [value.strip() for value in raw_fields]
        created_utc = fields[5]

        # Convert the timestamp to a human-readable format; a bad value is an
        # error for this record only, not a reason to abort the whole run.
        try:
            created_utc_human = pd.to_datetime(int(created_utc), unit='s').strftime('%Y-%m-%d %H:%M:%S')
        except (ValueError, OverflowError) as exc:
            error_data.append({
                'line_index': i,
                'fields_found': raw_fields,
                'original_line': line,
                'reason': f"invalid created_utc {created_utc!r}: {exc}",
            })
            continue

        parsed_data.append({
            'type': "comment" if fields[0] == "True" else "post",  # Convert back to string
            'id': fields[1],
            'subreddit_id': fields[2],
            'subreddit_name': fields[3],
            'subreddit_nsfw': "nsfw" if fields[4] == "True" else "not_nsfw",  # Convert to string
            'created_utc': created_utc_human,
            'sentiment': fields[6],
            'permalink': fields[7],
            'body': clean_body_text(fields[8]),
            'score': fields[9],
        })

    # Passing `columns` keeps the schema stable even when nothing parsed.
    return pd.DataFrame(parsed_data, columns=columns), error_data


In [109]:
# Example usage
NUM_ROWS = 20  # how many records go through the scramble/clean round trip

# Load a small subset of the CSV (for testing)
input_file = 'data/subset/100000-reddit-covid-comments.csv'
# nrows stops the parser after NUM_ROWS records instead of reading the whole
# file and then discarding everything but the head.
df = pd.read_csv(input_file, nrows=NUM_ROWS)

In [110]:
# Step 1: Scramble the dataset and save to a .txt file
RANDOM_SEED = 42
# scramble_data draws from the module-level `random` state; seeding it here
# makes the scrambled file identical on every Restart & Run All.
random.seed(RANDOM_SEED)
scrambled_text = scramble_data(df)
scrambled_file = "data/scrambled/scrambled-reddit-covid-comments.txt"

# The output folder may not exist on a fresh checkout.
os.makedirs(os.path.dirname(scrambled_file), exist_ok=True)
with open(scrambled_file, 'w') as file:
    file.write(scrambled_text)

print(f"Scrambled text (first {NUM_ROWS} lines):\n")
with open(scrambled_file, 'r') as file:
    # Separate name so `scrambled_data` is not a list here and a str in the next cell.
    scrambled_preview_lines = file.readlines()
    for line in scrambled_preview_lines[:NUM_ROWS]:
        print(line.strip())

Scrambled text (first 20 lines):

True|hi0xdct|2qh7q|florida|False|1635191579|-0.5666|https://old.reddit.com/r/florida/comments/qf3xp4/desantis_recruiting_unvaccinated_out_of_state/hi0xdct/|&gt; COVID-19 is legitimately dangerous*

*for the old and/or unhealthy.

Average age of death is like 80. 76% of all deaths are 65+.

&gt; we STILL don't know what the long-term effects are

[The CDC estimates 120.2 million cases from 2/20 to 5/21 with 6.2 million hospitalizations.](https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/burden.html)

How many people do you think are walking around with "long-term effects"? How long is "long-term"?

What if, much like every other disease, there are none, except from severe cases or being absurdly unlucky?

&gt; ~750k wasn't enough

Considering all the ignored red flags raised about [the PCR test](https://www.nytimes.com/2020/08/29/health/coronavirus-testing.html), we have very good reason to think that number is way too high.

&gt; "In Massachusett

In [111]:
# Step 2: Load the scrambled .txt file (to simulate real-world scenario)
with open(scrambled_file, 'r') as file:
    scrambled_data = file.read()

# Clean the dataset (bring it back to a clean state)
df_cleaned, error_data = clean_data(scrambled_data)

print(f"\nCleaned Data (first {NUM_ROWS} rows):")
print(df_cleaned.head(NUM_ROWS))

# Step 3: Save the cleaned data back to a Parquet file
cleaned_output_file_parquet = "data/cleaned/cleaned-reddit-covid-comments.parquet"
# The output folder may not exist on a fresh checkout.
os.makedirs(os.path.dirname(cleaned_output_file_parquet), exist_ok=True)
df_cleaned.to_parquet(cleaned_output_file_parquet, index=False)

# Also save the cleaned data back to a CSV for easier comparison
cleaned_output_file = "data/cleaned/cleaned-reddit-covid-comments.csv"
os.makedirs(os.path.dirname(cleaned_output_file), exist_ok=True)
df_cleaned.to_csv(cleaned_output_file, index=False)

print(f"Cleaned data saved to {cleaned_output_file}")


Cleaned Data (first 20 rows):
       type       id subreddit_id         subreddit_name subreddit_nsfw  \
0   comment  hi0xdct        2qh7q                florida       not_nsfw   
1   comment  hi16118        2y77d               antiwork       not_nsfw   
2   comment  hi1mkh7        2qhsa      interestingasfuck       not_nsfw   
3   comment  hi15pqu        2z2wm          cryptomarkets       not_nsfw   
4   comment  hi16y0z        2qqd2        greenbaypackers       not_nsfw   
5   comment  hi0wt1o        2wtmm             edcorlando       not_nsfw   
6   comment  hi1sjao        2yrq6         publicfreakout       not_nsfw   
7   comment  hi132e0        2qh1i              askreddit       not_nsfw   
8   comment  hi0wo8l        2qtwb              childfree       not_nsfw   
9   comment  hi1t85o       2n4vyh          anime_titties       not_nsfw   
10  comment  hi1gjs2        2tasy  personalfinancecanada       not_nsfw   
11  comment  hi0q4vl        2qh33                  funny       not_ns

In [112]:
# Report the records that clean_data could not parse, if any
if not error_data:
    print("No errors found.")
else:
    print(f"\nTotal Errors Encountered: {len(error_data)}")
    # One line per rejected record: its index and the fields it split into.
    for error in error_data:
        print(f"Error in line {error['line_index']}: {error['fields_found']}")

No errors found.
