In [101]:
import pandas as pd
import random
import re
import numpy as np

In [102]:
# Step 1: simulate scrambling the original data
# TODO: Could maybe scramble more?

def scramble_data(df: pd.DataFrame):
    scrambled_lines = []
    
    for _, row in df.iterrows():
        # Extract each column's value
        type_field = row['type']
        id_field = row['id']
        subreddit_id = row['subreddit.id']
        subreddit_name = row['subreddit.name']
        subreddit_nsfw = row['subreddit.nsfw']
        created_utc = row['created_utc']
        permalink = row['permalink']
        # Replace '|' in body to avoid conflicts with new delimiter
        body = row['body'].replace('|', '__PIPE__')  
        sentiment = row['sentiment']
        score = row['score']

        # Create a scrambled line with the pipe as delimiter
        row_str = f"{type_field}|{id_field}|{subreddit_id}|{subreddit_name}|{subreddit_nsfw}|{created_utc}|{permalink}|{sentiment}|{score}|{body}\n"

        # Randomly break the row into two parts
        if random.random() > 0.7:  # 30% chance to split a row into two parts
            split_point = len(row_str) // 2
            row_str = row_str[:split_point] + '\n' + row_str[split_point:]

        scrambled_lines.append(row_str)

    scrambled_data = "".join(scrambled_lines)
    return scrambled_data

In [103]:
# Step 2: cleaning the scrambled data
# TODO: This should be done in SPARK? (Lecturer did something to the body field here for body text in emails)

def clean_data(scrambled_data: str):
    # Split the unstructured data into individual lines
    lines = scrambled_data.split('\n')

    # Step 1: Rejoin broken lines
    cleaned_lines = []
    current_line = ""

    for line in lines:
        if line.strip():  # Ignore empty lines
            current_line += line.strip() + " "  # Append with a space to avoid issues
            # Check if we have a complete line
            if current_line.count('|') == 9:  # Exactly 10 fields
                cleaned_lines.append(current_line.strip())  # Add completed line
                current_line = ""  # Reset for the next line

    # Handle any remaining line that wasn't appended
    if current_line:
        cleaned_lines.append(current_line.strip())

    # Step 2: Parse each cleaned line back into columns
    parsed_data = []
    error_data = []  # List to hold error information

    for i, line in enumerate(cleaned_lines):
        fields = line.split('|')

        # Ensure we have at least 10 fields before processing
        if len(fields) < 10:
            error_info = {
                'line_index': i,
                'fields_found': fields,
                'original_line': line
            }
            error_data.append(error_info)
            continue
        
        row_dict = {
            'type': fields[0],
            'id': fields[1],
            'subreddit.id': fields[2],
            'subreddit.name': fields[3],
            'subreddit.nsfw': fields[4],
            'created_utc': fields[5],
            'permalink': fields[6],
            'sentiment': fields[7],
            'score': fields[8],
            'body': fields[9].replace('__PIPE__', '|')  # Restore original pipe symbol
        }
        parsed_data.append(row_dict)

    # Optionally, return both parsed data and error data
    return pd.DataFrame(parsed_data), error_data


In [104]:
# Example usage
NUM_ROWS = 20

# Load a small subset of the CSV (for testing)
input_file = 'data/subset/100000-reddit-covid-comments.csv'
df = pd.read_csv(input_file).head(NUM_ROWS)  # Load first N rows for testing

In [105]:
# Step 1: Scramble the dataset and save to a .txt file
scrambled_text = scramble_data(df)
scrambled_file = "data/scrambled/scrambled-reddit-covid-comments.txt"

with open(scrambled_file, 'w') as file:
    file.write(scrambled_text)

print(f"Scrambled text (first {NUM_ROWS} lines):\n")
with open(scrambled_file, 'r') as file:
    scrambled_data = file.readlines()
    for line in scrambled_data[:NUM_ROWS]:
        print(line.strip())

Scrambled text (first 20 lines):

comment|hi0xdct|2qh7q|florida|False|1635191579|https://old.reddit.com/r/florida/comments/qf3xp4/desantis_recruiting_unvaccinated_out_of_state/hi0xdct/|-0.5666|-1|&gt; COVID-19 is legitimately dangerous*

*for the old and/or unhealthy.

Average age of death is like 80. 76% of all deaths are 65+.

&gt; we STILL don't know what the long-term effects are

[The CDC estimates 120.2 million cases from 2/20 to 5/21 with 6.2 million hospitalizations.](https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/burden.html)

How many people do you think are walking around with "long-term effects"? How long is "long-term"?

What if, much like every other disease, there are none, except from severe cases or being absurdly unlucky?

&gt; ~750k wasn't enough

Considering all the ignored red flags raised about [the PCR test](https://www.nytimes.com/2020/08/29/health/coronavirus-testing.html), we have very good reason to think that number is way too high.

&gt; "In Massac

In [106]:
# Step 2: Load the scrambled .txt file (to simulate real-world scenario)
with open(scrambled_file, 'r') as file:
    scrambled_data = file.read()

# Clean the dataset (bring it back to a clean state)
df_cleaned, error_data = clean_data(scrambled_data)

print(f"\nCleaned Data (first {NUM_ROWS} rows):")
print(df_cleaned.head(NUM_ROWS))

# Step 3: Save the cleaned data back to a Parquet file
cleaned_output_file_parquet = "data/cleaned/cleaned-reddit-covid-comments.parquet"
df_cleaned.to_parquet(cleaned_output_file_parquet, index=False)

# Also save the cleaned data back to a CSV for easier comparison
cleaned_output_file = "data/cleaned/cleaned-reddit-covid-comments.csv"
df_cleaned.to_csv(cleaned_output_file, index=True)

print(f"Cleaned data saved to {cleaned_output_file}")


Cleaned Data (first 20 rows):
                                                 type       id subreddit.id  \
0                                             comment  hi0xdct        2qh7q   
1   *for the old and/or unhealthy. Average age of ...  hi16118        2y77d   
2                                             comment  hi1mkh7        2qhsa   
3                                             comment  hi15pqu        2z2wm   
4                                             comment  hi16y0z        2qqd2   
5                                             comment  hi0wt1o        2wtmm   
6   And tbh the vaccine is less effective than hav...  hi1sjao        2yrq6   
7                                             comment  hi132e0        2qh1i   
8   to www.vigiaccess.org scroll down and accept t...  hi0wo8l        2qtwb   
9   But sadly, under normally circumstances (not a...  hi1t85o       2n4vyh   
10  Is not the same as this paste: https://www.tra...  hi1gjs2        2tasy   
11  One in Montreal; 

In [107]:
# Check for any errors
if len(error_data) > 0:
    print(f"\nTotal Errors Encountered: {len(error_data)}")
    for error in error_data:
        print(f"Error in line {error['line_index']}: {error['fields_found']}")
else:
    print("No errors found.")

No errors found.
