In [4]:
'''Imports'''
import json
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [5]:
'''Reading Data'''
# Directory holding the two NDJSON exports. It can be overridden with the
# STOCKS_DATA_DIR environment variable so the notebook also runs on machines
# other than the original author's; the default keeps the original location.
DATA_DIR = os.environ.get("STOCKS_DATA_DIR", r"C:\Users\gungo\OneDrive\Desktop")
data_dir_comments = os.path.join(DATA_DIR, "stocks_comments.ndjson")
data_dir_sub = os.path.join(DATA_DIR, "stocks_submissions.ndjson")

# lines=True: each file is read as newline-delimited JSON (one record per line).
df_com = pd.read_json(data_dir_comments, lines=True)
df_sub = pd.read_json(data_dir_sub, lines=True)

In [6]:
'''Reducing Data'''
# Both reduced frames keep the timestamp and score columns plus their own
# text column ('body' for comments, 'selftext' for submissions).
reduced_base_cols = ['created_utc', 'score']
df_com_reduced = df_com[reduced_base_cols + ['body']]
df_sub_reduced = df_sub[reduced_base_cols + ['selftext']]

In [7]:
'''Pre-Processing'''
# Make the NLTK stopword corpus available; quiet=True suppresses the
# downloader's console output.
nltk.download('stopwords', quiet=True)

# Porter stemmer instance (not applied by the current preprocess_text pipeline).
stemmer = PorterStemmer()

# English stopwords as a set for fast membership checks during filtering.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean one raw text field and drop English stopwords.

    The text is lowercased, the escaped entities ``&amp;#x200B;`` (zero-width
    space) and ``&amp;`` are removed, Markdown links, any other
    square-bracketed text and bare URLs are stripped, and the remaining
    whitespace-separated words are filtered against the module-level
    ``stop_words`` set.

    Stemming is not applied: the module-level ``stemmer`` is not used here.
    To enable it, map ``stemmer.stem`` over the kept words before joining.

    Args:
        text: Raw text. Any non-``str`` value (e.g. ``None`` or NaN) is
            treated as missing.

    Returns:
        str: The remaining words joined by single spaces; ``""`` for
        non-string input or when nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # The text is already lowercased at this point, so the entity has to be
    # matched in lowercase; the previous uppercase 'B' pattern never matched
    # and left a '#x200b;' token behind once '&amp;' was stripped.
    text = re.sub(r'&amp;#x200b;', '', text)
    text = re.sub('&amp;', '', text)  # drop the escaped ampersand (&amp; corresponds to &)
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
    # Remove Markdown links "[label](http...)" as a unit; handling the label
    # and the URL separately would leave the opening parenthesis as a token.
    text = re.sub(r'\[[^]]*\]\(http[^)\s]*\)', '', text)
    text = re.sub(r'\[[^]]*\]', '', text)  # remove text in square brackets
    text = re.sub(r'http\S+', '', text)  # remove URLs
    # split() also discards the extra spaces left by the removals above.
    return ' '.join(word for word in text.split() if word not in stop_words)

def preprocess(df):
    """Drop unusable rows and add a cleaned ``processed_text`` column.

    The text column is ``'body'`` when present, otherwise ``'selftext'``.
    Rows whose text is missing, or contains the ``[removed]`` or ``[deleted]``
    placeholder anywhere in it, are dropped.

    Args:
        df: DataFrame with a ``'body'`` or ``'selftext'`` column.

    Returns:
        pandas.DataFrame: A new frame holding the surviving rows with a fresh
        0..n-1 index and an added ``processed_text`` column produced by
        ``preprocess_text``.

    Raises:
        ValueError: If neither ``'body'`` nor ``'selftext'`` is a column.
    """
    # Determine which column to process
    if 'body' in df.columns:
        text_col = 'body'
    elif 'selftext' in df.columns:
        text_col = 'selftext'
    else:
        raise ValueError("DataFrame must contain either 'body' or 'selftext' column.")

    # Drop missing text first so the .str accessor only sees real values,
    # then drop rows containing either placeholder marker.
    kept = df[df[text_col].notna()]
    is_placeholder = kept[text_col].str.contains(r'\[removed\]|\[deleted\]', na=False)
    kept = kept[~is_placeholder]

    # assign() builds a new frame instead of writing a column into a filtered
    # slice, which is what triggered pandas' SettingWithCopyWarning before.
    return kept.assign(
        processed_text=kept[text_col].apply(preprocess_text)
    ).reset_index(drop=True)

# Run the shared cleaning pipeline on both reduced frames: submissions carry
# their text in 'selftext', comments in 'body'.
pre_processed_sub = preprocess(df_sub_reduced)
pre_processed_com = preprocess(df_com_reduced)

In [None]:
'''Comment Linking'''
# Column subsets that additionally keep the identifier columns ('id', and
# 'parent_id' for comments) -- presumably so comments can be matched to their
# parent; TODO confirm once the linking step is written.
# Handy inspection calls: df_com.info(), df_sub.info(), df_com_test.head(20)
link_cols_com = ['created_utc', 'score', 'id', 'parent_id', 'body']
link_cols_sub = ['created_utc', 'score', 'id', 'selftext']
df_com_test = df_com[link_cols_com]
df_sub_test = df_sub[link_cols_sub]

In [13]:
'''Display'''
# NOTE: while the preview below stays commented out, the label string above is
# the cell's last expression, so the notebook echoes it as the cell output.
#pre_processed_sub.head(20)

'Display'

In [None]:
'''Export'''
# Uncomment to write the cleaned submissions to 'preprocessed-sub.csv' in the
# current working directory, without the index column.
#pre_processed_sub.to_csv('preprocessed-sub.csv', index=False)