In [49]:
'''Imports'''
import json
import os
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [50]:
# Folder that holds the two ndjson dumps. Set the STOCKS_DATA_DIR environment
# variable to point somewhere else; the default keeps the original location so
# the notebook still runs unchanged on the machine it was written on.
DATA_DIR = Path(os.environ.get("STOCKS_DATA_DIR", r"C:\Users\gungo\OneDrive\Desktop"))
data_dir_comments = DATA_DIR / "stocks_comments.ndjson"
data_dir_sub = DATA_DIR / "stocks_submissions.ndjson"

# lines=True: each line of the file is parsed as one JSON record.
df_com = pd.read_json(data_dir_comments, lines=True)
df_sub = pd.read_json(data_dir_sub, lines=True)

In [51]:
# Keep only the timestamp, score and text of each record. The explicit copy
# makes the reduced frames independent of the raw ones, so adding columns to
# them later neither touches the raw data nor triggers SettingWithCopyWarning.
df_com_reduced = df_com[['created_utc','score','body']].copy()
df_sub_reduced = df_sub[['created_utc','score','selftext']].copy()

In [59]:
'''Pre-Processing'''
# Fetch the NLTK stop-word corpus (quiet=True suppresses the download log) so
# it is available to stopwords.words() on the next line.
nltk.download('stopwords', quiet=True)
# English stop words as a set, used for membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))
# Single PorterStemmer instance reused for every token in preprocess_text.
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalize one piece of raw text into a string of stemmed keywords.

    The text is lower-cased, every character outside ``a-z`` is replaced by a
    space, stop words are dropped and each remaining word is stemmed.

    Args:
        text: Raw text. Anything that is not a ``str`` (for example the NaN
            pandas uses for a missing value, or ``None``) is treated as empty.

    Returns:
        str: The stemmed words joined by single spaces; ``""`` when nothing
        remains or when the input is not a string.
    """
    # Missing values arrive as float NaN / None; calling ``.lower()`` on them
    # would raise AttributeError, so they are mapped to an empty result.
    if not isinstance(text, str):
        return ""
    # Lower-case first so the a-z class below keeps former upper-case letters.
    text = text.lower()
    # Punctuation, digits and any other non a-z character become separators.
    text = re.sub('[^a-z]', ' ', text)
    words = (word for word in text.split() if word not in stop_words)
    return ' '.join(stemmer.stem(word) for word in words)

def preprocess(df):
    """Drop removed/missing posts and add a ``"processed text"`` column.

    The text column is ``'body'`` when present, otherwise ``'selftext'``.
    Rows whose text contains ``'[removed]'`` are dropped, as are rows whose
    text is not a string (e.g. NaN), which previously raised ``TypeError``.
    The cleaned text is computed with ``preprocess_text`` and stored next to
    the row it came from, so text and metadata stay aligned.

    Note: a later cell in this notebook defines another ``preprocess`` that
    shadows this one when the notebook is run top to bottom.

    Args:
        df (pd.DataFrame): Frame with a ``'body'`` or ``'selftext'`` column.
            It is not modified.

    Returns:
        pd.DataFrame: The surviving rows (original index labels kept) with an
        extra ``"processed text"`` column. If neither text column exists, the
        input is returned with an empty ``"processed text"`` column.
    """
    if 'body' in df.columns:
        text_col = 'body'
    elif 'selftext' in df.columns:
        text_col = 'selftext'
    else:
        # No text column to clean: keep the historical result, i.e. the input
        # plus a "processed text" column that holds no values.
        return pd.concat([df, pd.DataFrame(columns=["processed text"])], axis=1)

    # One boolean per row: keep real strings that are not removal markers.
    keep = df[text_col].map(
        lambda value: isinstance(value, str) and '[removed]' not in value
    ).astype(bool).to_numpy()

    # Copy before adding the column so the caller's frame is left untouched.
    result = df.loc[keep].copy()
    # Assigning on the filtered frame keeps each text on its own row; the old
    # concat(axis=1) paired a 0..n-1 index with the original index labels.
    result["processed text"] = result[text_col].map(preprocess_text)
    return result

In [71]:
def preprocess_text(text):
    """Turn raw post text into a space-separated string of stemmed tokens.

    Non-string input yields ``""``. Otherwise the text is lower-cased, the
    ``&amp;`` entity is dropped, whitespace runs are collapsed, bracketed
    segments and ``http...`` URLs are stripped, stop words are filtered out
    and the remaining tokens are stemmed.

    Tokens are split on whitespace only, so punctuation stays attached to the
    word it follows (e.g. ``"days."``).
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        ('&amp;', ''),          # HTML-escaped ampersand left in the raw data
        (r'\s+', ' '),          # collapse runs of whitespace
        (r'\[[^]]*\]', ''),     # anything wrapped in square brackets
        (r'http\S+', ''),       # URLs
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept_tokens = [token for token in cleaned.split() if token not in stop_words]
    return ' '.join(map(stemmer.stem, kept_tokens))

def preprocess(df):
    """Filter out unusable posts and add a ``processed_text`` column.

    The text column is ``'body'`` when present, otherwise ``'selftext'``.
    Rows whose text is missing or contains ``[removed]`` / ``[deleted]`` are
    dropped; the remaining texts are cleaned with ``preprocess_text``.

    Args:
        df (pd.DataFrame): Frame with a ``'body'`` or ``'selftext'`` column.
            It is not modified.

    Returns:
        pd.DataFrame: A new frame containing the surviving rows, re-indexed
        from 0, with the extra ``processed_text`` column.

    Raises:
        ValueError: If the frame has neither a ``'body'`` nor a ``'selftext'``
            column.
    """
    # Determine which column to process ('body' wins when both exist).
    if 'body' in df.columns:
        text_col = 'body'
    elif 'selftext' in df.columns:
        text_col = 'selftext'
    else:
        raise ValueError("DataFrame must contain either 'body' or 'selftext' column.")

    # Drop missing texts first so the string search only sees real values.
    present = df[df[text_col].notna()]
    # A single pass flags both placeholder markers.
    is_placeholder = present[text_col].str.contains(r'\[(?:removed|deleted)\]', na=False)
    kept = present.loc[~is_placeholder]

    # assign() builds a new frame instead of writing into a filtered slice,
    # which avoids SettingWithCopyWarning and leaves the caller's data intact.
    return kept.assign(
        processed_text=kept[text_col].apply(preprocess_text)
    ).reset_index(drop=True)

In [75]:
# Clean both corpora with the shared pipeline: the submissions frame carries
# its text in 'selftext', the comments frame in 'body'.
processed_sub = preprocess(df_sub_reduced)
processed_com = preprocess(df_com_reduced)

In [76]:
# Only the last expression of a cell is rendered automatically, so the
# submissions preview must be displayed explicitly or it is silently dropped.
display(processed_sub.head(15))
processed_com.head(15)

Unnamed: 0,created_utc,score,body,processed_text
0,1654041658,-1,Musk is a clown. He knew 50% of his followers ...,musk clown. knew 50% follow bots. knew twitter...
1,1654041696,100,What's the cumulative short loss? $50 billion ...,what' cumul short loss? $50 billion counting? ...
2,1654041706,2,"Quantum computing is physics, but physics isn'...","quantum comput physics, physic business. probl..."
3,1654041743,62,MANGA,manga
4,1654041839,8,AMD?\n\nThey sell on the merits of their produ...,"amd? sell merit products, open sourc softwar s..."
5,1654041840,12,Highly coincidental that this drastic drop in ...,highli coincident drastic drop price happen im...
6,1654041851,2,"Of course you can time the market, on a macro ...","cours time market, macro basi - follow fed. do..."
7,1654041856,1,However the issue is with the decay. It may s...,howev issu decay. may show 100% gain hit targe...
8,1654041865,13,They exclude the 5% they know about.\n\nAnd it...,exclud 5% know about. matter. advertis get eng...
9,1654041907,11,The board dgaf what Dorsey days.,board dgaf dorsey days.
