In [49]:
'''Imports'''
import json
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [50]:
data_dir_comments = r"C:\Users\gungo\OneDrive\Desktop\stocks_comments.ndjson"
data_dir_sub = r"C:\Users\gungo\OneDrive\Desktop\stocks_submissions.ndjson"
df_com = pd.read_json(data_dir_comments, lines=True)
df_sub = pd.read_json(data_dir_sub, lines=True)

In [51]:
df_com_reduced = df_com[['created_utc','score','body']]
df_sub_reduced = df_sub[['created_utc','score','selftext']]

In [53]:
'''Pre-Processing'''
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation, numbers, special characters, etc.
    text = re.sub('[^a-z]', ' ', text)
    # Remove stopwords and stem
    text = ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)
    return text

def preprocess(df):
    rows_to_delete = []
    df_text = pd.DataFrame(columns=["processed text"])
    if 'body' in df.columns:
        for index, row in df.iterrows():
            if '[removed]' in row['body']:
                rows_to_delete.append(index)
            else:
                df_text.loc[len(df_text)] = preprocess_text(row['body'])
    elif 'selftext' in df.columns:
        for index, row in df.iterrows():
            if '[removed]' in row['selftext']:
                rows_to_delete.append(index)
            else:
                df_text.loc[len(df_text)] = preprocess_text(row['selftext'])
    df = df.drop(rows_to_delete)
    result = pd.concat([df, df_text], axis=1)
    return result

In [56]:
processed_sub = preprocess(df_sub_reduced)

In [58]:
processed_sub.head(10)

Unnamed: 0,created_utc,score,selftext,processed text
6,1654090000.0,7.0,Is it stupid to purchase the same stock in a d...,http www cnbc com jami dimon say brace econom ...
9,1654092000.0,13.0,"Electric vehicle giant Tesla, Inc. (TSLA) in A...",watch bloomberg market america believ ev sale ...
11,1654093000.0,165.0,AAPL is moving some of its iPad production fro...,delet
13,1654094000.0,2.0,I got a second call today about a newly listed...,http sec report cik insid trade http sec repor...
14,1654094000.0,11.0,"Bought $APPS at \~43USD, thought it would cont...",hi wonder anyon knew tool report show stock di...
15,1654094000.0,1.0,[deleted],hi guy titl suggest look suggest regard non us...
16,1654094000.0,2954.0,https://www.cnbc.com/2022/06/01/jamie-dimon-sa...,delet
19,1654095000.0,1.0,If you ask someone to guess how many jellybean...,delet
20,1654095000.0,6.0,Been searching but can’t find my specific answ...,thought wwdc bullish bearish http www stockten...
21,1654096000.0,37.0,Watching Bloomberg Markets: Americas - they be...,amp x b tesla inc chief execut offic elon musk...
