In [49]:
'''Imports'''
import json
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [50]:
data_dir_comments = r"C:\Users\gungo\OneDrive\Desktop\stocks_comments.ndjson"
data_dir_sub = r"C:\Users\gungo\OneDrive\Desktop\stocks_submissions.ndjson"
df_com = pd.read_json(data_dir_comments, lines=True)
df_sub = pd.read_json(data_dir_sub, lines=True)

In [51]:
df_com_reduced = df_com[['created_utc','score','body']]
df_sub_reduced = df_sub[['created_utc','score','selftext']]

In [None]:
'''Pre-Processing'''
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    """Cleans, tokenizes, removes stopwords, and stems text."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub('&amp;', '', text) # remove some special characters from the data &amp; corresponds to &
    text = re.sub(r'\s+', ' ', text)  # eliminate duplicate whitespaces using regex
    text = re.sub(r'\[[^]]*\]', '', text)  # remove text in square brackets
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)
    return text

def preprocess(df):
    rows_to_delete = []
    df_text = pd.DataFrame(columns=["processed text"])
    if 'body' in df.columns:
        for index, row in df.iterrows():
            if '[removed]' in row['body']:
                rows_to_delete.append(index)
            else:
                df_text.loc[len(df_text)] = preprocess_text(row['body'])
    elif 'selftext' in df.columns:
        for index, row in df.iterrows():
            if '[removed]' in row['selftext']:
                rows_to_delete.append(index)
            else:
                df_text.loc[len(df_text)] = preprocess_text(row['selftext'])
    df_reduced = df.drop(rows_to_delete)
    result = pd.concat([df_reduced, df_text], axis=1)
    return result

In [71]:
def preprocess_text(text):
    """Cleans, tokenizes, removes stopwords, and stems text."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub('&amp;', '', text) # remove some special characters from the data &amp; corresponds to &
    text = re.sub(r'\s+', ' ', text)  # eliminate duplicate whitespaces using regex
    text = re.sub(r'\[[^]]*\]', '', text)  # remove text in square brackets
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)
    return text

def preprocess(df):
    """Preprocesses the 'body' or 'selftext' column and removes '[removed]' entries."""
    # Determine which column to process
    if 'body' in df.columns:
        text_col = 'body'
    elif 'selftext' in df.columns:
        text_col = 'selftext'
    else:
        raise ValueError("DataFrame must contain either 'body' or 'selftext' column.")

    # Remove NaN and '[removed]' rows
    df = df[df[text_col].notna()]
    df = df[~df[text_col].str.contains(r'\[removed\]', na=False)]
    df = df[~df[text_col].str.contains(r'\[deleted\]', na=False)]

    # Apply text preprocessing
    df['processed_text'] = df[text_col].apply(preprocess_text)

    return df.reset_index(drop=True)

In [75]:
processed_sub = preprocess(df_sub_reduced)
processed_com = preprocess(df_com_reduced)

In [86]:
df_com.info()
df_sub.info()
df_com_test = df_com[['created_utc','score','id','parent_id','body']]
df_sub_test = df_sub[['created_utc','score','id','selftext']]
print(df_com_test.head(20))
print(df_sub_test.head(20))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689866 entries, 0 to 689865
Data columns (total 52 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   all_awardings                    689866 non-null  object 
 1   archived                         689866 non-null  bool   
 2   associated_award                 0 non-null       float64
 3   author                           689866 non-null  object 
 4   author_created_utc               603083 non-null  float64
 5   author_flair_background_color    79536 non-null   object 
 6   author_flair_css_class           0 non-null       float64
 7   author_flair_richtext            610330 non-null  object 
 8   author_flair_template_id         0 non-null       float64
 9   author_flair_text                0 non-null       float64
 10  author_flair_text_color          79536 non-null   object 
 11  author_flair_type                610330 non-null  object 
 12  au

In [87]:
processed_sub.head(20)

Unnamed: 0,created_utc,score,selftext,processed_text
0,1654089814,7,Is it stupid to purchase the same stock in a d...,stupid purchas stock differ account? example: ...
1,1654092072,13,"Electric vehicle giant Tesla, Inc. (TSLA) in A...","electr vehicl giant tesla, inc. (tsla) austin,..."
2,1654093064,165,AAPL is moving some of its iPad production fro...,aapl move ipad product china vietnam strict co...
3,1654093501,2,I got a second call today about a newly listed...,got second call today newli list compani found...
4,1654093999,11,"Bought $APPS at \~43USD, thought it would cont...","bought $app \~43usd, thought would continu imm..."
5,1654094485,2954,https://www.cnbc.com/2022/06/01/jamie-dimon-sa...,jpmorgan chase ceo jami dimon say prepar bigge...
6,1654095324,1,If you ask someone to guess how many jellybean...,"ask someon guess mani jellybean jar, time gues..."
7,1654095499,6,Been searching but can’t find my specific answ...,search can’t find specif answer - bought one a...
8,1654095621,37,Watching Bloomberg Markets: Americas - they be...,watch bloomberg markets: america - believ ev s...
9,1654098372,6,Read full article: [https://www.cnbc.com/2022/...,"read full article: ( accord article, goldman s..."
