In [1]:
'''Imports'''
import json
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
'''Reading Data'''
data_dir_comments = r"C:\Users\gungo\OneDrive\Desktop\stocks_comments.ndjson"
data_dir_sub = r"C:\Users\gungo\OneDrive\Desktop\stocks_submissions.ndjson"
df_com = pd.read_json(data_dir_comments, lines=True)
df_sub = pd.read_json(data_dir_sub, lines=True)

In [3]:
'''Reducing Data'''
df_com_reduced = df_com[['created_utc','score','body']]
df_sub_reduced = df_sub[['created_utc','score','selftext']]

In [4]:
'''Pre-Processing'''
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    """Cleans, tokenizes, removes stopwords, and stems text."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'&amp;#x200B;', '', text)
    text = re.sub('&amp;', '', text) # remove some special characters from the data &amp; corresponds to &
    text = re.sub(r'\s+', ' ', text)  # eliminate duplicate whitespaces using regex
    text = re.sub(r'\[[^]]*\]', '', text)  # remove text in square brackets
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = ' '.join(word for word in text.split() if word not in stop_words)
    #text = ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)
    return text

def preprocess(df):
    """Preprocesses the 'body' or 'selftext' column and removes '[removed]' entries."""
    # Determine which column to process
    if 'body' in df.columns:
        text_col = 'body'
    elif 'selftext' in df.columns:
        text_col = 'selftext'
    else:
        raise ValueError("DataFrame must contain either 'body' or 'selftext' column.")

    # Remove NaN and '[removed]' rows
    df = df[df[text_col].notna()]
    df = df[~df[text_col].str.contains(r'\[removed\]', na=False)]
    df = df[~df[text_col].str.contains(r'\[deleted\]', na=False)]

    # Apply text preprocessing
    df['processed_text'] = df[text_col].apply(preprocess_text)

    return df.reset_index(drop=True)

pre_processed_sub = preprocess(df_sub_reduced)
pre_processed_com = preprocess(df_com_reduced)

In [5]:
'''Comment Linking'''
#df_com.info()
#df_sub.info()
df_com_test = df_com[['created_utc','score','id','parent_id','body']]
df_sub_test = df_sub[['created_utc','score','id','selftext']]
#df_com_test.head(20)

In [7]:
'''Display'''
pre_processed_sub.head(20)

Unnamed: 0,created_utc,score,selftext,processed_text
0,1654089814,7,Is it stupid to purchase the same stock in a d...,stupid purchase stock different account? examp...
1,1654092072,13,"Electric vehicle giant Tesla, Inc. (TSLA) in A...","electric vehicle giant tesla, inc. (tsla) aust..."
2,1654093064,165,AAPL is moving some of its iPad production fro...,aapl moving ipad production china vietnam stri...
3,1654093501,2,I got a second call today about a newly listed...,got second call today newly listed company fou...
4,1654093999,11,"Bought $APPS at \~43USD, thought it would cont...","bought $apps \~43usd, thought would continue i..."
5,1654094485,2954,https://www.cnbc.com/2022/06/01/jamie-dimon-sa...,jpmorgan chase ceo jamie dimon says preparing ...
6,1654095324,1,If you ask someone to guess how many jellybean...,"ask someone guess many jellybeans jar, time gu..."
7,1654095499,6,Been searching but can’t find my specific answ...,searching can’t find specific answer - bought ...
8,1654095621,37,Watching Bloomberg Markets: Americas - they be...,watching bloomberg markets: americas - believe...
9,1654098372,6,Read full article: [https://www.cnbc.com/2022/...,"read full article: ( according article, goldma..."


In [11]:
'''Search'''
def search_all_pattern(pattern, data):
    if 'body' in data.columns:
        text_col = 'body'
    elif 'selftext' in data.columns:
        text_col = 'selftext'
    else:
        raise ValueError("DataFrame must contain either 'body' or 'selftext' column.")
    # list to store our data
    all_matches = []

    # keep track of the comment index to store it in the output (acts as a unique identifier for each comment in this simple example)
    i = 0

    # iterate through the comments
    for comment in data[text_col]:

        # search for our regex pattern
        matches = re.finditer(pattern, comment)

        # iterate through all matches in the comment
        for match in matches:
            # lets store all and the matches start and end in a json object in our list (one json object for each match)
            all_matches.append({'comment_index': i, 'comment': comment, "match_start": match.start(), "match_end": match.end()})

        # increase the comment index
        i+=1
    # return the matching comments and the start/end of each match in a pandas DataFrame
    return pd.DataFrame(all_matches)

regex_pattern = 'hey'
all_matches = search_all_pattern(regex_pattern, pre_processed_sub)
print(f"Overall we find {all_matches.shape[0]} matches for '{regex_pattern}' in our dataset!")
all_matches[:20]

Overall we find 4217 matches for 'hey' in our dataset!


Unnamed: 0,comment_index,comment,match_start,match_end
0,3,I got a second call today about a newly listed...,176,179
1,6,If you ask someone to guess how many jellybean...,570,573
2,6,If you ask someone to guess how many jellybean...,999,1002
3,6,If you ask someone to guess how many jellybean...,1774,1777
4,6,If you ask someone to guess how many jellybean...,2118,2121
5,6,If you ask someone to guess how many jellybean...,2152,2155
6,6,If you ask someone to guess how many jellybean...,2185,2188
7,6,If you ask someone to guess how many jellybean...,2304,2307
8,6,If you ask someone to guess how many jellybean...,2394,2397
9,6,If you ask someone to guess how many jellybean...,2488,2491


In [8]:
'''Export'''
pre_processed_sub.to_csv('preprocessed-sub.csv', index=False)