In [1]:
import os
import pandas as pd
import time
import praw
from dotenv import load_dotenv
import re

load_dotenv()

True

In [2]:
# Build the Reddit API client. The id/secret are looked up in the process
# environment (load_dotenv() was called in the import cell) instead of being
# hardcoded in the notebook.
reddit = praw.Reddit(
    client_id=os.environ.get('client_id'),
    client_secret=os.environ.get('client_secret'),
    user_agent='genalphaslang',
)

In [3]:
# Handle for the 'mcgill' subreddit; the scraping loop iterates over its top posts.
mcgill_subreddit = reddit.subreddit('mcgill')

In [4]:
# Collect one plain-dict record per top post of the subreddit.
all_submissions = []
for submission in mcgill_subreddit.top(limit=200):
    # Post-level fields.
    submissions_dict = {
        'date_created': submission.created_utc,
        'title': submission.title,
        'description': submission.selftext,
    }

    # Resolve the comment forest before flattening it: with limit=0 PRAW
    # discards unexpanded "more comments" stubs instead of fetching them,
    # so every item returned by .list() has a .body.
    submission.comments.replace_more(limit=0)
    submissions_dict['comments'] = [
        comment.body for comment in submission.comments.list()
    ]
    submissions_dict['upvotes'] = submission.score

    all_submissions.append(submissions_dict)


In [5]:
# One row per scraped submission; columns come from the keys of each record dict.
df = pd.DataFrame(all_submissions)

In [6]:
# Show the scraped frame as the cell output.
df

Unnamed: 0,date_created,title,description,comments,upvotes
0,1.668445e+09,I drew McGill! [OC],,"[Sick work bruh, I like your immortalization o...",1082
1,1.702496e+09,I've Cheated on Every Evaluation so far at McGill,I cheat on every eval and exam I take at McGil...,"[What's your chess.com Elo though?, https://pr...",999
2,1.607009e+09,Just a friendly reminder that we will not ente...,,"[Apologies for any PTSD this might induce!, I ...",973
3,1.618612e+09,Some Words of Encouragement for Finals!,,"[well this was unexpected, Does this come with...",950
4,1.603921e+09,A quick recap of the semester so far ft You gu...,,[finally I have an answer if anyone asks me wh...,823
...,...,...,...,...,...
195,1.598462e+09,my friends are not respecting the quarantine act,Might be an unpopular opinion on this sub but ...,[As someone who actually just finished my quar...,330
196,1.600389e+09,Every class this semester 🤦‍♂️,,[the amount of readings required and the hours...,327
197,1.596327e+09,The Advanced McGillian,,[But what are the best places to have sex on c...,329
198,1.586194e+09,yeet.,,"[F, When the corona gives you the S/U option.,...",326


In [7]:
# Save the scraped posts to disk without the index column.
# NOTE(review): the 'comments' column holds Python lists; CSV will presumably
# store them as their string form, so a reloaded file would hand combine_text()
# a str (returned unchanged) rather than a list — confirm before reusing the CSV.
df.to_csv("mcgill.csv", index=False)

In [8]:
def combine_text(text):
    """Flatten a comments value into a single string.

    Parameters
    ----------
    text : list | str | object
        Usually the list of comment bodies collected for one post.

    Returns
    -------
    str
        * list input: the string items joined with single spaces, skipping
          non-string items and any item containing a Reddit placeholder
          (``[deleted]`` for user-deleted or ``[removed]`` for
          moderator-removed comments; matched case-insensitively).
        * str input: returned unchanged.
        * anything else (``None``, NaN, numbers, ...): the empty string.
    """
    # Placeholder bodies carry no real content and would otherwise inject
    # spurious "deleted"/"removed" tokens into the combined text.
    placeholders = ('[deleted]', '[removed]')

    if isinstance(text, list):
        kept = []
        for item in text:
            if not isinstance(item, str):
                continue
            lowered = item.lower()
            if any(marker in lowered for marker in placeholders):
                continue
            kept.append(item)
        return ' '.join(kept)

    if isinstance(text, str):
        return text

    return ''

In [9]:
# Merge title, post body and the flattened comments into one
# space-separated string per post (missing title/body count as '').
df['combined_text'] = df['title'].fillna('').str.cat(
    [df['description'].fillna(''), df['comments'].apply(combine_text)],
    sep=' ',
)

In [10]:
# Normalise case and trim surrounding whitespace; the stop-word filter applied
# later compares tokens against the stop-word set exactly, so text must be lowercase.
df['combined_text'] = df['combined_text'].str.lower().str.strip()

In [11]:
# Natural Language Processing packages
import re
import nltk

# Download the NLTK resources, in the same order as before.
# nltk.download("vader_lexicon")
for _nltk_resource in (
    "stopwords",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "wordnet",
    "punkt",
    "punkt_tab",
):
    nltk.download(_nltk_resource)

from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Set (not list) so the per-token membership test during filtering is a hash lookup.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gracelin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/gracelin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/gracelin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gracelin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/gracelin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/gracelin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [12]:
def tokenize_and_lemmatize(text):
    """Return the lemmatized content words of ``text``.

    The text is split with ``word_tokenize``; tokens that are not purely
    alphabetic or that appear in the module-level ``stop_words`` set are
    dropped, and each survivor is passed to ``WordNetLemmatizer.lemmatize``
    (called without a part-of-speech argument).

    The stop-word test is an exact membership check, so callers are expected
    to pass already-lowercased text.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    list[str]
        Lemmas in their original token order.
    """
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token.isalpha() and token not in stop_words
    ]

In [13]:
# Tokenize and lemmatize each post's combined text, then preview the first rows.
df['lemmatized_tokens'] = df['combined_text'].apply(tokenize_and_lemmatize)
df.head()

Unnamed: 0,date_created,title,description,comments,upvotes,combined_text,lemmatized_tokens
0,1668445000.0,I drew McGill! [OC],,"[Sick work bruh, I like your immortalization o...",1082,"i drew mcgill! [oc] sick work bruh, i like yo...","[drew, mcgill, oc, sick, work, bruh, like, imm..."
1,1702496000.0,I've Cheated on Every Evaluation so far at McGill,I cheat on every eval and exam I take at McGil...,"[What's your chess.com Elo though?, https://pr...",999,i've cheated on every evaluation so far at mcg...,"[cheated, every, evaluation, far, mcgill, chea..."
2,1607009000.0,Just a friendly reminder that we will not ente...,,"[Apologies for any PTSD this might induce!, I ...",973,just a friendly reminder that we will not ente...,"[friendly, reminder, enter, yr, apology, ptsd,..."
3,1618612000.0,Some Words of Encouragement for Finals!,,"[well this was unexpected, Does this come with...",950,some words of encouragement for finals! well ...,"[word, encouragement, final, well, unexpected,..."
4,1603921000.0,A quick recap of the semester so far ft You gu...,,[finally I have an answer if anyone asks me wh...,823,a quick recap of the semester so far ft you gu...,"[quick, recap, semester, far, ft, guy, finally..."
