In [33]:
# Import libs
import pandas as pd
from tqdm import tqdm
import math
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



In [2]:
# Read in csv for all universities
df = pd.read_csv('all_posts.csv')
df.shape

(2996, 7)

Social media tends to have a lot of memes/non-sensical discussion. For this project it would be valuable to filter out these types of posts and focus on posts that are more likely to have criticisms about the university.

In [33]:
# Want to see post flairs, and subsequently filter for non-shitposts
df['flair'].unique()

array(['Transfers', 'Discussion', 'Humour', 'Serious', 'News', 'Other',
       'Rant', 'Health', 'Clubs/Sports', 'Social', 'Event', 'Courses',
       'Waterloo #173', 'Academics', 'Advice', 'Confirmed', 'Meta', nan,
       'Shitpost', 'Lost & Found', 'Question', 'Politics', 'Life Advice',
       'Admissions', 'Finances', 'Programs', 'ACORN/Quercus/Outlook',
       'Free Speech', 'Waterloo #201–250', 'UTM/UTSC',
       "I'm in High School", 'Jobs', 'Photography & Art',
       'Pho(ur seasons)tography & Art', '@ SFU (Exception)', 'Megathread',
       'Congrats, you made it!', 'Photography &amp; Art',
       'Humour - Satire', 'SFU = Studying For UBC', 'Prose', '🍁',
       '100% super duper confirmed by the r/byssey', '🔥🔥🔥',
       'Ghost-type Humour', 'Unverified', 'Lost Dog', '🎉🎉🎉',
       'Read Comments Section for full context', 'Missing Person', 'F',
       'Spicy', 'HQ Post', 'Spicy Meme', 'We did it, reddit!',
       'HQ shitpost', 'Certified Dank', 'shitpost', 'Political',
       

In [34]:
# Most popular flairs
df['flair'].value_counts()[:10]

flair
Humour               961
Discussion           223
Other                159
Photography & Art    116
shitpost              73
News                  49
Academics             28
Advice                22
Courses               22
HQ Post               21
Name: count, dtype: int64

In [35]:
df = df[df['flair'].isin(['Discussion', 'News', 'Academics', 'Advice', 'Courses', 'HQ Post'])]
df.shape

(365, 7)

# Cleaning Data

In [None]:
# Init stopwords
stop_words = set(stopwords.words("english"))
stop_words.add('would')
stop_words.add('just')
stop_words.add('also')

def clean_text(df):
    '''
    This function does 3 things

    1. Combines text from title, description and comments
    2. Normalizes university names
    3. Tokenizes and lemmatizes text and removes stopwords + reddit specific words

    '''
    remove_words = ['removed', 'deleted', '[ removed by reddit ]', '[deleted]']
    # custom_stopwords = {'student', 'course', 'people', 'get', 'like', 'time', 'year', 'would'}
    # Combine text from title, description and comments
    def combine_text(text): 

        if isinstance(text, list):
            return ' '.join([word for word in text if isinstance(word, str) and word.lower() not in remove_words])
        elif isinstance(text, str): 
            # Filter out posts removed by reddit
            if re.search(r"\[?\s*removed by reddit.*?\]?", text, flags=re.IGNORECASE):
                return ''
            return text
        return ''

    
    
    # Normalize university names
    def normalize_university(text):
        # Regex to normalize uni names
        text = re.sub(r'u\sof\st', 'uoft', text)
        text = re.sub(r'university of toronto', 'uoft', text)
        text = re.sub(r'university of british columbia', 'ubc', text)
        return text

    # Tokenize + lematize + remove stopwords
    def tokenize_and_lemmatize(text): 

        # Tokenize
        tokens = word_tokenize(text)

        # Filter for strings
        words = [word for word in tokens if word.isalpha()]

        # Remove stopwords
        words = [word for word in words if word not in stop_words]
        
        # Lemmatize
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(word) for word in words]
    
    # Combine text
    df['combined_text'] = (df['title'].apply(combine_text).fillna('') + ' ' +
                       df['description'].apply(combine_text).fillna('') + ' ' +
                       df['comments'].apply(combine_text))

    # Lower + removing trailing charsa
    df['combined_text'] = df['combined_text'].str.lower().str.strip()
    
    # Normalize university names
    df['combined_text'] = df['combined_text'].apply(normalize_university)

    # Tokenize + lemmatize
    df['lemmatized_tokens'] = df['combined_text'].apply(tokenize_and_lemmatize)

    # Return cleaned df
    return df

df = clean_text(df)

df.head()

Unnamed: 0,date_created,title,description,comments,upvotes,flair,university,combined_text,lemmatized_tokens
0,2025-02-12 18:36:14,To the student who got caught using AI on thei...,Thanks a lot you dumb f*ck. Due to your idiocy...,"['[deleted]', 'It’s crazy that there’s people ...",2942,Transfers,UofT,to the student who got caught using ai on thei...,"[student, got, caught, using, ai, exam, uoft, ..."
1,2024-11-08 03:24:09,[ Removed by Reddit ],[ Removed by Reddit on account of violating th...,"['Ik this man, he’s getting cooked. Maybe uoft...",2939,Discussion,UofT,"['ik this man, he’s getting cooked. maybe uoft...","[man, getting, cooked, maybe, uoft, anything, ..."
2,2020-11-13 19:48:06,I want a ps5,,"['i LOVE how low effort this looks', 'pain', ...",2749,Humour,UofT,i want a ps5 ['i love how low effort this lo...,"[want, love, low, effort, look, u, made, day, ..."
3,2016-12-14 00:38:33,If this thread gets over 300 upvotes in the ne...,"srsly don't fuck this up for me, we're already...","['[deleted]', '[deleted]', ""Found this in /r/a...",2485,Serious,UofT,if this thread gets over 300 upvotes in the ne...,"[thread, get, upvotes, next, change, sub, exam..."
4,2025-05-14 00:11:02,University of Toronto Faculty Association vote...,,"['Big W, honestly im surprised', 'W faculty', ...",2163,News,UofT,uoft faculty association votes to divest from ...,"[uoft, faculty, association, vote, divest, isr..."


## Investigating top k words

In [8]:
from collections import Counter

def get_topk_words(df, k):
    '''
    Takes a df and returns top k words from tokens
    '''
    all_tokens = [token for row in df['lemmatized_tokens'] for token in row]

    # Count frequencies
    token_counts = Counter(all_tokens)

    # Get top K words
    top_k = token_counts.most_common(k)

    return top_k

# Get top 10 most common words across all posts
print(f'---All Posts---\n{get_topk_words(df, 10)}')

# Get top 10 most common words across flairs
for flair in df['flair'].unique():
    print(f'---{flair}---\n{get_topk_words(df[df['flair'] == flair], 10)}')


---All Posts---
[('people', 15312), ('like', 14454), ('student', 13110), ('get', 11587), ('year', 9617), ('one', 9602), ('time', 8748), ('think', 8472), ('know', 7732), ('even', 7111)]
---Transfers---
[('ai', 60), ('college', 52), ('lawyer', 34), ('use', 32), ('law', 30), ('school', 28), ('university', 28), ('student', 26), ('using', 21), ('people', 21)]
---Discussion---
[('people', 3817), ('like', 3058), ('student', 2560), ('get', 2262), ('one', 1875), ('year', 1851), ('think', 1815), ('time', 1560), ('know', 1559), ('even', 1550)]
---Humour---
[('like', 2250), ('get', 1882), ('year', 1841), ('people', 1606), ('one', 1567), ('student', 1558), ('course', 1456), ('time', 1429), ('class', 1133), ('deleted', 1090)]
---Serious---
[('http', 21), ('deleted', 16), ('get', 12), ('goose', 11), ('meme', 10), ('u', 10), ('post', 9), ('hour', 9), ('uoft', 9), ('upvotes', 8)]
---News---
[('people', 699), ('student', 545), ('like', 378), ('get', 358), ('one', 328), ('think', 313), ('uoft', 269), ('k

It seems all posts feature words that are probably common throughout the entire subreddit. Need to figure out a way to explore specific topics people are talking about..

Mutual Information (MI). If we treat our flair as a rough topic label, we can use MI to tell us information that is shared in the non-linear relationships between tokens and flairs.

In [48]:
# 500 most common words
top_500 = get_topk_words(df, 500)

# Create matrix where columns have bool indicating whether specific post has a word 
words_df = pd.DataFrame(df['flair'])

for word, _ in top_500:
    words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))

  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))
  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))
  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))
  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))
  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))
  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))
  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))
  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))
  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))
  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))
  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: int(word in tokens))
  words_df[word] = df['lemmatized_tokens'].apply(lambda tokens: i

In [40]:
from sklearn.metrics import mutual_info_score

def getMI(topk, df, label_column = 'flair'):

    # Get flair names
    unique_labels = df[label_column].unique()

    # Placeholder df
    overalldf = pd.DataFrame()

    for flair in unique_labels:
        miscore = []
        label = df[label_column].copy()


        label[label != flair] = 0
        label[label == flair] = 1

        # Get MI for topk words in flair
        for word in topk:
            miscore.append([word[0]] + [mutual_info_score(label, df[word[0]])] + [flair])

        # Combine scores of all words for flair into df
        miscoredf = pd.DataFrame(miscore).sort_values(1, ascending = False)
        miscoredf.columns = ['word', 'mi', 'flair']
        overalldf = pd.concat([overalldf, miscoredf])

    return overalldf

In [49]:
mi_scores = getMI(top_500, words_df)
mi_scores.set_index(['word']).groupby('flair')['mi'].nlargest(10).reset_index()

Unnamed: 0,flair,word,mi
0,100% super duper confirmed by the r/byssey,vaccine,0.001143
1,100% super duper confirmed by the r/byssey,french,0.001017
2,100% super duper confirmed by the r/byssey,staff,0.000835
3,100% super duper confirmed by the r/byssey,f,0.000806
4,100% super duper confirmed by the r/byssey,apply,0.000770
...,...,...,...
745,🔥🔥🔥,member,0.000820
746,🔥🔥🔥,teaching,0.000819
747,🔥🔥🔥,society,0.000815
748,🔥🔥🔥,company,0.000811


# Sentiment Analysis

In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer

def evalsentences(sentences, to_df = False, columns = []):
    sid = SentimentIntensityAnalyzer()
    pdlist = []

    if to_df:
        for sentence in sentences:
            ss = sid.polarity_scores(sentence)
            pdlist.append([sentence] + [ss['compound']])
        df = pd.DataFrame(pdlist)
        df.columns = columns
        return df
    
    else:
        for sentence in sentences:
            print('\n' + sentence)
            ss = sid.polarity_scores(sentence)
            for k in sorted(ss):
                print('{0}: {1}, '.format(k, ss[k], end = ''))
            print()

In [12]:
comments = df['combined_text']
vader_df = evalsentences(comments, to_df = True, columns = ['comments', 'vader'])

In [13]:
# See what comments are rated negative from VADER
pd.set_option('display.max_colwidth', 1000)
vader_df[vader_df['vader'] < 0]

Unnamed: 0,comments,vader
1,"['ik this man, he’s getting cooked. maybe uoft won’t do anything but it’s over for him and his social life esp w his gf\xa0', ""he apparently deleted his twitter account too, it's geniuenly over for bro"", 'i think the worst part of this has to be the fact that he peaked top 500 na in valorant.', ""'christ is king' jesus would def not approve of what you're saying buddy 😭"", 'pretty deranged of him to say that you and he probably got that from nick fuentes who is a self described nazi. report him for sure.', 'saw this posted on twitter and a bunch of girls came out of the woodwork w their own screenshots too', 'has anyone ever been suspended from uoft for something like this?', '[deleted]', 'as a montreal canadiens fan, we do not claim him. he’s on his own smh. \n\nwhat an ass.', 'some people really don’t know how to be a normal human being', 'average utsg cs experience', 'anyone got his linkedin ? 😂', ""commenting to let you know that i'm not even a uoft student, and the algorithm just...",-0.9978
4,"uoft faculty association votes to divest from israel ['big w, honestly im surprised', 'w faculty', 'honestly amazing to hear!', 'proud of my alma mater!', 'finally', 'noone in the planet will be free until palestine is free.\n\nacademics (people with great and undeniable intelligence) understand that.\n\n👏 👏 👏', ""finally. now let's see them actually do it"", 'it took them long enough bro', 'about time', 'well done, uoft! i’m really surprised, to be honest. but good on you!', 'i think both sides of the conflict are assholes for asking anyone to advocate for their beliefs that lead to murder. \n\nbut equally appalling is giving either side $ to promote their cultures.', 'this is misleading. they voted to recommend to the board that they vote to divest. plus it was barely over 50% of a vote that only like 4% of faculty engaged is. this affects and means nothing', 'wait we’re not funding genocide. ??', 'absurd how a canadian university has to take a vote in order to stop funding a fore...",-0.9594
5,"uoft academic offence cases are literally so entertaining ['""the panel does not accept this explanation as it defies common sense and logic""\n\nmeme material', 'damn, which case is this i want to read it', 'imagine being expelled for cheating on a test worth 16%...', 'something more interesting about that case is the fact that when he went to retrieve the devices later that day, he brought someone (apparently his roommate) with him. the professor ended up returning the devices because she felt scared for her safety as she was alone at night.', 'they are soo entertaining i love reading them cause sometimes it’s shocking how much effort and money people would put into cheating instead of actually studying. i think the worst case i’ve read was probably the one where a student hired someone to personate him and then he assaulted the ta or something. also the one where a ta was paid over 1k to help students cheat. and the one where a student hacked and changed his grades 😭', ""after th...",-0.9184
13,"just graduated at convocation with encampment present just graduated and guess what, the encampment had little to no effect on the ceremony. cope mf’s who think the encampment was some huge negative for convocation. students were able to go on stage with keffiyeh and some went up with the palestinian flag and some with banners in support of the movement, and those students with the flag or banners probably got the loudest applause. overall great experience and no interruptions. \n\ncongrats to all my fellow grads!!! [""that's so good to hear! i'm a huge supporter of the encampment, but of course wouldn't want anyone's graduation to get severely impacted. i'm glad we can do both!"", 'echo chamber convocation.', '[deleted]', 'congratulations on graduating. now go be productive', ""a shame that there weren't lots of israeli flags represented on stage"", '[removed]', '[deleted]', 'gross', '[removed]', 'i’m so happy i declined my offer to that school 💀', 'technically the cultural and flag ...",-0.9983
15,"is this area safe? i’m new to earth. (01001101 01100001 01100100 01100101 01010101 01001100 01101111 01101111 01101011) hi all, i hope uoft has been a delightful experience so far. as a new applicant, i was curious as to whether it would be safe to reside in the area highlighted above. while i have already selected my area of study (yiddish lang) and my preferred place of residence (benches in rotman’s), over the last few weeks i’ve heard some disturbing things about this area. is it safe? should i still apply? ['not safe for your wallet lol', 'you’ll spontaneously combust the moment you enter circle', 'actually, the vast majority of it is not safe unless you have the technical knowledge of surviving in dense wilderness.', ""you'll get mauled by a rogue species known as the timbit. beware of his gaze, take a bite out of him if he comes near you. but if it's the sprinkled variant, just run, he tastes like shit anyways."", 'not at all. we have polar bears wandering around. no fence bet...",-0.9680
...,...,...
2977,"the auto mods on this sub are trash why the f*ck can’t i ask any question without being told it needs to go in a stupid sub thread that no one even looks at or replies to. like one keyword and it gets modded out. garbage ['fuck them mods', 'revolution ✊', 'preach', '[removed]', 'for real, no one cares if the post is not in the thread, plus it’s easier for people to find in the future', 'pov: [removed]', 'yeah fr', '😭😭😭😭midterms and my deferred final got me fucked up im so sorry']",-0.4019
2980,"saw this on twitter, thought it was funny ["" if you're a first or second year (u0 or u1) a sizeable amount of your classes will probably be online. the whole thing about going into the bathroom to socialize is straight up nonsensical though."", 'i mean yeah most of my classes are online, but i can still socialize outdoors without a mask and we can still hear eachother with masks on lmaooooo \n\n\nor sitting down in gerts, where you dont need a mask', 'do people realise they can socialise with masks on?', ""where's the joke?"", 'umm please wear your mask in bathrooms??? that’s gross why you tryna smell some shit and pee???? just go outside lmao', 'there are also things called cafeterias...', 'not from mcgill. can someone explain how courses are still virtual but students share the same bathroom? do you like.. have virtual teachers while you are physically in the classroom?', '[deleted]', 'my daughter’s classes are all online. especially annoying since i seem to recall mcgill announ...",-0.9944
2990,"new mcgill merch idea s/uze.\n\n\nin big letters. on a hoodie. ['4am quarantine thoughts', 'this. is. genius', ""you've heard of big suze. now get ready for: men's xl suze"", 'r/mcgill merch? u/catanoverlord', 'or on a mask!', 'i...want this so bad', '/u/arweavethis', '[deleted]', '**saved to the permaweb! [https://arweave.net/tw2ci_ncgpwtjhd-ioazrkeilrxr3anfnv5gbufr9cs](https://arweave.net/tw2ci_ncgpwtjhd-ioazrkeilrxr3anfnv5gbufr9cs)**\n \n*arweavethis is a bot that permanently stores posts and comment threads on an immutable ledger, combating censorship and [the memory hole](https://en.wikipedia.org/wiki/memory_hole).*', ""i'd buy it."", 'i feel attacked rn']",-0.2695
2992,"on sphr and the impeachment ​i attended last week’s general assembly, whose attendance was visibly segregated by a majority of sphr supporters and a minority of non-affiliated people. watching the majority of members talk and ask staged questions provided really good insight about sphr, what they want, and their modus operandi.\n\nit is clear, from what i saw at the assembly, that sphr is angered at ssmu for not holding a referendum to vote on a student-wide strike in support of palestine which would have occurred in november - a result of poor organization and the lack of understanding of student government. in retaliation, they are trying to impeach the ssmu president as a scapegoat.\n\nsphr aren't the only ones to blame for the lack of action in support of palestine - the ssmu president does have some responsibility for this, but has apologized during the meeting. however, as a result of the [ongoing provincial injunction](https://ssmu.ca/blog/2024/08/statement-regarding-legal-u...",-0.9991


In [15]:
# See what comments are rated negative from VADER
pd.set_option('display.max_colwidth', 1000)
vader_df[vader_df['vader'] > 0]

Unnamed: 0,comments,vader
0,"to the student who got caught using ai on their exam at uoft law thanks a lot you dumb f*ck. due to your idiocy and dumbassery my college (not in ontario) is now cracking down on access to our hard drives during exams, so now we have to print everything which will be a lot of time and money i don’t have. if you’re too incompetent to write a law school exam and require ai then you shouldn’t even be in law. thanks for ruining it for every other college across the country. ['[deleted]', 'it’s crazy that there’s people mad at you for venting.', 'gaining access to your hard drives feels like a huge violation of privacy? i’m surprised that’s even allowed.', '?', '[removed]', 'what canadian says ""college?""', 'sighs, sounds exhausting. best of luck', 'oof. i only use ai for helping me understand concepts (and actually backing up what it says with sources). idk why so many ppl use it to write their papers for them. it’s not that hard to write your own ideas!', 'thanks to the idiot who got c...",0.9993
2,"i want a ps5 ['i love how low effort this looks', 'pain', 'loool u made my day..', 'those boys and scalpers really took those things quick. shoppers lied to me and abandoned me while they stole every playstayion', 'lmfao 😭\n\ni hate this school. i can’t wait to graduate.', 'i managed to order one. coming next monday ;)', 'i live in problem set 5 but the ""et 5"" is silent...', 'this is a quality post', 'nice', 'yoooo this was hilarious. had my eyes wide open and i was cackling', 'this is too good', 'same, but no my mom wants to get “the family” a nintendo switch 😭😭', '没有ps5:(', 'same 😩', 'i don’t get it', 'did you get one?', '[you should find this ad on facebook marketplace. ](https://imgur.com/gallery/gkv44cr)', 'terrible', 'ummmm... so this is what uoft students do on their free time?', 'buy a computer for the same price and run the same games 10 times as fast wow am a genius who isn’t falling for sony’s shitty ass marketing because now you can play the same games better quality ...",0.9980
3,"if this thread gets over 300 upvotes in the next 24h i will change the sub to r/idealcatering until exams are over srsly don't fuck this up for me, we're already working on the css don't want this shit to go to waste\n\nedit: holy mother of god we're working on asap\n\nedit 2: yo r/uwaterloo [i guess we just won the meme war](https://www.reddit.com/r/uoft/comments/5i7f1g/if_this_thread_gets_over_300_upvotes_in_the_next/db6b2rg/)\n\nedit 3: ...and now i just got gilded what is this life, should have the update applied sometime tommorow\n\nedit 4: [it's been done](https://www.reddit.com/r/uoft/comments/5ic84a/welcome_to_ridealcatering/) ['[deleted]', '[deleted]', ""found this in /r/all/rising, i'll give my updoot."", ""/r/uwaterloo's top post in their sub's history got 670 upvotes. at the time of this writing, this post has gotten 757 upvotes in 4 hours...\n\nnot sure if i should feel proud about this."", '**any**', '[deleted]', "" /r/all here what's ideal catering"", ""the [ass goose](http...",0.9986
6,"i was playing chess with a girl, i ended up telling her she was so mateable as a cs major i have a hobby of chess, i finally found a girl who’ll play chess with me. she plays but clearly not as much as me, won like 5 times in a row. ended saying loudly ”why you so mateable!” trying to trash talk her. she laughed then looked at me weird and then people beside me looked at me weird. 2 hrs later i finally realized. i am making sure i won‘t be mating anytime irl. [""whenever stories like these pop up, the people in them insist on making it known that they're cs majors for some strange reason. \n\nlike, chill. we can already deduce this from the details provided."", 'ts can’t be fr vro 💔🥀', 'yikes', 'finally, cs student not being straight men kissing each other', '“why you so matable!” *stares for 3 seconds “check … mate, of course”', ""you should tell her she's breedable next time #unspokenrizz #bahenbro"", 'computer students being computer students :', 'you had 1 word to put in your sent...",0.9990
7,"lady drives into cement in front of st george station 😭 ['thank you for posting concrete evidence of her actions', 'i always wonder how these dumbasses end up getting past road closed signs and blockings', 'workers taking pics … 😂', 'dang a new car too, that license plate number is from the last few months with the dcxx 😭', 'what in the negative iq', 'this is real 😭😭😭?? oh my god how did she live this long!', 'rotman spotted in the wild', ""that's conk creat babey"", 'curious to know what happened after. did she try to reverse out? did the concrete dry? how did she get herself out?\xa0\n\n\nneed details.', 'people drive into streetcar tunnels sometimes lmfao', 'she must’ve taken that sign for granite.', '99% she gets angry instead admitting her fault', 'oopsie', 'did she stop because she realized there’s actual concrete or because her car wouldn’t move anymore?', 'apple maps user', 'i was the driver in the cement truck, im the guy in the yellow. she drove into it a couple minutes af...",0.9985
...,...,...
2988,"the library is pretty awesome my mind was blown today when i was browsing at mclennan basement and came across a shelf with the *original volumes* of encyclopedia britannica from 1879, just sitting there. how much undiscovered treasure is hidden in plain sight on those dusty shelves?\n\nthat's all. ['as a history major, i can confirm that the library is a great place. especially when i checked out a book that hadn’t been checked out since 1987.', ""i'm currently reading a book about all the horrors that happened at mcgill during cold war time, written by a montrealler. i sometimes go to buildings where those things happened, sit there and read the chapters. paints a totally different picture.\n\nnot related to library but i overshare. what can i do."", ""agreed!! i love that they have a wide genre of books outside of academic works such as classics, children's books, ya fiction and contemporary literature."", '[removed]', ""the expanse of old books is truly amazing, as a history student...",0.9969
2989,"do we need to go to classroom when snows? or move classroom from a building near the mountain to a building near the city? ['""tell me you\'re not canadian without telling me you\'re not canadian""', 'the only time class is cancelled is when you see a polar bear on sherbrooke.', 'this made me laugh quite a bit, thanks for that', ""idk if this is satire or a poor soul who hasn't seen snow before, either way very amusing. to answer your question though, no, it snows way often for that. you're expected to go to classes when it snows the same way you're expected to go when it's raining."", 'i’m dead. this poor international student omg', 'the education building used to have a rope that you could use to climb up the hill if the sidewalk was unwalkable.', ""welcome to the great white north. you'll get used to the snow quick enough."", 'technically you dont need to go to school at all lol', ""you should get them snow boots with the grips integrated cause you gna be walking up the snowy mountains...",0.5552
2991,"is it just me or grads and undergrads really disconnected? why are grads and undergrads so disconnected? why do i need to be a ta to be connected with undergrads?\n\nthe thing is, grads have a lot to offer in terms of experience to the undergrad, an experience which does not have to be confined to the lab, and vice versa. i have personally learned a lot since i joined my masters program. report writing, expectations, stress management, deliverables, work quality, and so many things other things that are not necessarily technical in nature, but are just important traits to have.\n\nit could be a very casual relationship, where the undergrad and grad complain to each other about problems, potentially offer solutions, or maybe just listen. i am not saying strictly technical problems and homework. it could be anything. it can also help the undergrad get a glimpse on gradlife. in turn, the grad student would feel fulfilled to have talked to someone outside their extremely small circle.\...",0.9994
2994,"person who was vomiting at 9am exam in fieldhouse: are you ok? that was intense. ['that was crazy!! sounded like he was dying and he had a full on brown paper bag. but also unsure why the invigilators let him sit there for so long', ""in science, if you vomit at the exam you still get the grade...\nit's the shittiest thing."", ""yikes! he should have been removed from his seat much more quickly than he was for his own sake and everyone else's. it could be that the invigilators helping him were new and didn't know what to do right away. wishing him well!"", 'fuck! i was in the first rows and could still hear him very loudly. i hope he’s okay!!', ""how do you even get a doctors note if you don't have a doctor? i feel like people who are sick and can hardly stand and don't have doctors are super fucked and there's nothing they can do about it."", 'a few years ago, final exams took place at the scotia bank cinema and a student puked on the carpet of one of the rooms. that was something.', 'd...",0.9750


In [61]:
combined_df = pd.concat([vader_df, words_df], axis=1)
combined_df['ground_truth'] = combined_df['vader'].apply(lambda x: 'positive' if x > 0 else 'negative')
cols = combined_df.columns.to_list()
cols.insert(1, cols.pop(cols.index('ground_truth')))
combined_df = combined_df[cols]
combined_df.head()

Unnamed: 0,comments,ground_truth,vader,flair,people,like,student,get,year,one,...,news,seriously,tried,teacher,war,bring,happens,towards,office,easier
0,"to the student who got caught using ai on their exam at uoft law thanks a lot you dumb f*ck. due to your idiocy and dumbassery my college (not in ontario) is now cracking down on access to our hard drives during exams, so now we have to print everything which will be a lot of time and money i don’t have. if you’re too incompetent to write a law school exam and require ai then you shouldn’t even be in law. thanks for ruining it for every other college across the country. ['[deleted]', 'it’s crazy that there’s people mad at you for venting.', 'gaining access to your hard drives feels like a huge violation of privacy? i’m surprised that’s even allowed.', '?', '[removed]', 'what canadian says ""college?""', 'sighs, sounds exhausting. best of luck', 'oof. i only use ai for helping me understand concepts (and actually backing up what it says with sources). idk why so many ppl use it to write their papers for them. it’s not that hard to write your own ideas!', 'thanks to the idiot who got c...",positive,0.9993,Transfers,1,1,1,1,1,1,...,0,0,0,0,0,1,1,0,1,0
1,"['ik this man, he’s getting cooked. maybe uoft won’t do anything but it’s over for him and his social life esp w his gf\xa0', ""he apparently deleted his twitter account too, it's geniuenly over for bro"", 'i think the worst part of this has to be the fact that he peaked top 500 na in valorant.', ""'christ is king' jesus would def not approve of what you're saying buddy 😭"", 'pretty deranged of him to say that you and he probably got that from nick fuentes who is a self described nazi. report him for sure.', 'saw this posted on twitter and a bunch of girls came out of the woodwork w their own screenshots too', 'has anyone ever been suspended from uoft for something like this?', '[deleted]', 'as a montreal canadiens fan, we do not claim him. he’s on his own smh. \n\nwhat an ass.', 'some people really don’t know how to be a normal human being', 'average utsg cs experience', 'anyone got his linkedin ? 😂', ""commenting to let you know that i'm not even a uoft student, and the algorithm just...",negative,-0.9978,Discussion,1,1,1,1,1,1,...,1,1,1,1,1,1,0,0,1,0
2,"i want a ps5 ['i love how low effort this looks', 'pain', 'loool u made my day..', 'those boys and scalpers really took those things quick. shoppers lied to me and abandoned me while they stole every playstayion', 'lmfao 😭\n\ni hate this school. i can’t wait to graduate.', 'i managed to order one. coming next monday ;)', 'i live in problem set 5 but the ""et 5"" is silent...', 'this is a quality post', 'nice', 'yoooo this was hilarious. had my eyes wide open and i was cackling', 'this is too good', 'same, but no my mom wants to get “the family” a nintendo switch 😭😭', '没有ps5:(', 'same 😩', 'i don’t get it', 'did you get one?', '[you should find this ad on facebook marketplace. ](https://imgur.com/gallery/gkv44cr)', 'terrible', 'ummmm... so this is what uoft students do on their free time?', 'buy a computer for the same price and run the same games 10 times as fast wow am a genius who isn’t falling for sony’s shitty ass marketing because now you can play the same games better quality ...",positive,0.998,Humour,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,"if this thread gets over 300 upvotes in the next 24h i will change the sub to r/idealcatering until exams are over srsly don't fuck this up for me, we're already working on the css don't want this shit to go to waste\n\nedit: holy mother of god we're working on asap\n\nedit 2: yo r/uwaterloo [i guess we just won the meme war](https://www.reddit.com/r/uoft/comments/5i7f1g/if_this_thread_gets_over_300_upvotes_in_the_next/db6b2rg/)\n\nedit 3: ...and now i just got gilded what is this life, should have the update applied sometime tommorow\n\nedit 4: [it's been done](https://www.reddit.com/r/uoft/comments/5ic84a/welcome_to_ridealcatering/) ['[deleted]', '[deleted]', ""found this in /r/all/rising, i'll give my updoot."", ""/r/uwaterloo's top post in their sub's history got 670 upvotes. at the time of this writing, this post has gotten 757 upvotes in 4 hours...\n\nnot sure if i should feel proud about this."", '**any**', '[deleted]', "" /r/all here what's ideal catering"", ""the [ass goose](http...",positive,0.9986,Serious,0,1,1,1,1,1,...,0,0,0,0,1,0,0,0,0,0
4,"uoft faculty association votes to divest from israel ['big w, honestly im surprised', 'w faculty', 'honestly amazing to hear!', 'proud of my alma mater!', 'finally', 'noone in the planet will be free until palestine is free.\n\nacademics (people with great and undeniable intelligence) understand that.\n\n👏 👏 👏', ""finally. now let's see them actually do it"", 'it took them long enough bro', 'about time', 'well done, uoft! i’m really surprised, to be honest. but good on you!', 'i think both sides of the conflict are assholes for asking anyone to advocate for their beliefs that lead to murder. \n\nbut equally appalling is giving either side $ to promote their cultures.', 'this is misleading. they voted to recommend to the board that they vote to divest. plus it was barely over 50% of a vote that only like 4% of faculty engaged is. this affects and means nothing', 'wait we’re not funding genocide. ??', 'absurd how a canadian university has to take a vote in order to stop funding a fore...",negative,-0.9594,News,1,1,1,1,0,0,...,1,1,0,0,1,0,0,0,0,0


In [62]:
def pmiForAllCal(df, topk_word, gt_sentiment, label_column='ground_truth'):
    #Try calculate all the pmi for top k and store them into one pmidf dataframe

    index = [x[0] for x in topk_word]
    pmiDf = pd.DataFrame(index=index, columns=['pmi'])

    for (word, count) in tqdm(topk_word):
        pmiDf.at[word, 'pmi'] = pmiCalc(df,word,gt_sentiment,label_column)

    return pmiDf


def pmiCalc(df, word, gt_sentiment, label_column='ground_truth'):

    N = df.shape[0]

    px = sum(df[label_column]==gt_sentiment)
    py = sum(df[word]==True)
    pxy = len(df[(df[label_column]==gt_sentiment) & (df[word]==True)])

    if pxy==0 and (px != 0 and py != 0):#Log 0 cannot happen
        pmi = math.log((pxy+0.0001)*N/(px*py))
    elif px == 0 or py == 0 or pxy == 0:
        pmi = math.log(0.0001)
    else:
        pmi = math.log(pxy*N/(px*py))
    return pmi

In [63]:
pmiposdf = pmiForAllCal(combined_df,get_topk_words(df, 500),'negative')
pmiposdf.sort_values('pmi',ascending=0).head(10)

100%|██████████| 500/500 [00:00<00:00, 1177.19it/s]


Unnamed: 0,pmi
israel,1.098874
indigenous,0.867755
police,0.843353
racist,0.779285
protest,0.724935
war,0.630714
action,0.59495
black,0.482641
child,0.473261
white,0.462087
