In [None]:
# Import libs
import pandas as pd

In [90]:
# Read in csv for all universities
df = pd.read_csv('all_posts.csv')
df.shape

(2996, 6)

Social media tends to have a lot of memes/non-sensical discussion. For this project it would be valuable to filter out these types of posts and focus on posts that are more likely to have criticisms about the university.

In [91]:
# Want to see post flairs, and subsequently filter for non-shitposts
df['flair'].unique()

array(['Transfers', 'Discussion', 'Humour', 'Serious', 'News', 'Other',
       'Rant', 'Health', 'Clubs/Sports', 'Social', 'Event', 'Courses',
       'Waterloo #173', 'Academics', 'Advice', 'Confirmed', 'Meta', nan,
       'Shitpost', 'Lost & Found', 'Question', 'Politics', 'Life Advice',
       'Admissions', 'Finances', 'Programs', 'ACORN/Quercus/Outlook',
       'Free Speech', 'Waterloo #201–250', 'UTM/UTSC',
       "I'm in High School", 'Jobs', 'Photography & Art',
       'Pho(ur seasons)tography & Art', '@ SFU (Exception)', 'Megathread',
       'Congrats, you made it!', 'Photography &amp; Art',
       'Humour - Satire', 'SFU = Studying For UBC', 'Prose', '🍁',
       '100% super duper confirmed by the r/byssey', '🔥🔥🔥',
       'Ghost-type Humour', 'Unverified', 'Lost Dog', '🎉🎉🎉',
       'Read Comments Section for full context', 'Missing Person', 'F',
       'Spicy', 'HQ Post', 'Spicy Meme', 'We did it, reddit!',
       'HQ shitpost', 'Certified Dank', 'shitpost', 'Political',
       

In [92]:
# Most popular flairs
df['flair'].value_counts()[:10]

flair
Humour               962
Discussion           223
Other                159
Photography & Art    116
shitpost              73
News                  49
Academics             28
Advice                22
Courses               22
HQ Post               21
Name: count, dtype: int64

In [93]:
df = df[df['flair'].isin(['Discussion', 'News', 'Academics', 'Advice', 'Courses', 'HQ Post'])]
df.shape

(365, 6)

# Cleaning Data

In [112]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Init stopwords
stop_words = set(stopwords.words("english"))

def clean_text(df):
    '''
    This function does 3 things

    1. Combines text from title, description and comments
    2. Normalizes university names
    3. Tokenizes and lemmatizes text and removes stopwords + reddit specific words

    '''
    remove_words = ['removed', 'deleted', '[ removed by reddit ]', '[deleted]']
    # custom_stopwords = {'student', 'course', 'people', 'get', 'like', 'time', 'year', 'would'}
    # Combine text from title, description and comments
    def combine_text(text): 

        if isinstance(text, list):
            return ' '.join([word for word in text if isinstance(word, str) and word.lower() not in remove_words])
        elif isinstance(text, str): 
            # Filter out posts removed by reddit
            if re.search(r"\[?\s*removed by reddit.*?\]?", text, flags=re.IGNORECASE):
                return ''
            return text
        return ''

    
    
    # Normalize university names
    def normalize_university(text):
        # Regex to normalize uni names
        text = re.sub(r'u\sof\st', 'uoft', text)
        text = re.sub(r'university of toronto', 'uoft', text)
        text = re.sub(r'university of british columbia', 'ubc', text)
        return text

    # Tokenize + lematize + remove stopwords
    def tokenize_and_lemmatize(text): 

        # Tokenize
        tokens = word_tokenize(text)

        # Filter for strings
        words = [word for word in tokens if word.isalpha()]

        # Remove stopwords
        words = [word for word in words if word not in stop_words]
        
        # Lemmatize
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(word) for word in words]
    
    # Combine text
    df['combined_text'] = (df['title'].apply(combine_text).fillna('') + ' ' +
                       df['description'].apply(combine_text).fillna('') + ' ' +
                       df['comments'].apply(combine_text))

    # Lower + removing trailing charsa
    df['combined_text'] = df['combined_text'].str.lower().str.strip()
    
    # Normalize university names
    df['combined_text'] = df['combined_text'].apply(normalize_university)

    # Tokenize + lemmatize
    df['lemmatized_tokens'] = df['combined_text'].apply(tokenize_and_lemmatize)

    # Return cleaned df
    return df

df = clean_text(df)

df.head()

Unnamed: 0,date_created,title,description,comments,upvotes,flair,combined_text,lemmatized_tokens
1,2024-11-08 03:24:09,[ Removed by Reddit ],[ Removed by Reddit on account of violating th...,"['Ik this man, he’s getting cooked. Maybe uoft...",2929,Discussion,"['ik this man, he’s getting cooked. maybe uoft...","[man, getting, cooked, maybe, uoft, anything, ..."
4,2025-05-14 00:11:02,University of Toronto Faculty Association vote...,,"['Big W, honestly im surprised', 'W faculty', ...",2157,News,uoft faculty association votes to divest from ...,"[uoft, faculty, association, vote, divest, isr..."
13,2024-06-03 21:51:09,Just graduated at Convocation with encampment ...,"Just graduated and guess what, the encampment ...","[""that's so good to hear! i'm a huge supporter...",1612,Discussion,just graduated at convocation with encampment ...,"[graduated, convocation, encampment, present, ..."
22,2023-12-19 14:19:36,Is this MAT224 final average fr? (not my class...,,['overconfident library vegetable dinosaurs ab...,1284,Courses,is this mat224 final average fr? (not my class...,"[final, average, fr, class, friend, sent, libr..."
27,2024-07-04 00:49:45,All that’s left of the encampment after todays...,Photo by @a1please on instagram,"['the grass is fucked 😭', ""Out of the loop, wh...",1271,News,all that’s left of the encampment after todays...,"[left, encampment, today, event, photo, instag..."


## Investigating Diction by Flair

In [113]:
from collections import Counter

def get_topk_words(df, k):
    '''
    Takes a df and returns top k words from tokens
    '''
    all_tokens = [token for row in df['lemmatized_tokens'] for token in row]

    # Count frequencies
    token_counts = Counter(all_tokens)

    # Get top K words
    top_k = token_counts.most_common(k)

    return top_k

# Get 10 top most common words in each flair 
for flair in df['flair'].unique():
    print(f'{flair}\n{get_topk_words(df[df['flair'] == flair], 5)}')


Discussion
[('people', 3817), ('like', 3058), ('student', 2560), ('get', 2262), ('would', 1921)]
News
[('people', 699), ('student', 545), ('like', 378), ('get', 358), ('would', 343)]
Courses
[('course', 336), ('student', 262), ('like', 218), ('class', 211), ('time', 149)]
Academics
[('student', 374), ('course', 234), ('exam', 194), ('get', 181), ('people', 172)]
Advice
[('year', 765), ('time', 598), ('get', 465), ('people', 402), ('like', 397)]
HQ Post
[('people', 121), ('like', 79), ('time', 62), ('mcgill', 61), ('would', 59)]


In [114]:
df['joined_text'] = df['lemmatized_tokens'].apply(lambda tokens: ' '.join(tokens))

In [115]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.8, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(df['joined_text']).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names_out()

In [116]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 5


model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [118]:
model.fit(tf)

In [119]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [120]:
no_top_words = 15
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights
0,people,449.8,student,1690.3,people,2176.3,student,1600.3,school,920.5
1,would,433.5,course,1619.5,would,782.4,people,1344.9,people,875.6
2,tip,396.3,exam,994.2,think,761.7,think,606.1,work,808.9
3,go,385.8,prof,973.0,right,660.4,would,596.4,good,706.2
4,student,329.3,class,946.2,say,537.2,ubc,508.5,life,692.7
5,pay,324.6,would,740.5,thing,526.0,university,499.1,really,669.8
6,ubc,298.0,professor,543.7,want,488.2,uoft,429.3,feel,656.8
7,make,285.8,think,529.4,also,487.1,make,425.0,take,643.0
8,think,278.3,grade,481.8,make,457.9,school,398.5,first,634.9
9,place,268.3,people,475.4,really,447.6,deleted,395.2,friend,576.6
