# Topic Modeling - BERT

#### The Reddit texts are too short for LDA to produce coherent, interpretable topics, so let's try BERT-based topic modeling instead.

In [1]:
import pandas as pd
import numpy as np
import spacy

  return torch._C._cuda_getDeviceCount() > 0


#### Read in data

In [2]:
from ast import literal_eval
def converter(x):
    """Parse the string form of a Python literal back into the object it represents.

    Thin wrapper around ``ast.literal_eval`` so it can be handed to
    ``pd.read_csv(converters=...)`` for columns that hold stringified lists.
    """
    parsed = literal_eval(x)
    return parsed

# Full corpus; the two token columns are parsed back into Python objects by `converter`.
full_df = pd.read_csv(
    '../data/full_df_processed.csv',
    converters={'tokens_new': converter, 'normalized_tokens': converter},
)

# Posts and comments that were NOT scraped from r/asianamerican (same column parsing).
non_aa_df = pd.read_csv(
    '../data/not_asianamerican_df.csv',
    converters={'tokens_new': converter, 'normalized_tokens': converter},
)

#### Word Embeddings

We need to tokenize by sentence rather than by individual word for CBOW or skip-gram training to make sense.

In [3]:
full_df.head(5)

Unnamed: 0.1,Unnamed: 0,id,parent_id,username,time_created,flair,body,subreddit,tokens_new,word_count,normalized_tokens,normalized_tokens_count,subreddit_name,sr_is_asian
0,0,14m8mf4,,Tungsten_,2023-06-29 10:54:44,,[Megathread] Supreme Court Ruling on Affirmati...,t5_2rfyw,"[Megathread, Supreme, Court, Ruling, on, Affir...",78,"[megathread, supreme, court, ruling, affirmati...",62,asianamerican,True
1,1,jq5du0z,t3_14m8mf4,Tungsten_,2023-06-30 11:33:11,,Thanks to everyone who engaged in insightful a...,t5_2rfyw,"[Thanks, to, everyone, who, engaged, in, insig...",20,"[thank, engage, insightful, respectful, discou...",9,asianamerican,True
2,2,jq0dgzx,t3_14m8mf4,ProudBlackMatt,2023-06-29 11:16:15,Chinese-American,I would prefer using a process that takes into...,t5_2rfyw,"[I, would, prefer, using, a, process, that, ta...",103,"[prefer, process, take, account, poverty, inst...",52,asianamerican,True
3,3,jq0cg7k,t3_14m8mf4,TomatoCanned,2023-06-29 11:09:47,,"u/Tungsten_, Thanks for creating a section jus...",t5_2rfyw,"[u/Tungsten_,, Thanks, for, creating, a, secti...",269,"[u/tungsten_,, thank, create, section, discuss...",126,asianamerican,True
4,4,jq0f52k,t3_14m8mf4,bad-fengshui,2023-06-29 11:26:41,,As with anything related to Asians in politics...,t5_2rfyw,"[As, with, anything, related, to, Asians, in, ...",59,"[relate, asians, politic, m, see, lot, non, as...",25,asianamerican,True


In [4]:
# Start from an independent deep copy so full_df is left untouched, then discard the
# leftover 'Unnamed: 0' column and the old word-level token columns.
full_df_new = full_df.copy(deep=True).drop(
    columns=['Unnamed: 0', 'tokens_new', 'normalized_tokens', 'normalized_tokens_count']
)

In [5]:
# Load spaCy's small English pipeline once; it is the default `model` argument of the
# tokenizing/normalizing helper functions defined in the following cells.
nlp = spacy.load("en_core_web_sm")

In [6]:
def word_tokenize(word_list, model=nlp, MAX_LEN=1500000):
    '''
    Split text into word tokens, dropping punctuation and whitespace-only tokens.

    Parameters
    ----------
    word_list : str or list
        Text to tokenize. Despite the name this is normally a string. A list is
        also accepted: a single-element list is unwrapped to its only element,
        and any other list is joined into one space-separated string.
    model : spaCy pipeline
        Pipeline whose tokenizer is used (defaults to the notebook-wide `nlp`).
    MAX_LEN : int
        Value assigned to `model.max_length` before processing. Note that this
        is set on the model object itself, so it stays in effect after the call.

    Returns
    -------
    list of str
        The text of every token that is not punctuation and is not blank.
    '''
    if isinstance(word_list, list) and len(word_list) == 1:
        word_list = word_list[0]

    if isinstance(word_list, list):
        word_list = ' '.join(str(elem) for elem in word_list)

    model.max_length = MAX_LEN
    # Only the tokenizer's output is needed here, so skip every other pipeline component.
    # Disabling just parser/tagger/ner would still run tok2vec, the attribute ruler and
    # the lemmatizer (which complains about missing POS tags once the tagger is off).
    # Names that are not present in the pipeline are simply ignored.
    doc = model(
        word_list,
        disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"],
    )

    return [
        token.text
        for token in doc
        if not token.is_punct and len(token.text.strip()) > 0
    ]

In [7]:
def sent_tokenize(word_list, model=nlp):
    """Split a text into sentences.

    Runs `model` on `word_list` (a string, despite the name) and returns the
    whitespace-stripped text of each sentence span in `doc.sents`, in order.
    """
    return [sentence.text.strip() for sentence in model(word_list).sents]

In [8]:
def normalizeTokens(word_list, extra_stop=[], model=nlp, lemma=True, MAX_LEN=1500000):
    '''
    Lower-case a text and return its content tokens.

    Stop words, punctuation, number-like tokens, bare newlines and blank tokens
    are dropped; the remaining tokens are returned as lemmas or as surface text.

    Parameters
    ----------
    word_list : str or list
        Text to normalize. A single-element list is unwrapped to its only
        element; any other list is joined into one space-separated string.
    extra_stop : iterable of str, optional
        Additional words to treat as stop words. They are flagged on the vocab
        of `model`, so the flag persists for later calls using that model.
        The default list is never mutated; `None` is treated as empty.
    model : spaCy pipeline
        Pipeline used for processing (defaults to the notebook-wide `nlp`).
    lemma : bool
        If True return each kept token's lemma, otherwise its stripped text.
    MAX_LEN : int
        Value assigned to `model.max_length` before processing; it is set on
        the model object itself and therefore stays in effect after the call.

    Returns
    -------
    list of str
    '''
    if isinstance(word_list, list) and len(word_list) == 1:
        word_list = word_list[0]

    if isinstance(word_list, list):
        word_list = ' '.join(str(elem) for elem in word_list)

    # Flag the extra stop words on the vocab of the model that is actually used
    # (not the global `nlp`), so they take effect when a different model is passed.
    for stopword in extra_stop or ():
        model.vocab[stopword].is_stop = True

    model.max_length = MAX_LEN
    if lemma:
        # Lemmas need the tagging components; only the parser and NER can be skipped.
        disabled = ["parser", "ner"]
    else:
        # Surface text plus the lexical flags checked below come straight from the
        # tokenizer/vocab, so no statistical component has to run at all.
        disabled = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"]
    doc = model(word_list.lower(), disable=disabled)

    # Keep tokens that are not a bare newline, stop word, punctuation mark,
    # number-like token or blank.
    kept = [
        w for w in doc
        if w.text != '\n'
        and not w.is_stop
        and not w.is_punct
        and not w.like_num
        and len(w.text.strip()) > 0
    ]

    # Decide once whether lemmas are wanted rather than re-checking per token.
    if lemma:
        return [str(w.lemma_) for w in kept]
    return [str(w.text.strip()) for w in kept]

In [9]:
# Split every post body into sentences, then word-tokenize each sentence
# (takes about ten and a half minutes to run).
full_df_new['tokens'] = full_df['body'].apply(
    lambda body: [word_tokenize(sentence) for sentence in sent_tokenize(body)]
)



In [17]:
full_df_new

AttributeError: 'NoneType' object has no attribute 'items'

      Unnamed: 0       id   parent_id              username  \
0              0  14m8mf4         NaN             Tungsten_   
1              1  jq5du0z  t3_14m8mf4             Tungsten_   
2              2  jq0dgzx  t3_14m8mf4        ProudBlackMatt   
3              3  jq0cg7k  t3_14m8mf4          TomatoCanned   
4              4  jq0f52k  t3_14m8mf4          bad-fengshui   
...          ...      ...         ...                   ...   
4515        4515   osjkh6         NaN             yellowmix   
4516        4516  iw0q5sn   t3_yr5o90             yellowmix   
4517        4517   uyzxgz         NaN      Kamala_Metamorph   
4518        4518  l20ftbs  t3_1cfw1ru  Extension_River_9901   
4519        4519   1khnmw         NaN              Swordbow   

             time_created             flair  \
0     2023-06-29 10:54:44               NaN   
1     2023-06-30 11:33:11               NaN   
2     2023-06-29 11:16:15  Chinese-American   
3     2023-06-29 11:09:47               NaN   
4     20

In [10]:
# Normalize each sentence's tokens without lemmatizing
# (takes about a minute and a half to run).
full_df_new['normalized_tokens'] = full_df_new['tokens'].apply(
    lambda sentences: [normalizeTokens(sentence, lemma=False) for sentence in sentences]
)

In [11]:
# Persist the sentence-level frame for later use.
# NOTE(review): to_csv writes the row index by default, which is presumably where the
# 'Unnamed: 0' columns seen after reloading come from — consider index=False once
# downstream readers have been checked.
full_df_new.to_csv('../data/full_df_processed_sent.csv')

In [12]:
full_df_new

Unnamed: 0,id,parent_id,username,time_created,flair,body,subreddit,word_count,subreddit_name,sr_is_asian,tokens,normalized_tokens
0,14m8mf4,,Tungsten_,2023-06-29 10:54:44,,[Megathread] Supreme Court Ruling on Affirmati...,t5_2rfyw,78,asianamerican,True,"[[Megathread, Supreme, Court, Ruling, on, Affi...","[[megathread, supreme, court, ruling, affirmat..."
1,jq5du0z,t3_14m8mf4,Tungsten_,2023-06-30 11:33:11,,Thanks to everyone who engaged in insightful a...,t5_2rfyw,20,asianamerican,True,"[[Thanks, to, everyone, who, engaged, in, insi...","[[thanks, engaged, insightful, respectful, dis..."
2,jq0dgzx,t3_14m8mf4,ProudBlackMatt,2023-06-29 11:16:15,Chinese-American,I would prefer using a process that takes into...,t5_2rfyw,103,asianamerican,True,"[[I, would, prefer, using, a, process, that, t...","[[prefer, process, takes, account, poverty, in..."
3,jq0cg7k,t3_14m8mf4,TomatoCanned,2023-06-29 11:09:47,,"u/Tungsten_, Thanks for creating a section jus...",t5_2rfyw,269,asianamerican,True,"[[u, Tungsten, Thanks, for, creating, a, secti...","[[u, tungsten, thanks, creating, section, disc..."
4,jq0f52k,t3_14m8mf4,bad-fengshui,2023-06-29 11:26:41,,As with anything related to Asians in politics...,t5_2rfyw,59,asianamerican,True,"[[As, with, anything, related, to, Asians, in,...","[[related, asians, politics, m, seeing, lot, n..."
...,...,...,...,...,...,...,...,...,...,...,...,...
4515,osjkh6,,yellowmix,2021-07-27 09:32:14,,This Is the End of Affirmative Action. What ar...,t5_2qhgd,15,racism,False,"[[This, Is, the, End, of, Affirmative, Action]...","[[end, affirmative, action], [going]]"
4516,iw0q5sn,t3_yr5o90,yellowmix,2022-11-12 01:09:36,,What do you mean? That nepotism comes from whi...,t5_2qhgd,32,racism,False,"[[What, do, you, mean], [That, nepotism, comes...","[[mean], [nepotism, comes, white, supremacy], ..."
4517,uyzxgz,,Kamala_Metamorph,2022-05-27 14:52:16,​,How to have a conversation with an open-minded...,t5_38jid,237,MensLib,False,"[[How, to, have, a, conversation, with, an, op...","[[conversation, open, minded, disadvantaged, g..."
4518,l20ftbs,t3_1cfw1ru,Extension_River_9901,2024-04-30 22:56:39,New user,Democrats that want to expand education .Fun...,t5_3amv4,349,aznidentity,True,"[[Democrats, that, want, to, expand, education...","[[democrats, want, expand, education, .funding..."


### Word Embeddings

In [16]:
import gensim

ImportError: cannot import name 'suppress_warnings' from 'numpy.testing' (unknown location)