In [1]:
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pyLDAvis.gensim_models
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

In [7]:
def load(file):

    df = pd.read_csv(file, nrows=100)
    content = df['content']
    return content

In [8]:
def token_filter(token):
    return not (token.is_punct | token.is_space | token.is_stop | 
                len(token.text) <= 2 | token.is_ascii | token.like_url | 
                token.like_num | token.like_email)

In [9]:
def clean(content):
    nlp = spacy.load("en_core_web_sm") # could exclude things like
    # tagger, ner, etc. 
    
    filtered_tokens = [] 
    # uses list() for batch procesing
    for i in list(nlp.pipe(content)):
        tokens = [token.lemma_.lower() for token in i if
    token_filter(token)]
        filtered_tokens.append(tokens)

    return filtered_tokens

In [12]:
def run_lda(filtered_tokens):
    dictionary = Dictionary(filtered_tokens)
    dictionary.filter_extremes(
        no_below=5, 
        no_above=0.5 
        #keep_n=1000
    )
    corpus = [dictionary.doc2bow(doc) for doc in filtered_tokens]
    lda_model = LdaMulticore(
        corpus=corpus, 
        id2word=dictionary, 
        iterations=100, 
        num_topics=16, 
        workers = 2, 
        passes=100
    )

    return lda_model.print_topics(-1)

In [11]:
content = load("lgbtcol.csv")
cleaned = clean(content) 
run_lda(cleaned)

[(0,
  '0.034*"view" + 0.030*"soulforce" + 0.022*"post" + 0.021*"forums" + 0.019*"community" + 0.013*"press" + 0.013*"task" + 0.012*"force" + 0.012*"search" + 0.012*"forum"'),
 (1,
  '0.026*"glaad" + 0.026*"prop" + 0.019*"post" + 0.018*"marriage" + 0.017*"what" + 0.016*"california" + 0.015*"july" + 0.015*"watch" + 0.014*"blog" + 0.014*"monday"'),
 (2,
  '0.019*"life" + 0.014*"they" + 0.012*"abortion" + 0.010*"their" + 0.009*"’s" + 0.008*"child" + 0.008*"say" + 0.008*"be" + 0.008*"will" + 0.007*"family"'),
 (3,
  '0.021*"photo" + 0.011*"campaign" + 0.011*"learn" + 0.011*"group" + 0.011*"comment" + 0.011*"click" + 0.011*"sign" + 0.011*"donate" + 0.011*"help" + 0.011*"creative"'),
 (4,
  '0.030*"say" + 0.026*"reply" + 0.026*"2015" + 0.025*"january" + 0.021*"chief" + 0.019*"right" + 0.018*"mayor" + 0.012*"fire" + 0.011*"’s" + 0.011*"what"'),
 (5,
  '0.027*"cabin" + 0.025*"republicans" + 0.024*"pride" + 0.020*"calendar" + 0.019*"center" + 0.016*"chapter" + 0.014*"event" + 0.014*"events" + 0