In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:

# Load the cleaned comments; each converter receives the raw text of a cell
# in the named column.
cleaned_comments_df = pd.read_csv(
    os.path.join("static", "data", "cleaned_comments_239858.csv"),
    converters={
        # "['a', 'b']" -> ["a", "b"]: strip the brackets, drop the quotes, split on ", "
        "body_tokens": lambda raw: raw.strip("[]").replace("'", "").split(", "),
        "score": int,
    },
)

In [15]:
# Sanity check: (rows, columns) of the loaded comments frame.
cleaned_comments_df.shape

(235579, 6)

In [4]:
# Re-join every token list into one space-separated string, the input form
# the vectorizer is later fitted on.
cleaned_comments_df["body_tokens_spaced"] = cleaned_comments_df["body_tokens"].apply(" ".join)

In [5]:
# Bag-of-words counts: fit the vocabulary and build the document-term matrix
# from the space-joined token strings.
tf_vectorizer = CountVectorizer()
tf = tf_vectorizer.fit_transform(cleaned_comments_df["body_tokens_spaced"])

# Vocabulary terms, positioned so that tf_feature_names[j] names column j of tf.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new accessor when it exists and fall back
# on older releases. list(...) keeps the result a plain list either way.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = list(tf_vectorizer.get_feature_names())

In [6]:
# Number of latent topics to learn; reused below to label the topic columns.
n_topics = 249

# Build the estimator first, then fit it on the count matrix. fit() returns
# the estimator itself, so `lda` is the fitted model.
lda = LatentDirichletAllocation(n_components=n_topics, random_state=4)
lda.fit(tf)

In [9]:
import pickle

# Persist the fitted LDA model to disk.
# open(..., "wb") does not create missing parent directories, so make sure
# the target folder exists first.
os.makedirs("models", exist_ok=True)
with open(os.path.join("models", "reddit_LDA_model.pkf"), "wb") as f:
    # pickle.dump() returns None, so its result is deliberately not bound to a name.
    pickle.dump(lda, f)

In [7]:
def display_topics(model, feature_names, n_top_words):
    """Print every topic's index followed by its highest-weighted terms.

    Parameters
    ----------
    model : object exposing ``components_``, an iterable with one weight
        vector per topic (each vector must support ``argsort()``).
    feature_names : sequence mapping a term position to its text.
    n_top_words : int, how many of the top-weighted terms to print per topic.

    Output is two lines per topic: ``Topic: <index>`` and then the terms,
    separated by single spaces, in descending order of weight.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic: {topic_idx}")
        # argsort is ascending: reverse it, then keep the leading n_top_words positions.
        ranked_positions = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[position] for position in ranked_positions]
        print(" ".join(top_terms))

# How many of the highest-weighted terms to list for each topic.
n_top_words = 10
# Print the top terms of every topic in the fitted LDA model.
display_topics(lda, tf_feature_names, n_top_words)

Topic: 0
question ask history basic developer answer specific seek response possibility
Topic: 1
patch esp realistic stable legendary convenient overhaul career stability bolt
Topic: 2
money cost tax buy pay plastic fund spend return bottle
Topic: 3
cat private owner sims proud hype big volunteer rescue dependent
Topic: 4
kid eat parent fair holy crap tire enough younger peace
Topic: 5
phone fat fell healthy call bridge relative bang iphone appearance
Topic: 6
model paint daughter clue pirate texture delivery plug seat flash
Topic: 7
correct music hop video suggestion artist audio appropriate doe genre
Topic: 8
lol hero trip ridiculous murder manager even leak never bad
Topic: 9
thread topic argue review post question disagree ask direct please
Topic: 10
com imgur jpg png pool duty disk gallery injure bacteria
Topic: 11
must class extremely effort refer proper curse row low begin
Topic: 12
rat catch craft illegal layer coffee instant atleast hawk bio
Topic: 13
support steam account ban

hurt dumb bag trouble alive print anybody doe asian see
Topic: 152
enjoy latest grade beer objective atm wine awhile enjoyable walmart
Topic: 153
update april ship fool 1st til whoever scary entitle bios
Topic: 154
pay bill month payment insurance afford loan smile french need
Topic: 155
kill wave freeze economy perspective priority justify differently doe could
Topic: 156
spot police blind officer chase alliance dust bright horde rage
Topic: 157
amazon walk door respond bed bully neighbor apartment night attitude
Topic: 158
comment answer report delete item helpful repeat earn find see
Topic: 159
dollar intend million era shake belt symbol capitalism popularity pity
Topic: 160
value seriously union protect material genius goat cache mat 100k
Topic: 161
egg australia notification alt border permanent sample equally alternate province
Topic: 162
dude wish heart random could logic best scroll luck good
Topic: 163
lmao mate try yup rich trick frustrate ppl ignorant boom
Topic: 164
sick pa

In [10]:
# Topic weights for every comment, as returned by the fitted model for the
# same count matrix it was trained on.
lda_output = lda.transform(tf)

# column names
topicnames = ["Topic" + str(i) for i in range(n_topics)]

# index names
docnames = ["Doc" + str(i) for i in range(len(cleaned_comments_df))]

# Make the pandas dataframe (weights rounded to 2 decimals for display only)
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document.
# Use the unrounded weights: after rounding to 2 decimals, near-equal topics
# tie and argmax would return the lowest column index, not the true maximum.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [11]:
def color_blue(val):
    """Return a CSS ``color`` declaration: blue when ``val`` exceeds 0.1, white otherwise."""
    if val > .1:
        shade = 'blue'
    else:
        shade = 'white'
    return f'color: {shade}'

def make_bold(val):
    """Return a CSS ``font-weight`` declaration: 700 when ``val`` exceeds 0.1, 400 otherwise."""
    if val > .1:
        heaviness = 700
    else:
        heaviness = 400
    return f'font-weight: {heaviness}'

# Styled preview of the first 15 documents: both cell-wise formatters are
# applied to every cell, so values above 0.1 show up blue and bold.
df_document_topics = (
    df_document_topic
    .head(15)
    .style
    .applymap(color_blue)
    .applymap(make_bold)
)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,Topic30,Topic31,Topic32,Topic33,Topic34,Topic35,Topic36,Topic37,Topic38,Topic39,Topic40,Topic41,Topic42,Topic43,Topic44,Topic45,Topic46,Topic47,Topic48,Topic49,Topic50,Topic51,Topic52,Topic53,Topic54,Topic55,Topic56,Topic57,Topic58,Topic59,Topic60,Topic61,Topic62,Topic63,Topic64,Topic65,Topic66,Topic67,Topic68,Topic69,Topic70,Topic71,Topic72,Topic73,Topic74,Topic75,Topic76,Topic77,Topic78,Topic79,Topic80,Topic81,Topic82,Topic83,Topic84,Topic85,Topic86,Topic87,Topic88,Topic89,Topic90,Topic91,Topic92,Topic93,Topic94,Topic95,Topic96,Topic97,Topic98,Topic99,Topic100,Topic101,Topic102,Topic103,Topic104,Topic105,Topic106,Topic107,Topic108,Topic109,Topic110,Topic111,Topic112,Topic113,Topic114,Topic115,Topic116,Topic117,Topic118,Topic119,Topic120,Topic121,Topic122,Topic123,Topic124,Topic125,Topic126,Topic127,Topic128,Topic129,Topic130,Topic131,Topic132,Topic133,Topic134,Topic135,Topic136,Topic137,Topic138,Topic139,Topic140,Topic141,Topic142,Topic143,Topic144,Topic145,Topic146,Topic147,Topic148,Topic149,Topic150,Topic151,Topic152,Topic153,Topic154,Topic155,Topic156,Topic157,Topic158,Topic159,Topic160,Topic161,Topic162,Topic163,Topic164,Topic165,Topic166,Topic167,Topic168,Topic169,Topic170,Topic171,Topic172,Topic173,Topic174,Topic175,Topic176,Topic177,Topic178,Topic179,Topic180,Topic181,Topic182,Topic183,Topic184,Topic185,Topic186,Topic187,Topic188,Topic189,Topic190,Topic191,Topic192,Topic193,Topic194,Topic195,Topic196,Topic197,Topic198,Topic199,Topic200,Topic201,Topic202,Topic203,Topic204,Topic205,Topic206,Topic207,Topic208,Topic209,Topic210,Topic211,Topic212,Topic213,Topic214,Topic215,Topic216,Topic217,Topic218,Topic219,Topic220,Topic221,Topic222,Topic223,Topic224,Topic225,Topic226,Topic227,Topic228,Topic229,Topic230,Topic231,Topic232,To
pic233,Topic234,Topic235,Topic236,Topic237,Topic238,Topic239,Topic240,Topic241,Topic242,Topic243,Topic244,Topic245,Topic246,Topic247,Topic248,dominant_topic
Doc0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.14,0,0,0.0,0,0.14,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,0,0.14,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.45,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0,186
Doc1,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0,0,0.42,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.38,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0,67
Doc2,0,0,0.0,0.14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.26,0,0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,0,0.2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.1,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0.1,0.0,0,0,0,0.09,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0,35
Doc3,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.67,0.0,0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0,95
Doc4,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.48,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.17,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.19,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0,43
Doc5,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.39,0,0.0,0,0,0,0,0,0,0.0,0.31,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.17,0,0,0,0,0.0,0,184
Doc6,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0.47,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.2,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.17,0.0,0,0,0,0,0.0,0,146
Doc7,0,0,0.07,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.41,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.21,0,0.0,0.11,0,0.1,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0,57
Doc8,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0.13,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0.24,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0.0,0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.13,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.13,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.24,0,64
Doc9,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0.0,0,0.5,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0.0,0,98


In [None]:
# module to visualize topics
# NOTE(review): this import sits mid-notebook rather than in the top import cell.
# NOTE(review): newer pyLDAvis releases appear to expose this adapter as
# `pyLDAvis.lda_model` — confirm against the installed version.
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

# Build the visualisation from the fitted model, the document-term count
# matrix and the vectorizer that produced it.
# NOTE(review): mds='mmds' presumably selects the scaling method used to lay
# out the topics — confirm in the pyLDAvis docs.
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer, mds='mmds')

In [13]:
# Display the fitted estimator's repr to inspect its hyper-parameters.
lda

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=249, n_jobs=None,
                          perp_tol=0.1, random_state=4, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [14]:
# Sanity check: one row per document; one column per topic plus 'dominant_topic'.
df_document_topic.shape

(235579, 250)

In [25]:
# Number of comments in the "totalwar" subreddit. This is a label lookup on
# the value_counts() Series, so it raises KeyError if that subreddit is absent.
cleaned_comments_df["subreddit"].value_counts()["totalwar"]

995

In [23]:
# One accumulator per distinct subreddit: a float vector with one slot per
# LDA topic, starting at zero.
# The accumulators must be NumPy arrays, not Python lists: `acc += ndarray`
# on a list *extends* the list with the array's elements instead of adding
# element-wise. The length comes from n_topics rather than a repeated literal.
subreddits = set(cleaned_comments_df["subreddit"])
subreddit_topic_scores = {subreddit: np.zeros(n_topics) for subreddit in subreddits}

In [26]:
# Sum the per-comment topic weights within each subreddit.
#
# This is done as a single grouped sum instead of a Python loop over every
# row of lda_output, for three reasons:
#   * `scores[subreddit] += row` extends a list-valued entry rather than
#     adding element-wise;
#   * a label lookup into the DataFrame for each of the ~235k rows is slow;
#   * rebuilding the mapping makes the cell idempotent (re-running it does
#     not double-count).
# Row k of lda_output belongs to row k of cleaned_comments_df, so the
# subreddit column is passed positionally as the grouping key.
# Rows whose subreddit is missing (NaN) are dropped by groupby.
subreddit_topic_sums = (
    pd.DataFrame(lda_output)
    .groupby(cleaned_comments_df["subreddit"].values)
    .sum()
)

# subreddit name -> 1-D array holding one summed weight per topic
subreddit_topic_scores = {
    subreddit: topic_sums.values
    for subreddit, topic_sums in subreddit_topic_sums.iterrows()
}

KeyboardInterrupt: 