In [1]:
import string

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pandas as pd
np.random.seed(2018)

my_stop_words = STOPWORDS.union(set(['httpstco']))

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
stemmer = SnowballStemmer('english')
punct_str = '''!"$%&'()*+,-./:;<=>?[\]^_`{|}~'''

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


def preprocess(text):
    text = text.replace('\\n',' ')
    text = text.translate(str.maketrans(' ', ' ', punct_str))
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in my_stop_words and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [4]:
podcast1 = pd.read_csv('./pod_tweets/follower_twts/KnowledgeFight.csv')
podcast2 = pd.read_csv('./pod_tweets/follower_twts/SeincastASeinfeldPodcast.csv')
data = podcast1.append(podcast2, ignore_index=True)
data = data[data.tweets != '[]']
data

FileNotFoundError: [Errno 2] File b'./follower_twts/KnowledgeFight.csv' does not exist: b'./follower_twts/KnowledgeFight.csv'

In [132]:
data_text = data[['tweets']]
data_text['index'] = data_text.index
documents = data_text

print(len(documents))
print(documents[:5])

8600
                                              tweets  index
0  ['Just a thought: if we replaced the police wi...      0
1  ['What the actual fuck? https://t.co/RB6QH6AmK...      1
3  ['@knowledge_fight Listening to your 14 Aug 20...      3
4  ['@Communism_Kills Boring', '@JasonVarheinum @...      4
5  ['@milesofgray sir, I feel dumb, but a guest f...      5


In [133]:
doc_sample = documents[documents['index'] == 0].values[0][0]

punct_str = '''!"$%&'()*+,-./:;<=>?[\]^_`{|}~'''

# WordNetLemmatizer().lemmatize(doc_sample, pos='v')
print('--original document: ')
doc_sample = doc_sample.replace('\\n',' ')
print(doc_sample)
print('--rm punc:')
doc_sample = doc_sample.translate(str.maketrans(' ', ' ', punct_str))
print(doc_sample)
print('--Separated:')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

--original document: 
['Just a thought: if we replaced the police with furries, one immediate benefit we would see is less dead dogs.', "@KelsCara @PugetSoundJBGC @IwriteOK said it the best in his latest episode of @bastardspod. Everyone knows there's something wrong and that they should be angry at something. It's just that folks like these have been tricked into thinking it's the left that is causing all their problems.", '@BernieSanders https://t.co/v3ALhSdZhS', "@IwriteOK There's something about a book. I can't bear to throw them away. I mourn the ones that have been lost in one of my many moves.", "@BillyWayneDavis Do you want a robot uprising? That's for sure how you get a robot uprising.", '@TheZoneCast @peacockTV https://t.co/Pns0UvwDEf', '@katystoll Katy, we need you as a juror! Sane, logical people that understand how unfair the legal system is.', 'This will keep me goin for a goos bit more. https://t.co/PBAKBqdEK2', 'I’m telling racists to go back to Europe all 2020', '@bast

In [134]:
processed_docs = documents['tweets'].map(preprocess)
processed_docs[:10]

0     [think, replac, polic, furri, immedi, benefit,...
1     [actual, fuck, httpstcorb, yike, mean, know, f...
3     [knowledgefight, listen, talk, ashyana, dean, ...
4     [communismkil, bore, jasonvarheinum, cernovich...
5     [milesofgray, feel, dumb, guest, fair, recent,...
6     [good, block, work, memori, forget, password, ...
7     [enter, chanc, custom, robeytech, coolermast, ...
8     [care, endors, berni, sander, point, matter, c...
11    [thedailybeast, aint, cheif, jkenney, like, di...
12    [planter, tell, stori, death, peanut, zora, th...
Name: tweets, dtype: object

In [135]:
dictionary = gensim.corpora.Dictionary(processed_docs)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
print('dictionary len: {}'.format(len(dictionary)))

0 alhsdzh
1 angri
2 away
3 bastardspod
4 bear
5 benefit
6 berniesand
7 best
8 billywaynedavi
9 book
10 caus
dictionary len: 126733


In [144]:
dictionary.filter_extremes(no_below=30, no_above=0.3, keep_n=100000)
print('dictionary len: {}'.format(len(dictionary)))

dictionary len: 2755


In [145]:
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break

0 angri


In [146]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(28, 2),
 (75, 1),
 (84, 1),
 (96, 1),
 (108, 1),
 (131, 1),
 (147, 2),
 (151, 1),
 (163, 1),
 (176, 1),
 (181, 1),
 (187, 1),
 (195, 1),
 (222, 1),
 (231, 1),
 (253, 2),
 (260, 1),
 (274, 1),
 (283, 1),
 (284, 1),
 (287, 1),
 (292, 2),
 (306, 1),
 (313, 1),
 (349, 1),
 (353, 1),
 (374, 1),
 (384, 1),
 (405, 1),
 (420, 1),
 (497, 1),
 (501, 1),
 (511, 1),
 (517, 1),
 (551, 1),
 (562, 1),
 (651, 1),
 (695, 1),
 (696, 1),
 (703, 1),
 (725, 1),
 (745, 2),
 (757, 1),
 (797, 1),
 (800, 1),
 (811, 1),
 (833, 1),
 (850, 1),
 (938, 1),
 (953, 1),
 (994, 1),
 (1024, 1),
 (1056, 1),
 (1080, 1),
 (1087, 1),
 (1107, 1),
 (1130, 1),
 (1175, 1),
 (1186, 1),
 (1210, 1),
 (1216, 2),
 (1240, 1),
 (1275, 1),
 (1281, 1),
 (1313, 1),
 (1324, 1),
 (1331, 1),
 (1397, 1),
 (1487, 2),
 (1594, 1),
 (1601, 2),
 (1626, 1),
 (1710, 1),
 (1721, 1),
 (2064, 1),
 (2132, 1),
 (2257, 1),
 (2273, 1),
 (2349, 2),
 (2375, 1),
 (2392, 1)]

In [147]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(
            bow_doc_4310[i][0], dictionary[bow_doc_4310[i][0]], bow_doc_4310[i][1]))

Word 28 ("peopl") appears 2 time.
Word 75 ("carri") appears 1 time.
Word 84 ("final") appears 1 time.
Word 96 ("promis") appears 1 time.
Word 108 ("year") appears 1 time.
Word 131 ("announc") appears 1 time.
Word 147 ("give") appears 2 time.
Word 151 ("help") appears 1 time.
Word 163 ("pete") appears 1 time.
Word 176 ("tri") appears 1 time.
Word 181 ("absolut") appears 1 time.
Word 187 ("christma") appears 1 time.
Word 195 ("httpstcof") appears 1 time.
Word 222 ("celebr") appears 1 time.
Word 231 ("evil") appears 1 time.
Word 253 ("live") appears 2 time.
Word 260 ("nation") appears 1 time.
Word 274 ("spend") appears 1 time.
Word 283 ("valley") appears 1 time.
Word 284 ("watch") appears 1 time.
Word 287 ("week") appears 1 time.
Word 292 ("world") appears 2 time.
Word 306 ("hous") appears 1 time.
Word 313 ("presid") appears 1 time.
Word 349 ("role") appears 1 time.
Word 353 ("state") appears 1 time.
Word 374 ("death") appears 1 time.
Word 384 ("honor") appears 1 time.
Word 405 ("statemen

In [148]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.17350044118710398),
 (1, 0.09933709763028145),
 (2, 0.36277649009686225),
 (3, 0.13273029030096511),
 (4, 0.16391450080290204),
 (5, 0.14462607848588757),
 (6, 0.07694368848606319),
 (7, 0.10897447863651102),
 (8, 0.1306301333571147),
 (9, 0.1325866245361997),
 (10, 0.16263225517441865),
 (11, 0.10239518200268184),
 (12, 0.19870211772407972),
 (13, 0.12200989557763872),
 (14, 0.13360383723039101),
 (15, 0.1454207575513756),
 (16, 0.16048709148144993),
 (17, 0.2491899220729396),
 (18, 0.18875717972085881),
 (19, 0.04956622360964723),
 (20, 0.1510338232847544),
 (21, 0.08895324949750409),
 (22, 0.15734242484404992),
 (23, 0.18517863072712085),
 (24, 0.09628401138610057),
 (25, 0.13682912790003307),
 (26, 0.06042565213969678),
 (27, 0.14855758305081965),
 (28, 0.05009031210129303),
 (29, 0.144040749115548),
 (30, 0.11989615728578926),
 (31, 0.14790924083322554),
 (32, 0.16952359100702066),
 (33, 0.06329896985689006),
 (34, 0.08747481885526978),
 (35, 0.07737961569320295),
 (36, 0.0

In [149]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=6, id2word=dictionary, passes=2, workers=2)

In [150]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.011*"time" + 0.010*"follow" + 0.009*"game" + 0.008*"want" + 0.007*"chanc" + 0.007*"know" + 0.007*"think" + 0.006*"watch" + 0.006*"need" + 0.006*"good"
Topic: 1 
Words: 0.012*"think" + 0.010*"great" + 0.008*"peopl" + 0.007*"look" + 0.007*"time" + 0.006*"realdonaldtrump" + 0.006*"say" + 0.006*"know" + 0.005*"work" + 0.005*"year"
Topic: 2 
Words: 0.013*"peopl" + 0.010*"think" + 0.009*"year" + 0.008*"know" + 0.006*"trump" + 0.006*"time" + 0.005*"want" + 0.005*"good" + 0.005*"need" + 0.005*"go"
Topic: 3 
Words: 0.012*"year" + 0.008*"know" + 0.007*"time" + 0.007*"today" + 0.007*"love" + 0.006*"team" + 0.006*"happi" + 0.006*"great" + 0.006*"right" + 0.006*"think"
Topic: 4 
Words: 0.011*"podcast" + 0.010*"go" + 0.009*"episod" + 0.008*"time" + 0.008*"love" + 0.008*"game" + 0.006*"play" + 0.006*"good" + 0.006*"year" + 0.006*"come"
Topic: 5 
Words: 0.010*"peopl" + 0.010*"trump" + 0.007*"say" + 0.007*"berni" + 0.007*"know" + 0.007*"want" + 0.007*"time" + 0.006*"thing" + 0.006*"g

In [151]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.003*"peopl" + 0.003*"think" + 0.003*"know" + 0.003*"podcast" + 0.003*"time" + 0.003*"go" + 0.003*"year" + 0.003*"love" + 0.003*"thank" + 0.002*"need"
Topic: 1 Word: 0.003*"love" + 0.003*"good" + 0.003*"think" + 0.003*"time" + 0.003*"year" + 0.002*"happi" + 0.002*"need" + 0.002*"right" + 0.002*"say" + 0.002*"peopl"
Topic: 2 Word: 0.004*"trump" + 0.003*"peopl" + 0.003*"know" + 0.002*"think" + 0.002*"love" + 0.002*"time" + 0.002*"good" + 0.002*"year" + 0.002*"game" + 0.002*"fuck"
Topic: 3 Word: 0.003*"love" + 0.003*"thank" + 0.003*"think" + 0.003*"year" + 0.003*"time" + 0.003*"know" + 0.002*"thing" + 0.002*"peopl" + 0.002*"look" + 0.002*"follow"
Topic: 4 Word: 0.003*"trump" + 0.003*"love" + 0.003*"go" + 0.003*"peopl" + 0.003*"time" + 0.003*"year" + 0.003*"thank" + 0.003*"think" + 0.003*"game" + 0.002*"retweet"
Topic: 5 Word: 0.003*"think" + 0.003*"game" + 0.003*"love" + 0.003*"look" + 0.003*"peopl" + 0.003*"know" + 0.003*"good" + 0.002*"time" + 0.002*"year" + 0.002*"thank