# Topic Analysis

In [2]:
import re

import pandas as pd
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora
import en_core_web_sm
nlp = en_core_web_sm.load()

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

stop_words = set(stopwords.words('english'))

In [3]:
# Cut-off timestamps separating the three periods compared in this notebook.
# Kept as strings so the comparisons against the "time" column behave exactly
# as the original string literals did.
DURING_START = "2020-12-01 00:00:00"
AFTER_START = "2022-09-15 00:00:00"

data = pd.read_csv("data/q_ethereum_merge_reddit_full10.csv")
data["time"] = pd.to_datetime(data["time"])

# Half-open intervals [start, end): every row with a valid timestamp lands in
# exactly one subset. The previous strict comparisons on both sides silently
# dropped rows stamped exactly on a cut-off.
data_before = data[data["time"] < DURING_START]
data_during = data[(data["time"] >= DURING_START) & (data["time"] < AFTER_START)]
data_after = data[data["time"] >= AFTER_START]

print("before: " + str(len(data_before)))
print("during: " + str(len(data_during)))
print("after: " + str(len(data_after)))

before: 78
during: 799
after: 145


In [7]:
# Average text length in words (whitespace-separated) across all posts.
words_per_text = data["text"].apply(lambda text: len(text.split()))
print(words_per_text.mean())

647.8825831702544


In [3]:
def remove_URL(sample):
    """Return `sample` with every URL-like span removed.

    A span is anything starting with "http" followed by at least one
    non-whitespace character; it is cut up to the next whitespace.
    """
    url_pattern = r"http\S+"
    return re.sub(url_pattern, "", sample)

def sub_abb(sample):
    """Expand the standalone abbreviation "eth" to "ethereum".

    Only occurrences that are not surrounded by other ASCII letters are
    replaced, so words that merely contain the letters (e.g. "ethereum",
    "something", "together") are left untouched. Matching is case-insensitive
    because this runs before the text is lower-cased; digits next to the
    abbreviation (e.g. "eth2") do not block the expansion.

    Args:
        sample: Text to process.

    Returns:
        The text with each standalone "eth" replaced by "ethereum".
    """
    # Letter-based lookarounds instead of a bare "eth": the unanchored pattern
    # turned "ethereum" into "ethereumereum" and corrupted unrelated words.
    return re.sub(r"(?<![a-z])eth(?![a-z])", "ethereum", sample, flags=re.IGNORECASE)

def tokenizer(texts):
    """Lazily yield one token list per input text.

    Each text has its URLs stripped (`remove_URL`) and the "eth" abbreviation
    expanded (`sub_abb`) before being split by gensim's `simple_preprocess`
    with `deacc=True`.
    """
    for raw in texts:
        cleaned = sub_abb(remove_URL(raw))
        yield simple_preprocess(cleaned, deacc=True)


# Tokenize every post once; both phrase models below are fitted on this full corpus.
tokens_list = [doc_tokens for doc_tokens in tokenizer(data['text'])]

# Bigram detector fitted on the plain tokens, plus its Phraser wrapper used by make_bigrams.
bigram = gensim.models.Phrases(tokens_list, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigram detector fitted on the bigram-merged token stream, plus its Phraser wrapper.
trigram = gensim.models.Phrases(bigram[tokens_list], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def remove_stopwords(texts):
    """Return a new list of documents with every token found in `stop_words` removed."""
    filtered_docs = []
    for doc in texts:
        kept = [word for word in doc if word not in stop_words]
        filtered_docs.append(kept)
    return filtered_docs

def make_bigrams(texts):
    """Apply the fitted bigram phraser (`bigram_mod`) to each document in `texts`."""
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram_mod[doc])
    return merged_docs

def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to each document in `texts`."""
    merged_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ')):
    """Lemmatize tokenized documents, keeping only tokens with selected POS tags.

    Each document's tokens are joined with single spaces, parsed by the
    module-level spaCy pipeline `nlp`, and reduced to the lemmas of the tokens
    whose coarse part-of-speech tag (`token.pos_`) is in `allowed_postags`.

    Args:
        texts: Iterable of documents, each an iterable of token strings.
        allowed_postags: Container of POS tags to keep. The default is a tuple
            rather than a list so the shared default cannot be mutated.

    Returns:
        A list with one list of lemma strings per input document, in order.
    """
    joined_docs = (" ".join(doc) for doc in texts)
    # nlp.pipe processes the texts as a batched stream, which is spaCy's
    # recommended way to parse many documents, and yields Docs in input order.
    return [
        [token.lemma_ for token in parsed if token.pos_ in allowed_postags]
        for parsed in nlp.pipe(joined_docs)
    ]

In [4]:
# Full-corpus preprocessing: stop-word removal -> phrase merging -> lemmatization.
# NOTE: `tokens_list` is overwritten here, so re-running this cell without
# re-running the tokenization cell re-processes already processed tokens.
tokens_list = make_trigrams(make_bigrams(remove_stopwords(tokens_list)))
lemmas = lemmatization(tokens_list)

# Vocabulary mapping each lemma to an integer id.
dictionary = corpora.Dictionary(lemmas)
# Document-term matrix: one bag-of-words vector per document.
corpus = list(map(dictionary.doc2bow, lemmas))

In [5]:
# Fit a 4-topic LDA model on the whole corpus (all periods together).
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=4,
    alpha='auto',
    passes=10,
    chunksize=500,
    update_every=1,
    eval_every=20,
    random_state=1,
    per_word_topics=False,
)

# Build the interactive pyLDAvis view; the bare last expression renders it in the notebook.
lda_viz = gensimvis.prepare(lda_model, corpus, dictionary)
lda_viz

  default_term_info = default_term_info.sort_values(


## Before

In [6]:
# Preprocess the "before" subset with the same steps as the full corpus:
# tokenization, stop-word removal, phrase merging, lemmatization.
tokens_before = remove_stopwords(tokenizer(data_before['text']))
tokens_before = make_trigrams(make_bigrams(tokens_before))
lemmas_before = lemmatization(tokens_before)

# Vocabulary and document-term matrix restricted to this period.
dictionary_before = corpora.Dictionary(lemmas_before)
corpus_before = list(map(dictionary_before.doc2bow, lemmas_before))

# 4-topic LDA model for the "before" period.
lda_model_before = gensim.models.LdaModel(
    corpus=corpus_before,
    id2word=dictionary_before,
    num_topics=4,
    alpha='auto',
    passes=10,
    chunksize=500,
    update_every=1,
    eval_every=20,
    random_state=100,
    per_word_topics=True,
)

# Interactive pyLDAvis view; the bare last expression renders it in the notebook.
lda_viz_before = gensimvis.prepare(lda_model_before, corpus_before, dictionary_before)
lda_viz_before

  default_term_info = default_term_info.sort_values(


## During

In [7]:
# Preprocess the "during" subset with the same steps as the full corpus:
# tokenization, stop-word removal, phrase merging, lemmatization.
tokens_during = remove_stopwords(tokenizer(data_during['text']))
tokens_during = make_trigrams(make_bigrams(tokens_during))
lemmas_during = lemmatization(tokens_during)

# Vocabulary and document-term matrix restricted to this period.
dictionary_during = corpora.Dictionary(lemmas_during)
corpus_during = list(map(dictionary_during.doc2bow, lemmas_during))

# 6-topic LDA model for the "during" period.
lda_model_during = gensim.models.LdaModel(
    corpus=corpus_during,
    id2word=dictionary_during,
    num_topics=6,
    alpha='auto',
    passes=10,
    chunksize=500,
    update_every=1,
    eval_every=20,
    random_state=100,
    per_word_topics=True,
)

# Interactive pyLDAvis view; the bare last expression renders it in the notebook.
lda_viz_during = gensimvis.prepare(lda_model_during, corpus_during, dictionary_during)
lda_viz_during

  default_term_info = default_term_info.sort_values(


## After

In [8]:
# Preprocess the "after" subset with the same steps as the full corpus:
# tokenization, stop-word removal, phrase merging, lemmatization.
tokens_after = remove_stopwords(tokenizer(data_after['text']))
tokens_after = make_trigrams(make_bigrams(tokens_after))
lemmas_after = lemmatization(tokens_after)

# Vocabulary and document-term matrix restricted to this period.
dictionary_after = corpora.Dictionary(lemmas_after)
corpus_after = list(map(dictionary_after.doc2bow, lemmas_after))

# 6-topic LDA model for the "after" period.
lda_model_after = gensim.models.LdaModel(
    corpus=corpus_after,
    id2word=dictionary_after,
    num_topics=6,
    alpha='auto',
    passes=10,
    chunksize=500,
    update_every=1,
    eval_every=20,
    random_state=100,
    per_word_topics=True,
)

# Interactive pyLDAvis view; the bare last expression renders it in the notebook.
lda_viz_after = gensimvis.prepare(lda_model_after, corpus_after, dictionary_after)
lda_viz_after

  default_term_info = default_term_info.sort_values(
