In [1]:
import pandas as pd
pd.set_option("display.max_colwidth", 40)
import numpy as np
import re
import spacy
import nltk
from nltk import FreqDist
import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline

## Load  Data

In [3]:
data = pd.read_csv("cryptodata_test_lemmatized", sep=",", index_col=0)

In [5]:
data.shape

(500, 6)

In [6]:
data.head()

Unnamed: 0_level_0,body,stopword_reviews,body_as_str,tokens,review_lemmatized,body_lemmatized
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1543622368,CoinSpeakerProBit: Professional Digi...,"['coinspeakerprobit', 'professional'...",CoinSpeakerProBit: Professional Digi...,"['coinspeakerprobit', 'professional'...","['coinspeakerprob', 'professional', ...","['coinspeakerprob', 'professional', ..."
1543620634,Today in crypto heard rumors of Sato...,"['today', 'crypto', 'heard', 'rumors...",Today in crypto heard rumors of Sato...,"['today', 'crypto', 'heard', 'rumors...","['today', 'crypto', 'heard', 'rumor'...","['today', 'crypto', 'heard', 'rumor'..."
1543620300,Crashes in the crypto market resulte...,"['crashes', 'crypto', 'market', 'res...",Crashes in the crypto market resulte...,"['crashes', 'the', 'crypto', 'market...","['crash', 'the', 'crypto', 'market',...","['crash', 'the', 'crypto', 'market',..."
1543620202,SEC Chairman Jay Clayton has claimed...,"['sec', 'chairman', 'jay', 'clayton'...",SEC Chairman Jay Clayton has claimed...,"['sec', 'chairman', 'jay', 'clayton'...","['sec', 'chairman', 'jay', 'clayton'...","['sec', 'chairman', 'jay', 'clayton'..."
1543618813,There are four different Ethereum wo...,"['four', 'different', 'ethereum', 'w...",There are four different Ethereum wo...,"['there', 'are', 'four', 'different'...","['ther', 'are', 'four', 'different',...","['ther', 'are', 'four', 'different',..."


In [7]:
#the lemmantized_reviews are interpreted as a string by the read_csv command. We convert it into a list. 
data['stopword_reviews'] = data['stopword_reviews'].str.replace(r"[\[\]\']", "").str.split(", ")

In [8]:
data['stopword_reviews'] #checking if it's a list

time
1543622368    [coinspeakerprobit, professional, di...
1543620634    [today, crypto, heard, rumors, satos...
1543620300    [crashes, crypto, market, resulted, ...
1543620202    [sec, chairman, jay, clayton, claime...
1543618813    [four, different, ethereum, working,...
1543617903    [nov, 29, sirin, labs, announced, co...
1543616700    [switzerlands, oldest, university, u...
1543616305    [newer, hbus, cryptocurrency, exchan...
1543616240    [met, u, securities, exchange, commi...
1543615852    [latest, ethereum, 1x, meeting, cond...
1543615242    [chinese, cryptocurrency, mining, ha...
1543615229    [blockchain, phones, therage, right,...
1543614900    [check, point, research, cybersecuri...
1543613835    [belarus, following, model, set, for...
1543612078    [casual, gamers, unite, stay, thats,...
1543611643    [new, report, coinshares, bitcoin, m...
1543611641    [one, co, founders, second, largest,...
1543611625    [co, founder, ethereum, vitalik, but...
1543611600    [2017, ea

In [9]:
data.dropna(subset=['stopword_reviews'], inplace=True)

In [10]:
data.shape

(500, 6)

## Remove words

In [11]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/elliot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
import stopwords 

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(lang)? (__init__.py, line 84)

In [13]:
# We list the words we want to get rid of"
# list_stop_words = ["i","me","my","myself","we","our","ours","ourselves","you","your","yours","yourself","yourselves","he","him","his","himself","she","her","hers","herself","it","its","itself","they","them","their","theirs","themselves","what","which","who","whom","this","that","these","those","am","is","are","was","were","be","been","being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if","or","because","as","until","while","of","at","by","for","with","about","against","between","into","through","during","before","after","above","below","to","from","up","down","in","out","on","off","over","under","again","further","then","once","here","there","when","where","why","how","all","any","both","each","few","more","most","other","some","such","no","nor","not","only","own","same","so","than","too","very","s","t","can","will","just","don","should","now"]
list_stop_words = ["&", ";", "2", "get","de", "got", "went","much","well","day","stay","even","could","area","also","uk","us","one","center","really","would","4","great","good","cottage","parc","parcs","park","parks"]

In [14]:
def remove_stop_word(x, list_stop_words=list_stop_words):
    clean_x = [l for l in x if l not in list_stop_words]
    return clean_x

In [15]:
data['stopword_reviews'] = data['stopword_reviews'].apply(remove_stop_word)

In [16]:
data.head()

Unnamed: 0_level_0,body,stopword_reviews,body_as_str,tokens,review_lemmatized,body_lemmatized
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1543622368,CoinSpeakerProBit: Professional Digi...,"[coinspeakerprobit, professional, di...",CoinSpeakerProBit: Professional Digi...,"['coinspeakerprobit', 'professional'...","['coinspeakerprob', 'professional', ...","['coinspeakerprob', 'professional', ..."
1543620634,Today in crypto heard rumors of Sato...,"[today, crypto, heard, rumors, satos...",Today in crypto heard rumors of Sato...,"['today', 'crypto', 'heard', 'rumors...","['today', 'crypto', 'heard', 'rumor'...","['today', 'crypto', 'heard', 'rumor'..."
1543620300,Crashes in the crypto market resulte...,"[crashes, crypto, market, resulted, ...",Crashes in the crypto market resulte...,"['crashes', 'the', 'crypto', 'market...","['crash', 'the', 'crypto', 'market',...","['crash', 'the', 'crypto', 'market',..."
1543620202,SEC Chairman Jay Clayton has claimed...,"[sec, chairman, jay, clayton, claime...",SEC Chairman Jay Clayton has claimed...,"['sec', 'chairman', 'jay', 'clayton'...","['sec', 'chairman', 'jay', 'clayton'...","['sec', 'chairman', 'jay', 'clayton'..."
1543618813,There are four different Ethereum wo...,"[four, different, ethereum, working,...",There are four different Ethereum wo...,"['there', 'are', 'four', 'different'...","['ther', 'are', 'four', 'different',...","['ther', 'are', 'four', 'different',..."


In [17]:
data.shape

(500, 6)

## Building an LDA model

In [18]:
dictionary = corpora.Dictionary(data['stopword_reviews'])

In [19]:
len(dictionary)

4669

In [20]:
doc_term_matrix = [dictionary.doc2bow(rev) for rev in data['stopword_reviews']]

In [23]:
import warnings
warnings.filterwarnings('ignore')

# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
num_topics = 4
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, 
                num_topics=num_topics, 
                alpha=[0.0001] * num_topics, 
                eta=[0.0001] * len(dictionary),
                chunksize=2000,
                passes=4,
                random_state=100,
               )

In [22]:
lda_model.print_topics(num_words=5)

[(0,
  '0.019*"first" + 0.016*"post" + 0.015*"appeared" + 0.013*"ccn" + 0.013*"cryptocurrency"'),
 (1,
  '0.036*"bitcoin" + 0.017*"" + 0.016*"market" + 0.015*"first" + 0.013*"crypto"'),
 (2,
  '0.017*"first" + 0.014*"" + 0.014*"bitcoin" + 0.011*"appeared" + 0.011*"post"'),
 (3,
  '0.055*"$" + 0.025*"price" + 0.020*"bitcoin" + 0.016*"first" + 0.014*"post"'),
 (4,
  '0.023*"bitcoin" + 0.022*"" + 0.017*"first" + 0.015*"8217" + 0.015*"post"'),
 (5,
  '0.013*"platform" + 0.013*"first" + 0.012*"trading" + 0.011*"blockchain" + 0.011*"exchange"')]

## Topic visualization

In [96]:
# Visualize the topics
pd.options.display.max_colwidth = 2000
viz = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary, mds='tsne')

In [97]:
pyLDAvis.enable_notebook()
viz
#pyLDAvis.show(viz)