# Bitcoin Sentiment Analysis - News Article Topic Modeling

#### Key Resource - Topic Modeling with Gensim

https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

### Import Libraries

In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from itertools import product

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

import pickle

import warnings
warnings.filterwarnings("ignore")

# Seed passed as `random_state` to every LdaModel fit below so results can be reproduced
rs = 13

### Import Data

In [2]:
# Load the combined and cleaned articles dataset from its pickle
# (its 'body' column is the text that gets tokenized below)
with open('sentiment_pickles/pickle_articles_modeling.pickle', 'rb') as read_file:
    bitcoin_articles = pickle.load(read_file)

In [3]:
# English stop-word list from NLTK; remove_stopwords() reads this module-level name
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded first — confirm
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

## Reshape Data for Latent Dirichlet Allocation (LDA)

In [4]:
def article_to_words(articles):
    """Lazily tokenize articles, yielding one list of word tokens per article.

    Each article is coerced to ``str`` and run through gensim's
    ``simple_preprocess``.  ``deacc=True`` strips accent marks from the
    tokens (punctuation is already discarded by the tokenizer itself).
    """
    for text in articles:
        tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
        yield tokens

def remove_stopwords(articles):
    """Return every document as a list of tokens with stop words removed.

    Each document is stringified and re-tokenized with ``simple_preprocess``
    before filtering, so the result is always a list of token lists.  When a
    document is already a token list, ``str(doc)`` is that list's repr and the
    tokenizer pulls the words back out of it.
    NOTE(review): tokens outside simple_preprocess's length limits would be
    dropped by this second pass — confirm that is acceptable.

    Reads the module-level ``stop_words`` list.
    """
    # Build the lookup once: set membership is O(1), whereas testing each
    # token against the stop-word list scans the whole list every time.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in articles
    ]

def make_bigrams(articles):
    """Run every document through the module-level ``bigram_mod`` phrase model."""
    phrased = []
    for tokens in articles:
        phrased.append(bigram_mod[tokens])
    return phrased

def make_trigrams(articles):
    """Run every document through ``bigram_mod`` and then ``trigram_mod``.

    Both phrase models are module-level names; the bigram model is applied
    first and its output is fed to the trigram model.
    """
    phrased = []
    for tokens in articles:
        with_bigrams = bigram_mod[tokens]
        phrased.append(trigram_mod[with_bigrams])
    return phrased

def lemmatization(articles, allowed_pos_tags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        articles: iterable of documents, each an iterable of string tokens.
        allowed_pos_tags: collection of spaCy coarse POS tags (``token.pos_``)
            to keep.  The default is an immutable tuple so it cannot be
            mutated across calls; any container supporting ``in`` works.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.

    Relies on the module-level spaCy pipeline ``nlp``.
    """
    lemmatized = []
    for tokens in articles:
        # Re-join the tokens so spaCy can tag the document as running text.
        parsed = nlp(" ".join(tokens))
        lemmatized.append([token.lemma_ for token in parsed if token.pos_ in allowed_pos_tags])
    return lemmatized

In [5]:
# Normalize whitespace in the article bodies: newlines and non-breaking
# spaces (\xa0) both become plain spaces
bitcoin_articles['body'] = (
    bitcoin_articles['body']
    .str.replace('\n', ' ')
    .str.replace('\xa0', ' ')
)

# Tokenize every article body into a list of words
data = list(bitcoin_articles['body'])
data_words = list(article_to_words(data))

In [6]:
# Build the bigram and trigram models
# `bigram` is trained on the raw token lists; `trigram` is trained on the
# bigram-transformed corpus so it can extend the detected pairs.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Wrap the trained models in Phraser objects for speed; these are the objects
# that make_bigrams() / make_trigrams() apply to each document
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [7]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Form Trigrams
# NOTE(review): make_trigrams() applies bigram_mod again before trigram_mod, so the
# already-bigrammed documents pass through the bigram model a second time here.
data_words_trigrams = make_trigrams(data_words_bigrams)

# Initialize spacy 'en_core_web_sm' model
# The parser and NER components are disabled; lemmatization() only reads lemma_ and pos_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [8]:
# Lemmatize every document and keep only nouns, adjectives, verbs, and adverbs
data_lemmatized = lemmatization(data_words_trigrams, allowed_pos_tags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [9]:
# Create dictionary (token <-> integer id mapping built from the lemmatized documents)
id2word = corpora.Dictionary(data_lemmatized)

# Create term document frequency Matrix (corpus): one bag-of-words per document
# `texts` is also read as a module-level name by lda_grid_search() for the coherence score
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

In [10]:
# Pickle the term document frequency matrix (corpus) as built from the
# unfiltered dictionary (the vocabulary is filtered further below)
with open('sentiment_pickles/pickle_corpus_pre_PCA.pickle', 'wb') as to_write:
    pickle.dump(corpus, to_write)

In [11]:
# Load the term document frequency matrix (corpus) back from its pickle
# NOTE(review): only `corpus` is restored here; `id2word` and `texts`, which the
# following cells use, still have to come from the cells above.
with open('sentiment_pickles/pickle_corpus_pre_PCA.pickle', 'rb') as read_file:
    corpus = pickle.load(read_file)

## Filter Most Frequent and Infrequent Words

In [10]:
# Build a table of document frequencies: for every word in the dictionary,
# the number (and share) of articles in which it appears
word_index = list(id2word.dfs)
word = [id2word[token_id] for token_id in word_index]
num_articles = [id2word.dfs[token_id] for token_id in word_index]
total_articles = len(bitcoin_articles)

word_doc_count = (
    pd.DataFrame({'word_index' : word_index, 'word' : word, 'num_articles' : num_articles})
    .assign(perc_articles=lambda frame: frame['num_articles'] / total_articles)
    .sort_values('num_articles', ascending=False)
)

In [11]:
# Words that appear in the most articles
word_doc_count.head(10)

Unnamed: 0,word_index,word,num_articles,perc_articles
7,13,bitcoin,18820,0.995925
38,1,also,13681,0.723977
182,213,time,12870,0.68106
1823,1806,btc,12567,0.665026
1706,1691,cryptocurrency,11761,0.622374
282,273,market,11631,0.615495
125,196,say,11077,0.586178
5,55,new,10741,0.568397
707,1250,price,10561,0.558872
270,308,year,10450,0.552998


In [12]:
# Words that appear in the fewest articles
word_doc_count.tail(10)

Unnamed: 0,word_index,word,num_articles,perc_articles
40956,40956,mcawesome,1,5.3e-05
40955,40957,pickard,1,5.3e-05
40954,40954,pointlessness,1,5.3e-05
40953,40953,kilic,1,5.3e-05
40949,40949,smallfuck,1,5.3e-05
40948,40947,is_stalemate,1,5.3e-05
40947,40950,white_has_won,1,5.3e-05
40946,40945,apply_move,1,5.3e-05
40943,40941,lovelace,1,5.3e-05
65250,65250,justicemate,1,5.3e-05


In [13]:
# Vocabulary size before filtering
len(id2word)

65251

In [14]:
# Filter out words that appear in more than 60% of articles or in fewer than 50 articles.
# filter_extremes changes id2word in place, which is why the corpus is rebuilt afterwards.
id2word.filter_extremes(no_above=0.60, no_below=50)

In [15]:
# Vocabulary size after filtering
len(id2word)

5841

In [16]:
# Rebuild the corpus with the filtered dictionary so the bag-of-words
# representation only refers to the words that were kept
corpus = [id2word.doc2bow(text) for text in texts]

## Gensim LDA Parameter Tuning

In [43]:
def lda_grid_search(corpus, id2word, rs, num_topics, decay, per_word_topics, update_every, alpha, chunksize, passes, iterations):
    """
    Given a corpus and dictionary, run through all combinations of given Gensim LDA parameters.
    Return a dataframe with the results, sorted by coherence.
    This can take hours to run depending on the number of parameters supplied.

    Args:
        corpus: bag-of-words corpus passed to ``LdaModel`` and ``log_perplexity``.
        id2word: gensim dictionary matching ``corpus``.
        rs: value passed as ``random_state`` to every model.
        num_topics, decay, per_word_topics, update_every, alpha, chunksize,
        passes, iterations: iterables of candidate values; one model is fitted
            for every element of their Cartesian product.

    Returns:
        pandas.DataFrame with one row per completed combination and the columns
        ``coherence`` (c_v), ``perplexity`` (``log_perplexity`` of ``corpus``)
        followed by the eight parameter values.  Rows are sorted by coherence,
        best first; the index keeps the order in which combinations were run.

    Pressing Ctrl-C (``KeyboardInterrupt``) stops the search and returns the
    combinations completed so far.  Any other exception propagates to the
    caller instead of being silently discarded.

    NOTE: the coherence score is computed against the module-level ``texts``
    (the tokenized documents), which is not a parameter of this function.
    """
    columns = ['coherence', 'perplexity', 'num_topics', 'decay', 'per_word_topics',
               'update_every', 'alpha', 'chunksize', 'passes', 'iterations']
    rows = []

    try:
        for combo in product(num_topics, decay, per_word_topics, update_every,
                             alpha, chunksize, passes, iterations):
            (n_topics, decay_value, word_topics, update_interval,
             alpha_value, chunk, n_passes, n_iterations) = combo

            model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    num_topics=n_topics,
                                                    id2word=id2word,
                                                    decay=decay_value,
                                                    per_word_topics=word_topics,
                                                    update_every=update_interval,
                                                    alpha=alpha_value,
                                                    chunksize=chunk,
                                                    passes=n_passes,
                                                    iterations=n_iterations,
                                                    random_state=rs)

            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
            coherence = coherencemodel.get_coherence()
            perplexity = model.log_perplexity(corpus)

            # Record the whole row in a single append so an interrupt can never
            # leave a combination half-recorded (columns of unequal length).
            rows.append((coherence, perplexity) + combo)

    except KeyboardInterrupt:
        # Deliberate best-effort: keep whatever finished before the interrupt.
        pass

    lda_parameter_df = pd.DataFrame(rows, columns=columns)

    # sort_values returns a new frame; return it so the result really is sorted.
    return lda_parameter_df.sort_values('coherence', ascending=False)

#### Parameter Tuning - Round 1

In [None]:
# Set up lists of parameters
# Round 1 is a broad sweep: 5 x 3 x 2 x 3 x 3 x 1 x 1 x 1 = 270 combinations
num_topics = [20, 25, 30, 35, 40]
decay = [0.5, 0.75, 1.0]
per_word_topics = [True, False]
update_every = [0, 1, 2]
alpha = ['auto', 'symmetric', 'asymmetric']
chunksize = [1000]
passes = [1]
iterations = [100]

In [None]:
# Get performance df (fits one LDA model per parameter combination)
lda_parameter_df = lda_grid_search(corpus, id2word, rs, num_topics, decay, per_word_topics, 
                                   update_every, alpha, chunksize, passes, iterations)

In [115]:
# Display the 20 best tested combos, highest coherence first
lda_parameter_df.sort_values('coherence', ascending=False).head(20)

Unnamed: 0,coherence,perplexity,num_topics,decay,per_word_topics,update_every,alpha,chunksize,passes,iterations
59,0.517308,-7.485459,25,0.5,True,1,asymmetric,1000,1,100
68,0.517308,-7.485459,25,0.5,False,1,asymmetric,1000,1,100
14,0.505328,-7.445682,20,0.5,False,1,asymmetric,1000,1,100
5,0.505328,-7.445682,20,0.5,True,1,asymmetric,1000,1,100
57,0.504658,-7.480384,25,0.5,True,1,auto,1000,1,100
58,0.504658,-7.484003,25,0.5,True,1,symmetric,1000,1,100
66,0.504658,-7.480384,25,0.5,False,1,auto,1000,1,100
67,0.504658,-7.484003,25,0.5,False,1,symmetric,1000,1,100
165,0.501112,-7.582662,35,0.5,True,1,auto,1000,1,100
174,0.501112,-7.582662,35,0.5,False,1,auto,1000,1,100


#### Parameter Tuning - Round 2

In [116]:
# Set up lists of parameters
# Round 2 narrows in on ~25 topics and varies the chunk size: 3 x 3 x 3 = 27 combinations
num_topics = [23, 25, 27]
decay = [0.5]
per_word_topics = [True]
update_every = [1]
alpha = ['auto', 'symmetric', 'asymmetric']
chunksize = [1000, 500, 100]
passes = [1]
iterations = [100]

In [None]:
# Get the performance df for round 2 (fits one LDA model per parameter combination)
lda_parameter_df2 = lda_grid_search(corpus, id2word, rs, num_topics, decay, per_word_topics, 
                                   update_every, alpha, chunksize, passes, iterations)

In [119]:
# Display the 20 best tested combos from round 2, highest coherence first
lda_parameter_df2.sort_values('coherence', ascending=False).head(20)

Unnamed: 0,coherence,perplexity,num_topics,decay,per_word_topics,update_every,alpha,chunksize,passes,iterations
15,0.517308,-7.485459,25,0.5,True,1,asymmetric,1000,1,100
6,0.511666,-7.467108,23,0.5,True,1,asymmetric,1000,1,100
0,0.510463,-7.462797,23,0.5,True,1,auto,1000,1,100
3,0.509711,-7.465809,23,0.5,True,1,symmetric,1000,1,100
12,0.504658,-7.484003,25,0.5,True,1,symmetric,1000,1,100
9,0.504658,-7.480384,25,0.5,True,1,auto,1000,1,100
4,0.494275,-7.919394,23,0.5,True,1,symmetric,500,1,100
1,0.494078,-7.912356,23,0.5,True,1,auto,500,1,100
19,0.492529,-8.045482,27,0.5,True,1,auto,500,1,100
25,0.490836,-8.061535,27,0.5,True,1,asymmetric,500,1,100


#### Parameter Tuning - Round 3

In [120]:
# Set up the lists of parameters
# Round 3 fixes the model shape and tunes training effort: 3 x 2 x 2 = 12 combinations
num_topics = [25]
decay = [0.5]
per_word_topics = [True]
update_every = [1]
alpha = ['asymmetric']
chunksize = [1000, 1500, 2000]
passes = [1, 3]
iterations = [100, 200]

In [None]:
# Get the performance df for round 3 (fits one LDA model per parameter combination)
lda_parameter_df3 = lda_grid_search(corpus, id2word, rs, num_topics, decay, per_word_topics, 
                                   update_every, alpha, chunksize, passes, iterations)

In [123]:
# Display the performance results for all round 3 combos, highest coherence first
lda_parameter_df3.sort_values('coherence', ascending=False)

Unnamed: 0,coherence,perplexity,num_topics,decay,per_word_topics,update_every,alpha,chunksize,passes,iterations
7,0.527104,-7.339544,25,0.5,True,1,asymmetric,1500,3,200
3,0.525307,-7.46115,25,0.5,True,1,asymmetric,1000,3,200
2,0.519405,-7.445168,25,0.5,True,1,asymmetric,1000,3,100
0,0.517308,-7.485459,25,0.5,True,1,asymmetric,1000,1,100
11,0.514528,-7.280307,25,0.5,True,1,asymmetric,2000,3,200
10,0.512899,-7.269218,25,0.5,True,1,asymmetric,2000,3,100
6,0.511997,-7.330817,25,0.5,True,1,asymmetric,1500,3,100
1,0.503808,-7.500432,25,0.5,True,1,asymmetric,1000,1,200
5,0.496147,-7.388813,25,0.5,True,1,asymmetric,1500,1,200
4,0.494058,-7.378056,25,0.5,True,1,asymmetric,1500,1,100


## Model Selection

In [17]:
# Select the model with the best combination of coherence and perplexity
# (25 topics, asymmetric alpha, chunksize 1500 and 200 iterations: the top row of round 3)
# NOTE(review): passes=10 here, whereas the grid search only tried passes of 1 and 3.
optimal_model = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                                num_topics=25, 
                                                id2word=id2word,
                                                decay=0.5,
                                                per_word_topics=True,
                                                update_every=1,
                                                alpha='asymmetric',
                                                chunksize=1500,
                                                passes=10,
                                                iterations=200, 
                                                random_state=rs)

# c_v coherence of the final model, scored against the lemmatized documents
coherencemodel = CoherenceModel(model=optimal_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
print('Coherence: ',coherencemodel.get_coherence())

Coherence:  0.523430550686085


In [26]:
# Print the topics from the model with top 10 keywords
model_topics = optimal_model.show_topics(formatted=False)
# num_topics=100 is larger than the model's 25 topics — presumably so that every topic is printed
pprint(optimal_model.print_topics(num_words=10, num_topics=100))

[(0,
  '0.016*"blockchain" + 0.014*"technology" + 0.013*"world" + 0.010*"people" + '
  '0.009*"new" + 0.009*"community" + 0.008*"work" + 0.008*"help" + '
  '0.008*"first" + 0.007*"project"'),
 (1,
  '0.041*"network" + 0.024*"blockchain" + 0.020*"lightning" + '
  '0.017*"transaction" + 0.015*"ethereum" + 0.014*"project" + 0.014*"use" + '
  '0.014*"protocol" + 0.012*"token" + 0.012*"developer"'),
 (2,
  '0.036*"claim" + 0.034*"court" + 0.022*"case" + 0.020*"file" + 0.019*"legal" '
  '+ 0.019*"order" + 0.019*"document" + 0.011*"lawsuit" + 0.010*"allege" + '
  '0.010*"sale"'),
 (3,
  '0.035*"attack" + 0.031*"hack" + 0.031*"steal" + 0.026*"security" + '
  '0.024*"scam" + 0.024*"fund" + 0.021*"hacker" + 0.021*"report" + '
  '0.018*"account" + 0.015*"user"'),
 (4,
  '0.036*"company" + 0.031*"crypto" + 0.022*"service" + 0.016*"new" + '
  '0.014*"bank" + 0.013*"announce" + 0.013*"platform" + 0.013*"launch" + '
  '0.012*"customer" + 0.011*"exchange"'),
 (5,
  '0.023*"country" + 0.014*"government

In [21]:
# Pickle the fitted LDA topic model
with open('sentiment_pickles/pickle_lda_model.pickle', 'wb') as to_write:
    pickle.dump(optimal_model, to_write)

In [22]:
# Load the optimal topic model from its pickle (if re-starting here)
with open('sentiment_pickles/pickle_lda_model.pickle', 'rb') as read_file:
    optimal_model = pickle.load(read_file)

## Store Topic Breakdowns by Article

In [27]:
# Build a DataFrame of topic distributions: one row per article, one column per topic.
# minimum_probability=0.0 asks for every topic's probability, not only the dominant ones.
topics_by_doc = pd.DataFrame(
    gensim.matutils.corpus2csc(
        optimal_model.get_document_topics(corpus, minimum_probability=0.0)
    ).T.toarray()
)

In [28]:
# Preview the per-article topic probabilities
topics_by_doc.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.000919,0.000765,0.01429,0.000574,0.335196,0.000459,0.000417,0.000383,0.000353,0.000328,...,0.00023,0.000219,0.088789,0.0002,0.52536,0.000184,0.000177,0.00017,0.029469,0.000158
1,0.470946,0.016316,0.000498,0.000435,0.000387,0.000348,0.000317,0.00029,0.000268,0.000249,...,0.020382,0.000166,0.000158,0.000151,0.000145,0.000139,0.000134,0.000129,0.000124,0.00012
2,0.060696,0.000577,0.061421,0.000433,0.176254,0.000346,0.000315,0.000288,0.000266,0.440862,...,0.000173,0.000165,0.000157,0.00015,0.185862,0.070506,0.000133,0.000128,0.000124,0.000119
3,0.072194,0.138473,0.000272,0.030261,0.096304,0.00019,0.000173,0.000159,0.000146,0.000136,...,0.016993,9.1e-05,8.6e-05,8.3e-05,7.9e-05,7.6e-05,7.3e-05,7e-05,0.096651,6.6e-05
4,0.15225,0.000837,0.056455,0.000628,0.609697,0.000502,0.000457,0.000419,0.000386,0.000359,...,0.000251,0.000239,0.000228,0.000218,0.075541,0.000201,0.000193,0.000186,0.000179,0.000173


In [21]:
# Pickle the per-article topic breakdown (the topics_by_doc DataFrame)
with open('sentiment_pickles/pickle_topics_by_article.pickle', 'wb') as to_write:
    pickle.dump(topics_by_doc, to_write)

## Visualize Topic Modeling and Determine Topic Labels

In [29]:
# Visualize the topics
# enable_notebook() turns on inline display; prepare() builds the pyLDAvis data
# from the fitted model, the corpus and the dictionary
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(optimal_model, corpus, id2word)

In [23]:
# Render the prepared visualization to an HTML string
html_string = pyLDAvis.prepared_data_to_html(vis)

In [24]:
# Pickle the pyLDAvis HTML string for presentation elsewhere
with open('sentiment_pickles/pickle_pyLDAvis.pickle', 'wb') as to_write:
    pickle.dump(html_string, to_write)

In [32]:
# Label the pyLDAvis topics
# Keys are 'topic_1' .. 'topic_25', i.e. 1-based topic numbers.
# NOTE(review): these presumably follow the numbering shown in the pyLDAvis view, which
# may not match the 0-based gensim topic ids / topics_by_doc columns — confirm before joining.
topic_labels = {'topic_1' : 'Famous Investors',
                'topic_2' : 'Bull Run',
                'topic_3' : 'Bitcoin as a Commodity',
                'topic_4' : 'Education',
                'topic_5' : 'Global Adoption',
                'topic_6' : 'Crypto Founders',
                'topic_7' : 'Price Analysis',
                'topic_8' : 'Bitcoin as a Currency',
                'topic_9' : 'Security',
                'topic_10' : 'Blockchain Innovation',
                'topic_11' : 'Government Regulation',
                'topic_12' : 'Crime',
                'topic_13' : 'Peer-to-Peer',
                'topic_14' : 'Institutional Investing',  # typo fixed (was 'Institutitonal')
                'topic_15' : 'Price Movement',
                'topic_16' : 'Mining',
                'topic_17' : 'Privacy',
                'topic_18' : 'Bitcoin ETF',
                'topic_19' : 'Bitcoin Improvement Proposals (BIP)',
                'topic_20' : 'Transaction Fees',
                'topic_21' : 'Pay with Bitcoin',
                'topic_22' : 'Bitcoin Cash (BCH)',
                'topic_23' : 'Lawsuits',
                'topic_24' : 'Hacks',
                'topic_25' : 'Community (Games, Charity)',
               }

In [33]:
# Export the topic labels as a two-column table (Topic, Label) for use in TokenSense
label_df = pd.DataFrame(list(topic_labels.items()), columns=['Topic', 'Label'])

# 'topic_7' -> 'Topic 7' for display
label_df['Topic'] = label_df['Topic'].str.replace('topic_', 'Topic ')

label_df.to_csv('sentiment_pickles/topic_labels.csv', index=False)