## Topic Modeling of Social Media Data regarding the 2019-nCoV Pandemic.
#### Data source: Twitter
#### Last data update: 01/29/2020 - 13:24

In [1]:
import json
import gensim
import nltk
import pandas as pd
import pickle
import pyLDAvis.gensim
import random
import spacy
from gensim import corpora
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
# NOTE(review): the pipeline returned by this call is discarded; tokenization
# further down uses a separate English() object. Presumably kept as an early
# check that the model is installed -- confirm, or drop the call.
spacy.load('en_core_web_sm')
from spacy.lang.en import English
# NLTK resources needed later: 'wordnet' for the lemma helpers and
# 'stopwords' for the English stop-word set.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/renziver/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/renziver/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the collected records and keep only each record's 'text' field.
# The encoding is given explicitly: JSON is UTF-8, while open() otherwise
# falls back to the platform default, which can fail on non-ASCII text.
with open('datasets/data.json', encoding='utf-8') as f:
    data = json.load(f)
data_list = [item['text'] for item in data]
data_series = pd.Series(data_list)
# Preview the first few texts.
data_series.head(5)

0    @lookner I taught a class in college on the co...
1                   this corona virus is scary as hell
2                     new corona virus is pretty scary
3    If Corona has a Virus?  I'm a dead man..  pic....
4    Corona Beer claims a conspiracy is behind the ...
dtype: object

### Data preprocessing

#### Text cleaning

In [3]:
# spaCy English language object; it is only called on raw text to obtain tokens.
parser = English()


def tokenize(text):
    """Split ``text`` into lowercase tokens.

    Whitespace-only tokens, URL-like tokens and tokens whose text starts
    with '@' are dropped; every other token is returned in lowercased form,
    in its original order.
    """
    def _keep(tok):
        surface = tok.orth_
        if surface.isspace():
            return False
        if tok.like_url:
            return False
        return not surface.startswith('@')

    return [tok.lower_ for tok in parser(text) if _keep(tok)]

def get_lemma(word):
    """Return WordNet's ``morphy`` form of ``word``, or ``word`` unchanged
    when ``morphy`` returns None."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

def get_lemma2(word):
    """Lemmatize ``word`` with a newly constructed ``WordNetLemmatizer``.

    Alternative to ``get_lemma``; nothing in the visible notebook calls it.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

# NLTK's English stop words, held in a set for membership tests.
en_stop = {*nltk.corpus.stopwords.words('english')}

In [4]:
def prepare_text_for_lda(text):
    """Convert one raw text into the token list fed to LDA.

    Tokens come from ``tokenize``. Tokens of four characters or fewer and
    tokens found in ``en_stop`` are discarded; the survivors are mapped
    through ``get_lemma``.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in en_stop
    ]

In [5]:
# Build the training documents from a random sample of the texts: a text is
# kept only when its draw exceeds SAMPLE_THRESHOLD (roughly 1% of the input).
# A dedicated, seeded generator makes the sample reproducible after a kernel
# restart; the unseeded global RNG would give a different corpus every run.
SAMPLE_SEED = 42
SAMPLE_THRESHOLD = .99

sample_rng = random.Random(SAMPLE_SEED)
text_data = []

for line in data_list:
    # Draw first so that discarded texts are never tokenized.
    if sample_rng.random() > SAMPLE_THRESHOLD:
        text_data.append(prepare_text_for_lda(line))

#### LDA Training

In [6]:
# Map tokens to ids and convert every document to its bag-of-words form.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
# Persist both artifacts for the visualization step. The context manager
# closes the pickle file deterministically instead of leaking the handle.
with open('artifacts/corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('artifacts/dictionary.gensim')

In [7]:
NUM_TOPICS = 4
# Fixed seed so that retraining on the same corpus yields the same topics.
LDA_RANDOM_STATE = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=10,
    decay=0.6,
    random_state=LDA_RANDOM_STATE,
)
ldamodel.save('artifacts/model.gensim')

# Show the five highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.087*"virus" + 0.071*"corona" + 0.017*"china" + 0.013*"people" + 0.013*"world"')
(1, '0.048*"virus" + 0.042*"corona" + 0.012*"situation" + 0.012*"survive" + 0.012*"surprise"')
(2, '0.130*"corona" + 0.125*"virus" + 0.031*"china" + 0.013*"first" + 0.010*"chinese"')
(3, '0.063*"corona" + 0.052*"virus" + 0.039*"china" + 0.012*"everyone" + 0.007*"wuhan"')


#### LDA visualization

In [8]:
# Reload the persisted artifacts so this cell does not depend on the
# in-memory training state.
dictionary = gensim.corpora.Dictionary.load('artifacts/dictionary.gensim')
# NOTE: unpickling executes arbitrary code; this is acceptable only because
# the file is written by this notebook itself. The context manager closes
# the file handle instead of leaking it.
with open('artifacts/corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('artifacts/model.gensim')
# sort_topics=False is passed through unchanged from the original call.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [9]:
# Export the prepared visualization to an HTML file.
html_output_path = 'ncov-topic-model.html'
pyLDAvis.save_html(lda_display, html_output_path)