### Reference: Topic modeling with spaCy & scikit-learn LDA
#### https://www.kaggle.com/thebrownviking20/topic-modelling-with-spacy-and-scikit-learn/notebook

In [40]:
# regular imports
import numpy as np
import pandas as pd
import os
import time
import string
import matplotlib.pyplot as plt

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# imports for scikit-learn & LDA
import sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures

# imports for pyLDAvis (LDA visualization) & pylab plotting helpers
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig

# Plotly based imports for visualization
from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [2]:
# ---- Paths ----
# Working directory and the raw input dataset inside it
rel_path = './'
infile = '050319_acled_all.csv'
# Directory where LDA visualizations are saved as HTML files
lda_vis_path = './lda_vis/'

# ---- Resources ----
# spaCy model (the medium English model, which must already be installed)
nlp = spacy.load('en_core_web_md')
# Raw dataset, read from rel_path/infile
df = pd.read_csv(os.path.join(rel_path, infile))

# ---- Token filters ----
# spaCy's English stop words (as a list) and the ASCII punctuation characters
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

In [5]:
# Look at one raw note (label 3 of the `notes` column)
df.notes[3]

'27 April. Rioters blocked a road in Save to protest the legislative election. Police called for reinforcements to control the situation. Injuries on both sides were reported. [size=no report]'

#### Named Entity Recognition

In [7]:
# Run the loaded spaCy pipeline on one note and render its named entities inline
doc = nlp(df["notes"][5])
spacy.displacy.render(doc, style='ent',jupyter=True)

#### Lemmatization

In [8]:
# Lemmatization: rebuild the note from each token's lemma, re-parse that text,
# and render the entities found in the lemmatized version.
# Note that `doc` is rebound here, so it no longer refers to the original parse.
review = " ".join(token.lemma_ for token in doc)
doc = nlp(review)
spacy.displacy.render(doc, style='ent', jupyter=True)

#### Part-of-Speech Tagging

In [9]:
# Print the coarse part-of-speech tag assigned to every token of the lemmatized note
for token in nlp(review):
    print(token, "=>", token.pos_)

27 => NUM
April => PROPN
. => PUNCT
unknown => ADJ
actor => NOUN
destroy => VERB
election => NOUN
material => NOUN
of => ADP
the => DET
Autonomous => PROPN
National => PROPN
Electoral => PROPN
Commission => PROPN
( => PUNCT
CENA => PROPN
) => PUNCT
in => ADP
Akpadanou => PROPN
ahead => ADV
of => ADP
the => DET
legislative => ADJ
election => NOUN
. => PUNCT


#### Parser function to Tokenize conflict notes

In [16]:
# Parser for conflict notes
parser = English()

# Snapshot of the stop-word list as a frozenset: membership tests become O(1)
# instead of scanning the whole list for every token of every note.
# Re-run this cell if `stopwords` is modified afterwards.
_stopword_set = frozenset(stopwords)


def spacy_tokenizer(note):
    """Normalise one conflict note into a space-separated string of tokens.

    The note is coerced with ``str()`` and tokenized with ``parser``. Each
    token is replaced by its lower-cased, stripped lemma; tokens whose lemma
    is the ``-PRON-`` placeholder keep their lower-cased surface form instead.
    Tokens that are stop words, or that occur inside the ``punctuations``
    string, are dropped.

    Parameters
    ----------
    note : object
        Raw note text; any non-string value is converted with ``str()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    lemmas = (
        tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        for tok in parser(str(note))
    )
    # `punctuations` is a plain string, so `in` is a substring test; that also
    # discards tokens that became empty after stripping.
    kept = [
        lemma for lemma in lemmas
        if lemma not in _stopword_set and lemma not in punctuations
    ]
    return " ".join(kept)

In [15]:
# Show one note before and after tokenization.
# NOTE(review): spacy_tokenizer as defined in this notebook returns a space-joined
# string, but the saved output of this cell shows a Python list. This cell's
# execution count is lower than the definition's, so the output appears to come
# from an earlier version of the function -- re-run to refresh it.
conflict = df["notes"][5]
print('\n------ Raw Note ------\n', conflict)
print('\n---- Parsed Note ----\n', spacy_tokenizer(conflict))


------ Raw Note ------
 27 April. Unknown actors destroyed election materials of the Autonomous National Electoral Commission (CENA) in Akpadanou ahead of the legislative elections.

---- Parsed Note ----
 ['27', 'april', 'unknown', 'actor', 'destroy', 'election', 'material', 'autonomous', 'national', 'electoral', 'commission', 'cena', 'akpadanou', 'ahead', 'legislative', 'election']


In [17]:
# Tokenizing the entire dataset
# tqdm.pandas() enables `progress_apply`, which behaves like `apply` but shows a progress bar.
# The result is stored in a new column, so the raw `notes` column is left untouched.
tqdm.pandas()
df["processed_notes"] = df["notes"].progress_apply(spacy_tokenizer)

100%|██████████| 509157/509157 [03:43<00:00, 2279.10it/s]


### Topic Modeling

In [18]:
# Creating a vectorizer
# token_pattern is a raw string so the regex escape for the hyphen reaches the
# regex engine unchanged and Python no longer warns about an
# "invalid escape sequence". The pattern itself is the same: tokens of at
# least three characters made of ASCII letters and hyphens.
vectorizer = CountVectorizer(
    min_df=5,
    max_df=0.9,
    stop_words='english',
    lowercase=True,
    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
)
data_vectorized = vectorizer.fit_transform(df["processed_notes"])


invalid escape sequence \-


invalid escape sequence \-


invalid escape sequence \-


invalid escape sequence \-



In [19]:
# Number of topics/clusters to create; passed as `n_components` to the LDA models below
NUM_TOPICS = 10

In [20]:
%%time

# Running the LDA model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Wall time: 24min 56s


In [38]:
%%time

# Scratch estimator: only constructs a smaller LDA model (max_iter=2); nothing is
# fitted here, which is why the cell reports a 0 ns wall time.
# `my_lda` is not fitted or used in any other cell shown in this notebook.
my_lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=2, learning_method='online')

Wall time: 0 ns


In [45]:
# Re-fit the vectorizer and rebuild the document-term matrix.
# This is the same call as in the "Creating a vectorizer" cell; it rebinds
# `data_vectorized`, which the later LDA fit and pyLDAvis cells consume.
data_vectorized = vectorizer.fit_transform(df["processed_notes"])

In [51]:
# Confirm the topic count that the next LDA model will be built with
NUM_TOPICS

10

In [52]:
# Seeded online-LDA estimator. This rebinds `lda`, replacing the model built earlier.
lda = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=5,
    learning_method='online',
    learning_offset=50.0,
    random_state=0,
)

In [54]:
%%time
# Fit the seeded LDA model on the document-term matrix (long-running; see the recorded wall time)
lda.fit(data_vectorized)

Wall time: 24min 3s


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [55]:
# Function for printing the top keywords of each topic
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``; each row is treated as
        one topic's per-term weights.
    vectorizer : object
        Fitted vectorizer whose vocabulary order matches the columns of
        ``model.components_``. Must provide ``get_feature_names_out()``
        (newer scikit-learn) or ``get_feature_names()`` (older releases).
    top_n : int, default 10
        Number of terms to print per topic.

    Returns
    -------
    None
        For each topic, a ``Topic <idx>:`` line is printed, followed by a
        list of ``(term, weight)`` tuples in descending weight order.
    """
    # Fetch the vocabulary once. The lookup rebuilds the full feature list,
    # so calling it for every printed term is needlessly expensive.
    if hasattr(vectorizer, "get_feature_names_out"):
        raw_names = vectorizer.get_feature_names_out()
    else:
        raw_names = vectorizer.get_feature_names()
    # Plain `str` keeps the printed tuples identical whichever accessor was used.
    feature_names = [str(name) for name in raw_names]

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice takes the top_n largest weights.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

In [56]:
# Keywords for topics clustered by Latent Dirichlet Allocation.
# Each keyword is printed together with its weight taken from `lda.components_`.
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('bomb', 20168.683397616383), ('area', 14531.997497493341), ('house', 13101.151029355993), ('road', 12364.8489672896), ('injure', 11830.987120257585), ('civilian', 11539.908587226944), ('hit', 10600.476925249102), ('kill', 10197.848091250584), ('vehicle', 10168.767103843222), ('july', 9973.211162890988)]
Topic 1:
[('force', 20587.2032216591), ('report', 13423.041634953355), ('pro', 12840.003598229767), ('reportedly', 12084.925447286527), ('area', 11605.393663322573), ('casualty', 11008.828682537158), ('border', 10560.292201913864), ('district', 9681.694719133086), ('houthi', 8178.195460529578), ('governorate', 7035.943598418387)]
Topic 2:
[('unknown', 11306.268735330885), ('weapon', 8494.757551003835), ('explosion', 7340.348712778293), ('near', 5924.709303393687), ('total', 5514.758487750639), ('involve', 5138.005005686441), ('saturday', 4673.23109813958), ('tfg', 4424.344366672602), ('observe', 3538.550251412824), ('israeli', 2987.94235392474)]
Topic 3:
[('rebel',

In [57]:
%%time
# Enable inline pyLDAvis rendering, build the interactive dashboard for the fitted
# model (with mds='tsne'), and display it as the cell's result.
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

In [60]:
# Display the vectorizer's configuration (its repr)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='[a-zA-Z\\-][a-zA-Z\\-]{2,}',
        tokenizer=None, vocabulary=None)

In [None]:
# Preview the first five raw notes
df.notes.head(5)

In [64]:
# Check what kind of object pyLDAvis' prepare() returned
type(dash)

pyLDAvis._prepare.PreparedData

#### Save the Topic Modeling Visualization created as an HTML file

In [77]:
# Output location for the HTML export.
# `lda_vis_path` repeats the value already set in the configuration cell near the top.
lda_vis_path = './lda_vis/'
out_vis_file = 'full_data_10_topics.html'

In [78]:
# Make sure the output folder exists first; saving into a missing directory fails.
os.makedirs(lda_vis_path, exist_ok=True)
# Write the prepared dashboard to a standalone HTML file
pyLDAvis.save_html(dash, fileobj=os.path.join(lda_vis_path, out_vis_file))

In [90]:
# Check the container type of the document-term matrix
type(data_vectorized)

scipy.sparse.csr.csr_matrix

In [94]:
%%time

# Run the fitted LDA model over the full document-term matrix (timed); result kept in `check`
check = lda.transform(data_vectorized)

Wall time: 1min 22s


In [105]:
# Topic weights for a single document (row 5 of the matrix)
lda.transform(data_vectorized[5])

array([[0.07986111, 0.00714286, 0.00714732, 0.16573761, 0.63212936,
        0.00714328, 0.00714301, 0.00714286, 0.0794094 , 0.00714319]])

In [106]:
# Check the type of the processed-notes column
type(df["processed_notes"])

pandas.core.series.Series

In [112]:
# Type of the first row of the transform result for document 5
type(lda.transform(data_vectorized[5])[0])

numpy.ndarray

In [None]:
# Build the pyLDAvis dashboard from only the first 1000 rows of the matrix
pyLDAvis.sklearn.prepare(lda, data_vectorized[:1000], vectorizer, mds='tsne')