### Reference: Topic modeling with spaCy & scikit-learn LDA
#### https://www.kaggle.com/thebrownviking20/topic-modelling-with-spacy-and-scikit-learn/notebook

In [37]:
# regular imports
import numpy as np
import pandas as pd
import os
import time
import string
import matplotlib.pyplot as plt

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# imports for scikit-learn & LDA
import sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures

# imports for scikit-learn & LDA
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig

# Plotly based imports for visualization
from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [2]:
# Load the spacy model installed (using the medium model)
nlp = spacy.load('en_core_web_md')

# Define the working directory & raw input datasets
rel_path = './'
infile = '050319_acled_all.csv'

# Read in the raw file
df = pd.read_csv(os.path.join(rel_path, infile))

# Define the punctuations & stop words
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

In [5]:
df.notes[3]

'27 April. Rioters blocked a road in Save to protest the legislative election. Police called for reinforcements to control the situation. Injuries on both sides were reported. [size=no report]'

#### Named Entity Recognition

In [7]:
doc = nlp(df["notes"][5])
spacy.displacy.render(doc, style='ent',jupyter=True)

#### Lemmatization

In [8]:
# Lemmatization
review = str(" ".join([i.lemma_ for i in doc]))
doc = nlp(review)
spacy.displacy.render(doc, style='ent',jupyter=True)

#### Part-of-Speech Tagging

In [9]:
for i in nlp(review):
    print(i,"=>",i.pos_)

27 => NUM
April => PROPN
. => PUNCT
unknown => ADJ
actor => NOUN
destroy => VERB
election => NOUN
material => NOUN
of => ADP
the => DET
Autonomous => PROPN
National => PROPN
Electoral => PROPN
Commission => PROPN
( => PUNCT
CENA => PROPN
) => PUNCT
in => ADP
Akpadanou => PROPN
ahead => ADV
of => ADP
the => DET
legislative => ADJ
election => NOUN
. => PUNCT


#### Parser function to Tokenize conflict notes

In [16]:
# Parser for conlfict notes
parser = English()
def spacy_tokenizer(note):
    mytokens = parser(str(note))
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [15]:
conflict = df["notes"][5]
print('\n------ Raw Note ------\n', conflict)
print('\n---- Parsed Note ----\n', spacy_tokenizer(conflict))


------ Raw Note ------
 27 April. Unknown actors destroyed election materials of the Autonomous National Electoral Commission (CENA) in Akpadanou ahead of the legislative elections.

---- Parsed Note ----
 ['27', 'april', 'unknown', 'actor', 'destroy', 'election', 'material', 'autonomous', 'national', 'electoral', 'commission', 'cena', 'akpadanou', 'ahead', 'legislative', 'election']


In [17]:
# Tokenizing the entire dataset
tqdm.pandas()
df["processed_notes"] = df["notes"].progress_apply(spacy_tokenizer)

100%|██████████| 509157/509157 [03:43<00:00, 2279.10it/s]


### Topic Modeling

In [18]:
# Creating a vectorizer
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(df["processed_notes"])


invalid escape sequence \-


invalid escape sequence \-


invalid escape sequence \-


invalid escape sequence \-



In [19]:
# Number of Topics/Clusters to create
NUM_TOPICS = 10

In [20]:
%%time

# Running the LDA model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Wall time: 24min 56s


In [38]:
%%time

# Running the LDA model
my_lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=2, learning_method='online')

Wall time: 0 ns


In [39]:
# data_lda = lda.fit_transform(data_vectorized)
type(my_lda)

NameError: name 'my_lda' is not defined

In [26]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

In [25]:
data_vectorized

NameError: name 'lda' is not defined

In [27]:
# Keywords for topics clustered by Latent Dirichlet Allocation
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:


NameError: name 'lda' is not defined

In [None]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

In [None]:
df.head(5)

In [None]:
df.notes.head(5)

In [None]:
# Load the spacy model installed
nlp = spacy.load('en_core_web_md')

In [None]:
# process a sentence using the model
doc = nlp("This is some text that I am processing with Spacy")

In [None]:
check = doc[0].vector / len(doc)
for word in doc:
    print('\n', word, len(word.vector), '\n-----------------\n', word.vector)
    check = check + word.vector / len(doc)

check = check - doc[0].vector / len(doc)
print('\n', doc, len(doc.vector), '\n-----------------\n', doc.vector)
print('\n', doc, len(check), '\n-----------------\n', check)

In [None]:
df_acled = pd.read_csv('./050319_acled_all.csv')