### Loading relevant libraries

In [1]:
# regular imports
import os
import numpy as np
import pandas as pd
import time
import string
import matplotlib.pyplot as plt

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# imports for scikit-learn & LDA
import sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures

# imports for LDA visualization (pyLDAvis) & pylab plotting helpers
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig

# Plotly based imports for visualization
from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Define the punctuations & stop words
PUNCTUATIONS = string.punctuation
STOPWORDS = list(STOP_WORDS)

# Load the spacy model installed (using the medium model)
NLP = spacy.load('en_core_web_md')

  from collections import Callable



Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working


Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working



In [2]:
# Define the working directory & raw input datasets
REL_PATH = './'
INFILE = '050319_acled_all.csv'

# Define the directory for saving LDA visualizations as HTML files
LDA_VIS_PATH = './lda_vis/'

# Read in the raw file
df = pd.read_csv(os.path.join(REL_PATH, INFILE))

### Pre-processing

In [3]:
# Parser & Tokenizer function for conflict notes
parser = English()
def spacy_tokenizer(note):
    """Normalize one note into a single space-separated string of tokens.

    The note is coerced to ``str`` and run through the module-level ``parser``.
    Each token is replaced by its lower-cased, stripped lemma, except when the
    lemma is the ``"-PRON-"`` placeholder, in which case the token's own
    lower-cased text is used instead.

    A token is discarded when it appears in ``STOPWORDS`` or when it is a
    substring of ``PUNCTUATIONS``. ``PUNCTUATIONS`` is a plain string, so this is
    a substring test: it also discards empty strings and multi-character runs
    of punctuation that occur contiguously in that string.

    Returns:
        str: the surviving tokens joined by single spaces.
    """
    kept = []
    for token in parser(str(note)):
        if token.lemma_ != "-PRON-":
            text = token.lemma_.lower().strip()
        else:
            text = token.lower_
        # Same filter as before: stop word OR substring of the punctuation string.
        if text in STOPWORDS or text in PUNCTUATIONS:
            continue
        kept.append(text)
    return " ".join(kept)


In [4]:
tqdm.pandas()

# Parsing & Tokenizing the entire dataset
df["processed_notes"] = df["notes"].progress_apply(spacy_tokenizer)

100%|██████████| 509157/509157 [10:34<00:00, 802.87it/s] 


### Class definition for LDA Models

In [5]:
class LDA_Model(object):
    """Pair a scikit-learn LDA estimator with the vectorizer that feeds it.

    Intended call order: ``vectorize_data`` -> ``train_lda`` ->
    ``selected_topics`` and/or ``visualize``.

    Attributes set in ``__init__``:
        num_topics, max_iter, vectorizer_type: the constructor arguments.
        lda: the ``LatentDirichletAllocation`` estimator.
        count_vectorizer, tfidf_vectorizer: both are always constructed.
        vectorizer: whichever of the two ``vectorizer_type`` selected.

    Attributes set later:
        data_vectorized: document-term matrix, set by ``vectorize_data``.
        dash: prepared pyLDAvis data, set by ``visualize``.
    """

    def __init__(self, num_topics, max_iter, vectorizer_type):
        """Build the estimator and both vectorizers, then pick the active one.

        Args:
            num_topics: passed to the estimator as ``n_components``.
            max_iter: passed to the estimator as ``max_iter``.
            vectorizer_type: ``'tfidf'`` selects the TfidfVectorizer; any other
                value selects the CountVectorizer.
        """
        super(LDA_Model, self).__init__()
        self.num_topics = num_topics
        self.max_iter = max_iter
        self.lda = LatentDirichletAllocation(n_components=self.num_topics,
                                             max_iter=self.max_iter,
                                             learning_method='online',
                                             learning_offset=50.,
                                             random_state=0)
        # Raw string: the pattern is unchanged, but '\-' in a normal string
        # literal is an invalid escape sequence and triggers a warning.
        # Matches runs of three or more ASCII letters / hyphens.
        self.count_vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                                                stop_words=STOPWORDS,
                                                lowercase=True,
                                                token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
        self.tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                                stop_words=STOPWORDS)
        # Choose vectorizer type based on flag passed in
        self.vectorizer_type = vectorizer_type
        if self.vectorizer_type == 'tfidf':
            self.vectorizer = self.tfidf_vectorizer
        else:
            self.vectorizer = self.count_vectorizer

    def vectorize_data(self, list):
        """Fit the active vectorizer on the documents and store the matrix.

        Args:
            list: iterable of documents (strings). The parameter name shadows
                the builtin inside this method only; it is kept so existing
                keyword callers continue to work.

        Side effects:
            Sets ``self.data_vectorized``; wraps the input in ``tqdm`` so a
            progress bar is shown while vectorizing.
        """
        self.data_vectorized = self.vectorizer.fit_transform(tqdm(list))

    def train_lda(self):
        """Fit the LDA estimator on ``self.data_vectorized``.

        ``vectorize_data`` must have been called first; otherwise the attribute
        does not exist and an ``AttributeError`` is raised.
        """
        self.lda.fit(self.data_vectorized)

    def selected_topics(self, top_n=10):
        """Print the ``top_n`` highest-weighted terms of every topic.

        For each row of ``self.lda.components_`` this prints a header, the
        topic index, and a list of ``(term, weight)`` tuples in descending
        weight order.

        Args:
            top_n: number of terms to show per topic.
        """
        # get_feature_names() was removed in scikit-learn 1.2 in favour of
        # get_feature_names_out(); prefer the new name, fall back to the old.
        names_getter = getattr(self.vectorizer, 'get_feature_names_out', None)
        if names_getter is None:
            names_getter = self.vectorizer.get_feature_names
        # Fetch the vocabulary once; the lookup used to be repeated for every
        # printed term. str() keeps the printed tuples free of numpy wrappers.
        feature_names = [str(name) for name in names_getter()]

        for idx, topic in enumerate(self.lda.components_):
            print("LDA Model:")
            print("Topic %d:" % (idx))
            # argsort is ascending, so walk the last top_n entries backwards.
            top_indices = topic.argsort()[:-top_n - 1:-1]
            print([(feature_names[i], topic[i]) for i in top_indices])

    def visualize(self, out_vis_file):
        """Build a pyLDAvis dashboard and save it as HTML under ``LDA_VIS_PATH``.

        Args:
            out_vis_file: file name (not a full path) of the HTML output.

        Side effects:
            Creates ``LDA_VIS_PATH`` if missing, sets ``self.dash`` and writes
            the HTML file.
        """
        # Create the output directory before the expensive prepare step so a
        # missing folder cannot waste the whole computation.
        os.makedirs(LDA_VIS_PATH, exist_ok=True)
        # Build a Visualization using pyLDAvis
        self.dash = pyLDAvis.sklearn.prepare(self.lda,
                                             self.data_vectorized,
                                             self.vectorizer,
                                             mds='tsne')
        # Save the Visualization built as an HTML file
        pyLDAvis.save_html(self.dash, fileobj=os.path.join(LDA_VIS_PATH, out_vis_file))



invalid escape sequence \-


invalid escape sequence \-


invalid escape sequence \-


invalid escape sequence \-



### Model Runs

In [6]:
# Define Hyperparameters
# Hyperparameter grid: the same search space is registered under each
# top-level key. The comprehension re-evaluates the inner literal per key, so
# the two entries hold equal but independent dicts and lists.
# NOTE(review): what 'word2vec' vs 'custom' distinguishes is not shown here, and
# no later cell in this notebook reads `grid` - confirm intended use.
grid = {
    flavour: {
        'dataset': ['full', '100+'],
        'tokens': ['unigrams', 'bigrams'],
        'num_topics': [3, 5, 7, 10, 15],
        'max_iter': [2, 5, 10],
        'vectorizer_type': ['count', 'tfidf'],
    }
    for flavour in ('word2vec', 'custom')
}

#### Model 1: 3 topics, 2 iterations, count vectorizer

In [7]:
# Test parameters
num_topics = 3
max_iter = 2
vectorizer_type = 'count'

In [8]:
# Instantiate the LDA Model
lda_3_2_count = LDA_Model(num_topics, max_iter, vectorizer_type)

In [9]:
# Vectorize Data
lda_3_2_count.vectorize_data(df["processed_notes"])

100%|██████████| 509157/509157 [00:18<00:00, 27826.77it/s]


In [10]:
%%time

# Train the model
lda_3_2_count.train_lda()

Wall time: 7min 55s


In [11]:
%%time

# Build & save Visualization as HTML
lda_3_2_count.visualize('lda_3_2_count_full_unigram.html')

Wall time: 19min 12s


#### Model 2: 3 topics, 5 iterations, count vectorizer

In [12]:
# Instantiate the LDA Model
lda_3_5_count = LDA_Model(num_topics=3, max_iter=5, vectorizer_type='count')

# Vectorize Data
lda_3_5_count.vectorize_data(df["processed_notes"])

100%|██████████| 509157/509157 [00:17<00:00, 28776.34it/s]


In [13]:
%%time

# Train the model
lda_3_5_count.train_lda()

Wall time: 16min 51s


In [14]:
%%time

# Build & save Visualization as HTML
lda_3_5_count.visualize('lda_3_5_count_full_unigram.html')

Wall time: 21min 45s


#### Model 3: 3 topics, 2 iterations, tfidf vectorizer

In [15]:
# Instantiate the LDA Model
lda_3_2_tfidf = LDA_Model(num_topics=3, max_iter=2, vectorizer_type='tfidf')

# Vectorize Data
lda_3_2_tfidf.vectorize_data(df["processed_notes"])

100%|██████████| 509157/509157 [00:21<00:00, 23794.43it/s]


In [16]:
%%time

# Train the model
lda_3_2_tfidf.train_lda()

Wall time: 8min 31s


In [17]:
%%time

# Build & save Visualization as HTML
lda_3_2_tfidf.visualize('lda_3_2_tfidf_full_unigram.html')

Wall time: 42min 57s


#### Model 4: 5 topics, 2 iterations, count vectorizer

In [18]:
# Instantiate the LDA Model
lda_5_2_count = LDA_Model(num_topics=5, max_iter=2, vectorizer_type='count')

# Vectorize Data
lda_5_2_count.vectorize_data(df["processed_notes"])

100%|██████████| 509157/509157 [00:18<00:00, 27116.46it/s]


In [19]:
%%time

# Train the model
lda_5_2_count.train_lda()

Wall time: 9min 5s


In [20]:
%%time

# Build & save Visualization as HTML
lda_5_2_count.visualize('lda_5_2_count_full_unigram.html')

Wall time: 18min 29s


#### Model 5: 5 topics, 2 iterations, tfidf vectorizer

In [21]:
# Instantiate the LDA Model
lda_5_2_tfidf = LDA_Model(num_topics=5, max_iter=2, vectorizer_type='tfidf')

# Vectorize Data
lda_5_2_tfidf.vectorize_data(df["processed_notes"])

100%|██████████| 509157/509157 [00:12<00:00, 42061.73it/s]


In [22]:
%%time

# Train the model
lda_5_2_tfidf.train_lda()

Wall time: 4min 51s


In [23]:
%%time

# Build & save Visualization as HTML
lda_5_2_tfidf.visualize('lda_5_2_tfidf_full_unigram.html')

Wall time: 25min 48s


#### Model 6: 7 topics, 2 iterations, count vectorizer

In [24]:
# Instantiate the LDA Model
lda_7_2_count = LDA_Model(num_topics=7, max_iter=2, vectorizer_type='count')

# Vectorize Data
lda_7_2_count.vectorize_data(df["processed_notes"])

100%|██████████| 509157/509157 [00:07<00:00, 70571.07it/s] 


In [25]:
%%time

# Train the model
lda_7_2_count.train_lda()

Wall time: 4min 29s


In [26]:
%%time

# Build & save Visualization as HTML
lda_7_2_count.visualize('lda_7_2_count_full_unigram.html')

Wall time: 13min 32s


#### Model 7: 7 topics, 2 iterations, tfidf vectorizer

In [27]:
# Instantiate the LDA Model
lda_7_2_tfidf = LDA_Model(num_topics=7, max_iter=2, vectorizer_type='tfidf')

# Vectorize Data
lda_7_2_tfidf.vectorize_data(df["processed_notes"])

100%|██████████| 509157/509157 [00:08<00:00, 58700.73it/s]


In [28]:
%%time

# Train the model
lda_7_2_tfidf.train_lda()

Wall time: 5min 38s


In [29]:
%%time

# Build & save Visualization as HTML
lda_7_2_tfidf.visualize('lda_7_2_tfidf_full_unigram.html')

Wall time: 25min 28s


#### Model 8: 10 topics, 2 iterations, count vectorizer

In [30]:
# Instantiate the LDA Model
lda_10_2_count = LDA_Model(num_topics=10, max_iter=2, vectorizer_type='count')

# Vectorize Data
lda_10_2_count.vectorize_data(df["processed_notes"])

100%|██████████| 509157/509157 [00:07<00:00, 69185.55it/s]


In [31]:
%%time

# Train the model
lda_10_2_count.train_lda()

Wall time: 5min 24s


In [32]:
%%time

# Build & save Visualization as HTML
lda_10_2_count.visualize('lda_10_2_count_full_unigram.html')

Wall time: 14min 12s


#### Model 9: 10 topics, 2 iterations, tfidf vectorizer

In [33]:
# Instantiate the LDA Model
lda_10_2_tfidf = LDA_Model(num_topics=10, max_iter=2, vectorizer_type='tfidf')

# Vectorize Data
lda_10_2_tfidf.vectorize_data(df["processed_notes"])

100%|██████████| 509157/509157 [00:08<00:00, 57884.97it/s]


In [34]:
%%time

# Train the model
lda_10_2_tfidf.train_lda()

Wall time: 7min 55s


In [35]:
%%time

# Build & save Visualization as HTML
lda_10_2_tfidf.visualize('lda_10_2_tfidf_full_unigram.html')

Wall time: 26min


### Saving the LDA Models as pickles

In [37]:
import pickle
pickle_path = './lda_pickles/'

In [38]:
lda_10_2_tfidf_pkl = 'lda_10_2_tfidf.pkl'

# Save the model as a pkl file. The context manager closes the file even when
# pickle.dump raises, so a failed dump cannot leave the handle open.
with open(os.path.join(pickle_path, lda_10_2_tfidf_pkl), 'wb') as lda_model_pkl:
    pickle.dump(lda_10_2_tfidf, lda_model_pkl)

In [40]:
%%time

# Loading the saved model pickle. The file is opened in a context manager so
# the handle is closed after loading (it was previously left open).
# SECURITY: pickle.load can execute arbitrary code; only load files this
# notebook wrote itself.
with open(os.path.join(pickle_path, lda_10_2_tfidf_pkl), 'rb') as lda_model_pkl:
    lda_10_2_tfidf = pickle.load(lda_model_pkl)
print("Loaded LDA model :: ", lda_10_2_tfidf)

Loaded LDA model ::  <__main__.LDA_Model object at 0x00000229ABE32160>
Wall time: 386 ms


In [44]:
lda_10_2_tfidf

<__main__.LDA_Model at 0x22a534ae3c8>

#### Define functions to create and load model pickles

In [54]:
# Registry mapping each trained LDA_Model instance to the file name of its
# pickle. The model objects themselves are the keys; LDA_Model defines no
# __eq__/__hash__, so lookup is by object identity. A model that is re-created
# or re-loaded after this cell runs is a different key and will not be found
# until this cell is re-executed.
pickle_dict = {lda_3_2_count : 'lda_3_2_count.pkl',
               lda_3_5_count : 'lda_3_5_count.pkl',
               lda_3_2_tfidf : 'lda_3_2_tfidf.pkl',
               lda_5_2_count : 'lda_5_2_count.pkl',
               lda_5_2_tfidf : 'lda_5_2_tfidf.pkl',
               lda_7_2_count : 'lda_7_2_count.pkl',
               lda_7_2_tfidf : 'lda_7_2_tfidf.pkl',
               lda_10_2_count: 'lda_10_2_count.pkl',
               lda_10_2_tfidf: 'lda_10_2_tfidf.pkl'
               }

def pickle_model(lda_obj):
    """Pickle ``lda_obj`` to the file registered for it in ``pickle_dict``.

    Args:
        lda_obj: a model object that is a key of ``pickle_dict``.

    Raises:
        KeyError: if ``lda_obj`` is not a key of ``pickle_dict``.

    Side effects:
        Creates ``pickle_path`` if it does not exist and (over)writes the
        pickle file inside it.
    """
    pkl_name = pickle_dict[lda_obj]

    # Make sure the target directory exists before opening the file.
    os.makedirs(pickle_path, exist_ok=True)

    # The context manager closes the file even if pickle.dump raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(os.path.join(pickle_path, pkl_name), 'wb') as lda_model_pkl:
        pickle.dump(lda_obj, lda_model_pkl)

def load_model(lda_obj):
    """Load and return the pickled model registered for ``lda_obj``.

    ``lda_obj`` is used only as the lookup key into ``pickle_dict``; it is not
    modified. The freshly unpickled object is returned.

    Args:
        lda_obj: a model object that is a key of ``pickle_dict``.

    Returns:
        The object stored in the corresponding pickle file.

    Raises:
        KeyError: if ``lda_obj`` is not a key of ``pickle_dict``.
    """
    pkl_name = pickle_dict[lda_obj]

    # SECURITY: pickle.load can execute arbitrary code; only use this on files
    # written by pickle_model, never on untrusted input.
    # The context manager closes the handle, which was previously left open.
    with open(os.path.join(pickle_path, pkl_name), 'rb') as lda_model_pkl:
        return pickle.load(lda_model_pkl)

##### Pickle all Models

In [59]:
for lda_obj in tqdm(pickle_dict.keys()):
    pickle_model(lda_obj)

100%|██████████| 9/9 [00:08<00:00,  1.10it/s]
