# Clustering: LDA w/ sklearn
Prototyping notebook for clustering EUvsVirus Hackathon projects with LDA (Latent Dirichlet Allocation) using sklearn.

In [1]:
import os, warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import numpy as np
import pandas as pd

import spacy
from spacymoji import Emoji
import regex as re
import emoji

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from joblib import dump, load

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

%matplotlib inline

In [2]:
# Seed passed to every LDA model defined below, so repeated fits use the same random state.
random_state = 23

## Load

Data

In [3]:
# Folder holding the EUvsVirus data export. Defaults to the original author's
# local mount; set the EUVSVIRUS_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
data_path = os.environ.get("EUVSVIRUS_DATA_DIR", "/mnt/DATA/RWData Dropbox/EUvsVirus")
data_fpath = os.path.join(data_path, "all_data.tsv")

In [4]:
# Read the tab-separated project export into a DataFrame.
data = pd.read_csv(data_fpath, sep="\t")
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
data.info()
data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2159 entries, 0 to 2158
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Challenge     2159 non-null   object
 1   SubChallenge  2159 non-null   object
 2   ProjURL       2159 non-null   object
 3   title         2159 non-null   object
 4   text          2069 non-null   object
dtypes: object(5)
memory usage: 84.5+ KB
None


(2159, 5)

NLP Model

In [5]:
# Load the English model with the NER, parser and tagger components disabled.
nlp = spacy.load("en_core_web_md", disable=["ner", "parser", "tagger"])
# Bind the spacymoji component to its own name: calling it `emoji` would
# overwrite the `emoji` module imported at the top of the notebook.
emoji_component = Emoji(nlp, merge_spans=False)
# Register the emoji component as the first step of the pipeline.
nlp.add_pipe(emoji_component, first=True)

## Preprocessing

In [6]:
# Treat empty strings as missing text. The result is assigned back to the
# column instead of calling replace(..., inplace=True) on `data.text`: that
# chained in-place form is not guaranteed to update `data` (it does not under
# pandas Copy-on-Write), and the assignment is safe to re-run.
data["text"] = data["text"].replace("", np.nan)
# Drop projects that have no text at all.
nan_removed = data.dropna(subset=["text"])
# Keep only the first occurrence of each (text, title) pair.
data_cleaned = nan_removed.drop_duplicates(subset=["text", "title"], keep="first")
raw_corpus = data_cleaned.text

raw_corpus.shape

(2067,)

### Text Cleansing

Maybe later: Remove header texts; Split and structure text parts

In [7]:
def spacy_tokenizer(doc):
    """Tokenize ``doc`` with the notebook-level ``nlp`` pipeline and return cleaned lemmas.

    Tokens flagged as punctuation, digit, whitespace, URL-like, number-like,
    e-mail-like or stop word are discarded. The lemma of every remaining token
    has the characters ``.``, ``!``, ``?``, ``+``, ``:``, ``/``, backslash and
    ``-`` removed; only results that are purely alphabetic and longer than one
    character are returned, in document order.
    """
    rejected_flags = ("is_punct", "is_digit", "is_space", "like_url",
                      "like_num", "like_email", "is_stop")

    kept = []
    for token in nlp(doc):
        # Guard clause: skip the token as soon as any rejection flag is set.
        if any(getattr(token, flag) for flag in rejected_flags):
            continue

        # Post-processing: strip residual punctuation from the lemma, then
        # keep it only if it is alphabetic and has more than one character.
        stripped = re.sub(r"[.!?+:/\\-]", "", token.lemma_)
        if stripped.isalpha() and len(stripped) > 1:
            kept.append(stripped)

    return kept

## Topic Modeling
TFIDF + LDA

### Vectorizer

In [8]:
# TF-IDF document-term matrix built with the custom spaCy tokenizer.
# The corpus has 25744 distinct tokens in total; max_features caps the
# vocabulary at 20000 to get rid of infrequent ones.
tfidf = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    lowercase=True,
    max_features=20000,
)
tfidf_v = tfidf.fit_transform(raw_corpus)

tfidf_v.shape

(2067, 20000)

In [9]:
#tfidf.get_feature_names()

### Model

In [10]:
# One LDA model per candidate topic count, all sharing the same seed:
#    32 -> number of sub-challenges
#    50 -> best interpretable
#    64 -> 2 * number of sub-challenges
#   100
topic_counts = (32, 50, 64, 100)
models = {
    f"lda_{n_topics:03d}": LatentDirichletAllocation(
        n_components=n_topics, n_jobs=1, random_state=random_state, verbose=1
    )
    for n_topics in topic_counts
}

In [11]:
model_persist_path = os.path.join("../models", "lda_sklearn")
# dump() does not create missing parent directories; make sure the target
# folder exists up front so a slow fit is not lost to a FileNotFoundError.
os.makedirs(model_persist_path, exist_ok=True)

# Fit every candidate model on the TF-IDF matrix and persist it right away.
for model in models:
    print("Fitting", model)
    models[model].fit(tfidf_v)
    print("Dump", model, "\n")
    dump(models[model], os.path.join(model_persist_path, model + ".joblib"))

Fitting lda_032
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Dump lda_032 

Fitting lda_050
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Dump lda_050 

Fitting lda_064
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Dump lda_064 

Fitting lda_100
iteration: 1 of ma

In [12]:
def print_scores(model, name, X=None):
    """Print the log-likelihood and perplexity scores of a fitted model.

    Parameters
    ----------
    model : object exposing ``score(X)`` and ``perplexity(X)``.
    name : label printed in the header line.
    X : optional data to score on. When omitted, the notebook-level
        ``tfidf_v`` matrix is used, which is the original behaviour.
    """
    if X is None:
        # Fall back to the matrix the models were fitted on.
        X = tfidf_v
    print("Scores for", name)
    print("Log Likelihood:", model.score(X))
    print("Perplexity:", model.perplexity(X), "\n")

# Report both scores for every model, in dictionary order.
for model_name, fitted_model in models.items():
    print_scores(fitted_model, model_name)

Scores for lda_032
Log Likelihood: -231845.1044309936
Perplexity: 132858.9404528981 

Scores for lda_050
Log Likelihood: -249041.50173302894
Perplexity: 318715.0215234875 

Scores for lda_064
Log Likelihood: -267224.0882507981
Perplexity: 803909.9812752074 

Scores for lda_100
Log Likelihood: -303099.97383560886
Perplexity: 4988899.539773154 



In [13]:
viz_path = os.path.join("../reports/figures/", "lda_sklearn")
# The HTML export needs an existing folder; create it up front.
os.makedirs(viz_path, exist_ok=True)

# Build an interactive pyLDAvis report (t-SNE layout) for every model and
# store it as a standalone HTML file.
for model in models:
    p = pyLDAvis.sklearn.prepare(models[model], tfidf_v, tfidf, mds="tsne")
    pyLDAvis.save_html(p, os.path.join(viz_path, "topics_" + model + ".html"))
    print("topics_" + model + " persisted")

topics_lda_032 persisted
topics_lda_050 persisted
topics_lda_064 persisted
topics_lda_100 persisted


# Conclusion

The most interpretable model consists of 50 topics, which might be too few for the desired use case. In general, model performance would benefit from more precise preprocessing steps. Due to lack of time in the project, this will not happen here (or rather, it will happen, but in R land).

**Not production ready**