# Clustering: LDA w/ sklearn
Prototype notebook for clustering EUvsVirus Hackathon projects with LDA (Latent Dirichlet Allocation) using scikit-learn.

In [1]:
import os, warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import numpy as np
import pandas as pd

import spacy
from spacymoji import Emoji
import regex as re
import emoji

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from joblib import dump, load

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

%matplotlib inline

In [2]:
# Seed passed as `random_state` to every LatentDirichletAllocation model created below.
random_state = 23

## Load

Data

In [3]:
# Directory holding the hackathon data. Set the EUVSVIRUS_DATA_DIR environment
# variable to point the notebook at another location; the original local path
# is kept as the default so existing setups keep working.
data_path = os.environ.get("EUVSVIRUS_DATA_DIR", "/mnt/DATA/RWData Dropbox/EUvsVirus")
# Tab-separated export that is read in the next cell.
data_fpath = os.path.join(data_path, "all_data.tsv")

In [4]:
# Read the tab-separated export into a DataFrame.
data = pd.read_csv(data_fpath, sep="\t")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" to the output.
data.info()
data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2159 entries, 0 to 2158
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Challenge     2159 non-null   object
 1   SubChallenge  2159 non-null   object
 2   ProjURL       2159 non-null   object
 3   title         2159 non-null   object
 4   text          2069 non-null   object
dtypes: object(5)
memory usage: 84.5+ KB
None


(2159, 5)

NLP Model

In [5]:
# Load the medium English spaCy model with the NER, parser and tagger
# components disabled.
nlp = spacy.load("en_core_web_md", disable=["ner", "parser", "tagger"])
# Bind the component to its own name so that the `emoji` module imported at the
# top of the notebook is not shadowed.
emoji_component = Emoji(nlp, merge_spans=False)
# Register the emoji component as the first step of the pipeline.
nlp.add_pipe(emoji_component, first=True)

## Preprocessing

In [6]:
# Treat empty strings as missing text. The result is assigned back to the
# column instead of calling `data.text.replace(..., inplace=True)`: that form
# modifies a temporary Series and, under pandas copy-on-write, leaves `data`
# unchanged without any visible error.
data["text"] = data["text"].replace("", np.nan)
# Drop the rows that have no text.
nan_removed = data.dropna(subset=["text"])
# Of rows sharing both text and title, keep only the first one.
data_cleaned = nan_removed.drop_duplicates(subset=["text", "title"], keep="first")
# Series of raw texts that the cells below use as the corpus.
raw_corpus = data_cleaned.text

raw_corpus.shape

(2067,)

### Text Cleansing

Maybe later: Remove header texts; Split and structure text parts

In [36]:
def spacy_tokenizer(doc):
    """Turn a raw text into a list of cleaned lemma strings.

    The text is run through the module-level ``nlp`` pipeline. Tokens flagged
    as punctuation, digit, whitespace, URL-like, number-like, e-mail-like or
    stop word are skipped. For every remaining token the lemma is taken, a
    small set of punctuation characters is stripped from it, and the result
    is kept only if it is purely alphabetic and longer than one character.

    Args:
        doc: Text to tokenize.

    Returns:
        list of str: The cleaned lemmas, in token order.
    """
    skip_flags = ("is_punct", "is_digit", "is_space", "like_url",
                  "like_num", "like_email", "is_stop")
    kept_lemmas = []

    for tok in nlp(doc):
        # A single set flag is enough to discard the token.
        if any(getattr(tok, flag) for flag in skip_flags):
            continue

        # Post-processing: strip leftover punctuation from the lemma, then
        # keep it only when it is alphabetic and more than one character long.
        stripped = re.sub(r"[.!?+:/\\-]", "", tok.lemma_)
        if stripped.isalpha() and len(stripped) > 1:
            kept_lemmas.append(stripped)

    return kept_lemmas

In [25]:
# Tokenize every document of the corpus. The Series built in the preprocessing
# cell is named `raw_corpus`; no variable called `corpus` is defined in this notebook.
cleaned_txt = raw_corpus.apply(spacy_tokenizer)

In [10]:
#corpus[26]

In [11]:
#cleaned_txt[26]

## Topic Modeling
TF-IDF + LDA

### Vectorizer

In [39]:
# TF-IDF document-term matrix built with the custom spaCy tokenizer.
# Original note: total tokens 25744 -> the vocabulary is capped at 20000
# features to get rid of the infrequent ones.
tfidf = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    lowercase=True,
    max_features=20000,
)
tfidf_v = tfidf.fit_transform(raw_corpus)

tfidf_v.shape

(2067, 20000)

In [42]:
#tfidf.get_feature_names()

### Model

In [45]:
#FIXME
# One unfitted LDA model per topic count (20, 25, ..., 65), keyed "lda_<n>_t2".
models = {
    "lda_{}_t2".format(n_topics): LatentDirichletAllocation(
        n_components=n_topics, n_jobs=1, random_state=random_state, verbose=1
    )
    for n_topics in range(20, 70, 5)
}

In [46]:
# FIXME: `model_path` is not defined anywhere in this notebook, so the intended
# `lda_path = os.path.join(model_path, "topic_lda")` could not run and the loop
# failed with a NameError at the first dump. As a stopgap the models are stored
# under <data_path>/models/topic_lda -- confirm the intended location.
model_path = os.path.join(data_path, "models")
lda_path = os.path.join(model_path, "topic_lda")
# Make sure the target directory exists before anything is written to it.
os.makedirs(lda_path, exist_ok=True)

# Fit each model on the TF-IDF matrix and persist it right away as
# "<name>.joblib" inside lda_path.
for name, lda in models.items():
    print("Fitting", name)
    lda.fit(tfidf_v)
    print("Dump", name, "\n")
    dump(lda, os.path.join(lda_path, name + ".joblib"))

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=30, n_jobs=-1,
                          perp_tol=0.1, random_state=23, topic_word_prior=None,
                          total_samples=1000000.0, verbose=1)

In [None]:
def print_scores(model, name):
    """Print the log-likelihood and perplexity of a fitted model.

    Both values are computed on the module-level TF-IDF matrix ``tfidf_v``.

    Args:
        model: Fitted model exposing ``score`` and ``perplexity``.
        name: Label printed in the header line.
    """
    print("Scores for", name)

    log_likelihood = model.score(tfidf_v)
    print("Log Likelihood:", log_likelihood)

    perplexity = model.perplexity(tfidf_v)
    print("Perplexity:", perplexity, "\n")

# Report the scores of every model in the dictionary.
for name, fitted_model in models.items():
    print_scores(fitted_model, name)

In [None]:
#FIXME viz_path = os.path.join(figures_path, "topics")

# for model in models:
#         p = None
#         p = pyLDAvis.sklearn.prepare(models[model], tfidf_v, tfidf, mds="tsne")
#         pyLDAvis.save_html(p, os.path.join(viz_path, "topics_" + model + ".html"))
#         print("topics_" + model + " persisted")

# Conclusion

#FIXME