## Importing libraries/modules

In [1]:
import utils
import numpy as np
import pandas as pd
import os

In [2]:
DATA_DIR = 'Data'
CACHE_DIR = 'Cache'
SPEAKER_INFO_FILE_PATH = os.path.join(DATA_DIR, 'speaker_attributes.parquet')
CACHE_FILE_PATH = os.path.join(CACHE_DIR, 'processed_data.json.bz2')
BERT_MODEL_SAVE_PATH = os.path.join(CACHE_DIR, 'bert_model')

In [3]:
def get_words(string):
    # Make all lower-case.
    string = string.lower()
    
    # Remove punctuation and numbers.
    string = re.sub(r'[^a-z ]', ' ', string)
    
    # Split into words.
    words = string.split()
    
    return words

### Latent Semantic Analysis and Latent Dirichlet Allocation

**Note:** On long term, we may want to do two passes on the dataset: one to fit models and one to predict, as that should be less RAM intensive. Also, max_features, max_df and min_df in CountVectorizer may be tuned for same reason.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

@utils.cache_to_file_pickle("function-count_matrix_dataset")
def count_matrix_dataset(data_dir):
    count_vectorizer = CountVectorizer(strip_accents = "ascii", lowercase = True, stop_words = "english")
    
    counts_matrix = count_vectorizer.fit_transform(line['quotation'] for line in utils.all_quotes_generator(data_dir))
    
    return count_vectorizer, counts_matrix


count_vectorizer, counts_matrix = count_matrix_dataset(data_dir = DATA_DIR)
feature_names = count_vectorizer.get_feature_names()
del count_vectorizer

#### Latent Semantic Analysis

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD

def latent_semantic_analysis(counts_matrix):
    tfidf_embedding = TfidfTransformer(norm = 'l2', use_idf = True, smooth_idf = True, sublinear_tf = False).fit_transform(counts_matrix)

    lsa_model = TruncatedSVD(n_components = 100, n_iter = 10)
    lsa_model.fit(tfidf_embedding)
    
    return lsa_model

lsa_model = latent_semantic_analysis(counts_matrix)

In [None]:
def print_main_topic_words(feature_names, components, n_main_words = 10):
    for i, component in enumerate(components):
        words_making_up_component = {feature: coordinate for feature, coordinate in zip(feature_names, component)}
        main_words = sorted(words_making_up_component, key = words_making_up_component.get, reverse = True)[:n_main_words]
        print(f"Topic {i}:")
        print(' '.join(main_words))
        
print_main_topic_words(feature_names, lsa_model.components_)

Topic 0:
going bay green away time action people think litigation resort
Topic 1:
think actually dad saw just played opportunity make great sure
Topic 2:
dad saw opportunity great words coming greatest movement came fair
Topic 3:
just sure play ve ready make played healthy minutes playing
Topic 4:
people able come want help committee members run fair carry
Topic 5:
long tail crocodile ll crack donald engineering president storm trump
Topic 6:
good long tail year crocodile high day performance want plays
Topic 7:
didn program cost feel team year want fit portion relationship
Topic 8:
want able ll does cat leash make season wanted deal
Topic 9:
doesn agent hoping getting does 94 football miller probably quickest
Topic 10:
new offer gas capabilities computer design fiction science tv watching
Topic 11:
high numbers good really glass does 365 cause damage degrees
Topic 12:
high far years numbers good didn want season new think
Topic 13:
road trade said day years came took home issue iffa
T

#### LatentDirichletAllocation

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components = 100, learning_method = 'batch', evaluate_every = -1, njobs = -1)
lda_model.fit(counts_matrix)

In [None]:
print_main_topic_words(feature_names, lda_model.components_)

https://www.kaggle.com/rajmehra03/topic-modelling-using-lda-and-lsa-in-sklearn

https://yanlinc.medium.com/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6

### BERT Pretrained

https://github.com/MaartenGr/BERTopic

In [3]:
from bertopic import BERTopic

@utils.cache_to_file_pickle("function-embed_dataset_BERT")
def embed_dataset_BERT(data_dir):
    
    # Min topics size should be fine-tuned: we have very large dataset hence a value of ~1000 is better than default one.
    # Also, may want to play with nr_topics: either leave it to default none, or auto, or set a manual value large enough
    # that topics do not get mixed up while merging. Also note it is heavy to run anything other than None.
    # In background, BERTopic uses UMAP, HDBSCAN, CountVectorizer. Each has their set of parameters, of which only a
    # subset is tunable via the BERTopic constructor. For best tuning, may want to look into them.
    bert_model = BERTopic(embedding_model = "all-MiniLM-L6-v2", min_topic_size = 500)
    
    topics, probabilities = bert_model.fit_transform(utils.all_quotes_generator(data_dir, 'quotation'))
    return bert_model, (topics, probabilities)

In [4]:
bert_model, (topics, probabilities) = embed_dataset_BERT(data_dir = DATA_DIR)

Starting processing Data\quotes-2015.json.bz2
Processed 1000000 lines from Data\quotes-2015.json.bz2 in 0.461 minutes
Processed 2000000 lines from Data\quotes-2015.json.bz2 in 0.914 minutes
Processed 3000000 lines from Data\quotes-2015.json.bz2 in 1.415 minutes
Processed 4000000 lines from Data\quotes-2015.json.bz2 in 1.892 minutes
Processed 5000000 lines from Data\quotes-2015.json.bz2 in 2.347 minutes
Processed 6000000 lines from Data\quotes-2015.json.bz2 in 2.800 minutes
Processed 7000000 lines from Data\quotes-2015.json.bz2 in 3.262 minutes
Processed 8000000 lines from Data\quotes-2015.json.bz2 in 3.724 minutes
Processed 9000000 lines from Data\quotes-2015.json.bz2 in 4.214 minutes
Processed 10000000 lines from Data\quotes-2015.json.bz2 in 4.701 minutes
Processed 11000000 lines from Data\quotes-2015.json.bz2 in 5.179 minutes
Processed 12000000 lines from Data\quotes-2015.json.bz2 in 5.636 minutes
Processed 13000000 lines from Data\quotes-2015.json.bz2 in 6.113 minutes
Processed 1400

Finished processing Data\quotes-2019.json.bz2 in 10.561 minutes
Starting processing Data\quotes-2020.json.bz2
Processed 1000000 lines from Data\quotes-2020.json.bz2 in 0.484 minutes
Processed 2000000 lines from Data\quotes-2020.json.bz2 in 0.973 minutes
Processed 3000000 lines from Data\quotes-2020.json.bz2 in 1.451 minutes
Processed 4000000 lines from Data\quotes-2020.json.bz2 in 1.918 minutes
Processed 5000000 lines from Data\quotes-2020.json.bz2 in 2.377 minutes
Finished processing Data\quotes-2020.json.bz2 in 2.494 minutes
Starting processing Data\quotes-2015.json.bz2
Processed 1000000 lines from Data\quotes-2015.json.bz2 in 0.456 minutes
Processed 2000000 lines from Data\quotes-2015.json.bz2 in 0.927 minutes
Processed 3000000 lines from Data\quotes-2015.json.bz2 in 1.367 minutes
Processed 4000000 lines from Data\quotes-2015.json.bz2 in 1.817 minutes
Processed 5000000 lines from Data\quotes-2015.json.bz2 in 2.282 minutes
Processed 6000000 lines from Data\quotes-2015.json.bz2 in 2.7

Processed 14000000 lines from Data\quotes-2019.json.bz2 in 6.786 minutes
Processed 15000000 lines from Data\quotes-2019.json.bz2 in 7.278 minutes
Processed 16000000 lines from Data\quotes-2019.json.bz2 in 7.791 minutes
Processed 17000000 lines from Data\quotes-2019.json.bz2 in 8.303 minutes
Processed 18000000 lines from Data\quotes-2019.json.bz2 in 8.781 minutes
Processed 19000000 lines from Data\quotes-2019.json.bz2 in 9.277 minutes
Processed 20000000 lines from Data\quotes-2019.json.bz2 in 9.788 minutes
Processed 21000000 lines from Data\quotes-2019.json.bz2 in 10.280 minutes
Finished processing Data\quotes-2019.json.bz2 in 10.648 minutes
Starting processing Data\quotes-2020.json.bz2
Processed 1000000 lines from Data\quotes-2020.json.bz2 in 0.483 minutes
Processed 2000000 lines from Data\quotes-2020.json.bz2 in 0.944 minutes
Processed 3000000 lines from Data\quotes-2020.json.bz2 in 1.448 minutes
Processed 4000000 lines from Data\quotes-2020.json.bz2 in 1.917 minutes
Processed 5000000

KeyboardInterrupt: 

In [None]:
bert_model.save(BERT_MODEL_SAVE_PATH)

In [None]:
bert_model.get_topic_info().head(20)

In [None]:
for topic in range(20):
    print(f"Topic {topic}:")
    print('\n'.join(str(elem) for elem in bert_model.get_topic(topic)), '\n\n')

In [None]:
bert_model.visualize_topics()

In [None]:
bert_model.visualize_hierarchy(top_n_topics=50)

In [None]:
bert_model.visualize_barchart(top_n_topics=5)

In [None]:
bert_model.visualize_heatmap(n_clusters=20, width=1000, height=1000)

In [None]:
# This declining going to right is sign that first words in each topic are very representative of whole topic
bert_model.visualize_term_rank()