In [2]:
%%capture
import sys
import os
import textwrap
import datasets
from dotenv import dotenv_values
from pathlib import Path
from scipy.special import softmax
from scipy.stats import entropy
import pandas as pd
import numpy as np
np.random.seed(19950808)
from nltk import sent_tokenize, word_tokenize
pd.set_option('display.max_colwidth', None)

# Parse the project's .env file into a dict of settings.
config = dotenv_values("./../../../config/.env") # take environment variables from .env.
# Base directory under which the data/, code/ and Dashboard/ paths used in this notebook are resolved.
# Raises KeyError if the .env file does not define BASE_PATH.
base_path = Path(config["BASE_PATH"])
# Make the project's own code directory importable from this notebook.
sys.path.append(str(base_path/"code"))

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import CountVectorizer

import cohere
import backoff
from bertopic.representation import Cohere
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

In [4]:
# Name of the DataFrame column that holds the precomputed embedding vectors used below.
embedding_name = "emb_bge_large_en"
# Neighbourhood size: passed to UMAP as n_neighbors and, inside the topic-model builder,
# also to HDBSCAN as min_cluster_size; it is part of the output folder name as well.
n_neighbors = 8

In [5]:
# Paragraph chunks together with their precomputed embedding columns.
# NOTE: pickle can execute arbitrary code on load -- only read trusted project artefacts.
df = pd.read_pickle(base_path/"data/predictions/df_chunks_embedding.pkl")
df["report_id"] = df["report_id"].astype(str)
# Collapse filing types into two buckets: values starting with "10-K" vs. everything else (".pdf").
df["filing_type"] = df["filing_type"].apply(lambda x: "10-K" if x[:4] == "10-K" else ".pdf")
# A clean 0..n-1 index is required later: row labels are used as positions into search results.
df.reset_index(inplace=True, drop=True)

# Risk descriptions; rows whose Source mentions "internal" are excluded.
df_risks = pd.read_pickle(base_path/'data/processed/df_risk_descriptions.pkl')
df_risks = df_risks[~df_risks.Source.str.contains("internal")]
df_risks.reset_index(drop=True, inplace=True)

# 2-D UMAP projection of the paragraph embeddings, stored as x/y coordinates on df.
embeddings = np.array([np.array(x) for x in df[embedding_name].tolist()])
embeddings_2d = UMAP(n_neighbors=n_neighbors, n_components=2, min_dist=0.0, metric='cosine', random_state=19950808).fit_transform(embeddings)

df["x"] = embeddings_2d[:,0].tolist()
df["y"] = embeddings_2d[:,1].tolist()

In [4]:
def docs_to_indices(source_df, documents):
    """Return, for each document, the index label of its first occurrence in ``source_df.text``.

    Args:
        source_df: DataFrame with a ``text`` column.
        documents: Iterable of document strings to look up.

    Returns:
        List of index labels, one per document, in the order given.

    Raises:
        IndexError: If a document does not appear in ``source_df.text``.
    """
    found = []
    for wanted in documents:
        is_match = (source_df.text == wanted).to_numpy()
        matching_labels = source_df.index[is_match]
        # Only the first hit is kept when the same text occurs more than once.
        found.append(matching_labels[0])
    return found

def get_top_k_matches(indices, semantic_scores, top_k=10):
    """Pool the search hits of several queries and return the best ``top_k`` distinct corpus entries.

    Args:
        indices: Positions into ``semantic_scores`` whose hit lists are pooled.
        semantic_scores: Sequence of hit lists; each hit is a mapping with the
            keys ``corpus_id`` and ``score``.
        top_k: Maximum number of rows to return.

    Returns:
        A new DataFrame with columns ``corpus_id`` and ``score``, sorted by
        descending score, holding at most one row per ``corpus_id`` (its
        highest-scoring hit) and a fresh 0..n-1 index. The frame is empty
        (but has both columns) when there are no hits.
    """
    hit_frames = [pd.DataFrame(semantic_scores[index]) for index in indices]
    hit_frames = [frame for frame in hit_frames if not frame.empty]
    if not hit_frames:
        # Nothing to rank: hand back a correctly-shaped empty frame instead of
        # failing on the missing "score" column.
        return pd.DataFrame({
            "corpus_id": pd.Series(dtype="int64"),
            "score": pd.Series(dtype="float64"),
        })

    # Single concat instead of repeated DataFrame.append (removed in pandas 2.0).
    pooled = pd.concat(hit_frames, ignore_index=True)
    ranked = (
        pooled
        .sort_values(by="score", ascending=False)
        .drop_duplicates(subset=["corpus_id"], keep="first")
        .reset_index(drop=True)
    )
    # Copy so callers can add/overwrite columns without SettingWithCopyWarning.
    return ranked.head(top_k).copy()

def _zscore_positive_matches(matches, mean, std, corpus_column, query_column, query_value):
    """Relabel one query's matches, z-score their similarity and keep only scores above the mean."""
    relabelled = matches.rename(columns={"corpus_id": corpus_column})
    relabelled = relabelled.assign(**{
        query_column: query_value,
        "score": relabelled["score"].sub(mean).div(std),
    })
    return relabelled[relabelled["score"] > 0]


def _stack_match_frames(frames, columns):
    """Concatenate per-query match frames once and expose the score column as ``Score``."""
    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        # Keep the expected schema so downstream merges/group-bys still find their columns.
        return pd.DataFrame(columns=columns)
    return pd.concat(non_empty, ignore_index=True).rename(columns={"score": "Score"})


def map_topics_risks_and_paragraphs(df, tm_df, df_risks, top_k, embedding_name):
    """Build three cosine-similarity match tables between paragraphs, topics and risks.

    All similarities are standardised with the mean and standard deviation of
    the full paragraph-by-risk cosine-similarity matrix, and only matches with
    a standardised score above 0 are kept.

    Args:
        df: Paragraph DataFrame with a ``text`` column and an embedding column.
            Its index must be 0..n-1, because row labels are used as positions
            into the search results.
        tm_df: Topic table with a ``Representative_Docs`` column (lists of
            paragraph texts) and, normally, a ``Topic`` column.
        df_risks: Risk DataFrame with the same embedding column and a 0..n-1 index.
        top_k: Number of hits requested per query and kept per table row group.
        embedding_name: Name of the embedding column in ``df`` and ``df_risks``.

    Returns:
        Dict with three DataFrames:
          * ``topics_to_risks``: columns ``risks_index``, ``Score``, ``Topic``.
          * ``risks_to_topics``: columns ``topics_index``, ``Score``, ``risk_index``.
            NOTE(review): the search corpus here is the paragraph embeddings, so
            ``topics_index`` holds row positions in ``df`` -- confirm how the
            consumer resolves them to topics.
          * ``risks_to_paragraphs``: columns ``risk_index``, ``Score``,
            ``paragraphs_index``.
    """
    embeddings = np.array(df[embedding_name].tolist())
    risk_embeddings = np.array(df_risks[embedding_name].tolist())

    # Paragraph -> risk search; used for both the topic table and the paragraph table.
    paragraph_to_risk_hits = util.semantic_search(
        embeddings,
        risk_embeddings,
        top_k=top_k,
        score_function=util.cos_sim)
    all_scores = util.cos_sim(embeddings, risk_embeddings).flatten()

    mean = float(all_scores.mean())
    std = float(all_scores.std())

    ## Map topics to risks (via each topic's representative paragraphs)
    topic_frames = []
    for i, row in tm_df.iterrows():
        # Use the real topic id; row position minus one is only correct when an
        # outlier topic (-1) occupies the first row.
        topic_id = row["Topic"] if "Topic" in row.index else i - 1
        indices = docs_to_indices(source_df=df, documents=row["Representative_Docs"])
        matches = get_top_k_matches(indices=indices, semantic_scores=paragraph_to_risk_hits, top_k=top_k)
        topic_frames.append(
            _zscore_positive_matches(matches, mean, std, "risks_index", "Topic", topic_id))
    topics_to_risks_df = _stack_match_frames(topic_frames, ["risks_index", "Score", "Topic"])

    ## Map Risks to topics (query: risks, corpus: paragraph embeddings)
    risk_to_paragraph_hits = util.semantic_search(
        risk_embeddings,
        embeddings,
        top_k=top_k,
        score_function=util.cos_sim)

    risk_frames = []
    for risk_position in range(len(df_risks)):
        matches = get_top_k_matches(indices=[risk_position], semantic_scores=risk_to_paragraph_hits, top_k=top_k)
        risk_frames.append(
            _zscore_positive_matches(matches, mean, std, "topics_index", "risk_index", risk_position))
    risks_to_topics_df = _stack_match_frames(risk_frames, ["topics_index", "Score", "risk_index"])

    ## Map Risks to paragraphs (query: paragraphs, corpus: risks)
    # The corpus ids of this search are risk positions and the query is the
    # paragraph, so the columns are labelled accordingly.
    paragraph_frames = []
    for paragraph_position in range(len(df)):
        matches = get_top_k_matches(indices=[paragraph_position], semantic_scores=paragraph_to_risk_hits, top_k=top_k)
        paragraph_frames.append(
            _zscore_positive_matches(matches, mean, std, "risk_index", "paragraphs_index", paragraph_position))
    risks_to_paragraphs_df = _stack_match_frames(paragraph_frames, ["risk_index", "Score", "paragraphs_index"])

    result_dict = {
        "topics_to_risks": topics_to_risks_df,
        "risks_to_topics": risks_to_topics_df,
        "risks_to_paragraphs":  risks_to_paragraphs_df
    }
    return result_dict

In [5]:
def get_year_counts(topic, year_counts_df):
    """Return a gap-free ``{year: count}`` mapping for one topic.

    Args:
        topic: Topic id to select.
        year_counts_df: DataFrame with columns ``Topic``, ``Year`` and ``Count``.

    Returns:
        Dict keyed by every integer year from the topic's first to its last
        observed year (inclusive, ascending). Years without a row get 0; if a
        year has several rows the first ``Count`` is used. An empty dict is
        returned when the topic has no rows.
    """
    topic_rows = year_counts_df[year_counts_df.Topic == topic]
    observed_years = topic_rows.Year.dropna()
    if observed_years.empty:
        # min()/max() of an empty column are NaN, which range() cannot accept.
        return {}

    # int(): the Year column may be float (e.g. after a left merge introduced NaN).
    first_year = int(observed_years.min())
    last_year = int(observed_years.max())

    year_count_dict = {}
    for year in range(first_year, last_year + 1):
        values = topic_rows[topic_rows.Year == year].Count.values
        year_count_dict[year] = values[0] if values.size > 0 else 0
    return year_count_dict

def get_max_count_year(year_count_dict):
    """Return the key (year) with the largest count; on ties the earliest-inserted key wins."""
    years = list(year_count_dict.keys())
    counts = np.array(list(year_count_dict.values()))
    peak_position = counts.argmax()
    return years[peak_position]

def get_first_year(year_count_dict):
    """Return the first key of the mapping in insertion order (IndexError if it is empty)."""
    ordered_years = list(year_count_dict.keys())
    return ordered_years[0]

def get_last_year(year_count_dict):
    """Return the last key of the mapping in insertion order (IndexError if it is empty)."""
    ordered_years = list(year_count_dict.keys())
    return ordered_years[-1]

def get_entropy(year_count_dict):
    """Return the Shannon entropy (natural log) of the counts' share per year.

    The counts are first divided by their total, then passed to
    ``scipy.stats.entropy``.
    """
    raw_counts = np.array(list(year_count_dict.values()))
    total = raw_counts.sum()
    shares = raw_counts / total
    return entropy(shares)

def create_and_save_topic_model_and_sematic_similarity(df, embedding_name,  n_neigbors, df_risks, top_k):
    """Fit a BERTopic model on precomputed embeddings, match topics to risks and persist everything.

    Both ``df`` and ``df_risks`` are modified in place (columns are added,
    renamed and dropped), so pass copies if the originals are still needed.

    Args:
        df: Paragraph DataFrame with ``text``, ``year``, ``report_id`` and the
            embedding column named by ``embedding_name``.
        embedding_name: Name of the embedding column in ``df`` and ``df_risks``;
            also the first part of the output folder name.
        n_neigbors: (sic -- spelling kept for existing callers) used as UMAP
            ``n_neighbors`` and HDBSCAN ``min_cluster_size``, and as the suffix
            of the output folder name.
        df_risks: Risk DataFrame with the same embedding column and a ``Year`` column.
        top_k: Number of nearest matches requested per query in the similarity search.

    Returns:
        ``[df, tm_df, hierarchy, tm, matching_dict]`` -- the annotated paragraph
        frame, the topic table, the hierarchical-topics frame, the fitted
        BERTopic model and the dict of match tables.

    Side effects:
        Writes the match tables plus ``df.pkl``, ``df_risks.pkl``, ``tm_df.pkl``
        and ``hierarchy.pkl`` to
        ``base_path/Dashboard/topic_models/<embedding_name>_<n_neigbors>``.
    """
    embeddings = np.array([np.array(x) for x in df[embedding_name].tolist()])
    docs = df.text.tolist()
    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))
    ctfidf_model = ClassTfidfTransformer()
    embedding_model = SentenceTransformer("BAAI/bge-large-en")
    umap_model = UMAP(n_neighbors=n_neigbors, n_components=5, min_dist=0.0, metric='cosine', random_state=19950808)
    hdbscan_model = HDBSCAN(min_cluster_size=n_neigbors, metric='euclidean', cluster_selection_method="eom", cluster_selection_epsilon=0.2, prediction_data=True)
    # No representation model is configured, so no Cohere client (and no
    # COHERE_API_KEY) is needed here. To fine-tune topic representations, pass
    # e.g. a Cohere or KeyBERTInspired instance as representation_model below.
    tm = BERTopic(
        embedding_model=embedding_model,          # Step 1 - Extract embeddings
        umap_model=umap_model,                    # Step 2 - Reduce dimensionality
        hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
        vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
        ctfidf_model=ctfidf_model ,               # Step 5 - Extract topic words
    )
    # The precomputed embeddings are passed in alongside the documents.
    topics, probs = tm.fit_transform(docs, embeddings)
    df["Topic"] = topics
    df["probs"] = probs

    tm_df = tm.get_topic_info()
    # Explicit lookup by topic id; does not depend on how pandas treats a dict
    # assigned to a column.
    tm_df["Top_Words"] = tm_df["Topic"].map(tm.get_topics())
    # "3_word_a_word_b" -> "T 3: Word_a, Word_b"-style label; topic -1 becomes "No Topic".
    tm_df.Name = tm_df.Name.apply(lambda x: f"T {x.split('_')[0]}: {', '.join([y.capitalize() for y in x.split('_')[1:]])}" if int(
        x.split('_')[0]) != -1 else "No Topic")

    # Row labels must equal row positions for the similarity mapping below.
    df.reset_index(inplace=True,drop=True)
    matching_dict = map_topics_risks_and_paragraphs(df, tm_df, df_risks, top_k, embedding_name)
    save_path = base_path/'Dashboard/topic_models'/(embedding_name+f"_{n_neigbors}")
    save_path.mkdir(parents=True, exist_ok=True)
    for df_name, match_df in matching_dict.items():
        match_df.to_pickle(save_path/f"{df_name}.pkl")

    # Attach risk attributes (incl. Year) to each topic->risk match and count matches per topic and year.
    topics_2_risks = matching_dict["topics_to_risks"].merge(df_risks, right_index=True, left_on='risks_index', how='left')
    topic_risk_year_counts = topics_2_risks.groupby(["Topic", "Year"]).count()[["risks_index"]].rename(columns={"risks_index": "Count"}).reset_index()

    ## Only drop embeddings after computing similarites
    df.drop(columns=[col for col in df.columns if col[:3] == "emb"], inplace=True)
    topic_map = dict(zip(tm_df.Topic.tolist(), tm_df.Name.tolist()))
    df['Topic_Name'] = df['Topic'].map(topic_map)
    # Pre-wrapped text with HTML line breaks every 60 characters.
    df['text_wrapped'] = df.text.apply(lambda x: "<br>".join(textwrap.wrap(x, width=60)))
    df_risks.drop(columns=[col for col in df_risks.columns if col[:3] == "emb"], inplace=True)

    df.rename(columns={"year": "Year"}, inplace=True)
    year_counts = df.groupby(["Topic", "Year"]).count()[["report_id"]].rename(columns={"report_id": "Count"}).reset_index()
    tm_df["Year_Counts"] = tm_df.Topic.apply(lambda x: get_year_counts(x, year_counts))
    tm_df["Year_Counts_Risks"] = tm_df.Topic.apply(lambda x: get_year_counts(x, topic_risk_year_counts))
    tm_df["Max_Count_Year"] = tm_df.Year_Counts.apply(get_max_count_year)
    tm_df["First_Year"] = tm_df.Year_Counts.apply(get_first_year)
    tm_df["Last_Year"] = tm_df.Year_Counts.apply(get_last_year)
    tm_df["Entropy"] = tm_df.Year_Counts.apply(get_entropy)
    tm_df["Embeddings"] = tm.topic_embeddings_.tolist()

    hierarchy = tm.hierarchical_topics(docs)

    df.to_pickle(save_path/'df.pkl')
    df_risks.to_pickle(save_path/'df_risks.pkl')
    tm_df.to_pickle(save_path/'tm_df.pkl')
    hierarchy.to_pickle(save_path/'hierarchy.pkl')
    return [df, tm_df, hierarchy, tm, matching_dict]

In [6]:
# Fit the topic model, compute the similarity tables and write all artefacts to disk.
# Copies are passed because the function adds, renames and drops columns on both frames in place.
# top_k=100 is the number of nearest matches requested per query in the similarity searches.
results = create_and_save_topic_model_and_sematic_similarity(df.copy(), embedding_name=embedding_name, n_neigbors=n_neighbors, df_risks=df_risks.copy(), top_k=100)

100%|██████████| 193/193 [00:02<00:00, 93.67it/s] 
