# Topic Clustering

In [1]:
from rec_sys_uni.datasets import datasets as ds
from bertopic import BERTopic
from umap import UMAP
import matplotlib.pyplot as plt
import numpy as np
import contextualSpellCheck
import pandas as pd
import spacy
from tqdm.auto import tqdm
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.cluster import KMeans, AgglomerativeClustering
import pyLDAvis
import pyLDAvis.lda_model as lda
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
%load_ext autoreload
%autoreload 2

In [2]:
# Check whether PyTorch can see a CUDA device; the cell output shows the result.
import torch
torch.cuda.is_available()

True

In [9]:
import os

# Step out of the "notebooks" folder into its parent directory.
# The previous string replace on "\\notebooks" only matched Windows-style
# separators, so on POSIX systems the working directory was left unchanged.
# Checking the last path component keeps the cell safe to re-run.
_notebook_cwd = os.path.normpath(os.getcwd())
if os.path.basename(_notebook_cwd) == "notebooks":
    os.chdir(os.path.dirname(_notebook_cwd))

In [10]:
# Get Course Data
# Later cells iterate over its keys and read each entry's
# 'course_name', 'description' and 'ilos' fields.
course_data = ds.get_course_data()

In [None]:
# Build one text document per course (name, description, learning outcomes)
# and keep the course names in a parallel list.
docs = []
title = []
for code in course_data:
    course = course_data[code]
    name = course['course_name']
    # The generic word "course" is swapped for the actual course name
    # after lower-casing the surrounding text.
    description = course['description'].lower().replace('course', name)
    outcomes = " ".join(course['ilos']).lower().replace('course', name)
    title.append(name)
    docs.append("\n".join((name.lower(), description, outcomes)))

In [None]:
# English stop words from NLTK, extended with terms that appear throughout the course texts.
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend([
    'student', 'students', 'course', 'courses', 'learning', 'knowledge',
    'skills', 'skill', 'understanding', 'understand', 'able', 'use',
    'english', 'studies', 'science', 'part', 'one', 'texts', 'text',
    'study', 'topics', 'topic', 'basic', 'principles', 'principle',
    'introduction', 'introductions', 'introduces', 'introduce',
    'introducing', 'provide', 'provides', 'provide', "education",
])

## BERTopic


### Sentence Transformer

In [None]:
# Candidate embedding models:
#   Asymmetric search:
#     1. sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco  (keyBert, Dot Product)
#     2. msmarco-distilbert-base-v4 (keyBert, Cosine Similarity)
#     3. intfloat/e5-large-v2 (keyBert, Cosine Similarity)
#   Symmetric search:
#     1. all-MiniLM-L12-v2 (keyBert, Cosine Similarity)
#     2. BAAI/bge-large-en-v1.5
#
# Use the GPU when one is visible and fall back to the CPU otherwise, so the
# notebook still runs on machines without CUDA (torch is imported in the
# CUDA-check cell near the top).
sentence_model = SentenceTransformer(
    "BAAI/bge-large-en-v1.5",
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

### Dimensionality Reduction

In [None]:
# A fixed random_state makes the reduced embedding, and therefore the topic
# assignments derived from it, repeatable across kernel restarts.
umap_model = UMAP(n_neighbors=2, n_components=20, min_dist=0.01, spread=3, metric='cosine', random_state=42)

### Clustering

In [None]:
# Clustering model handed to BERTopic below. Commented-out alternatives are kept for reference:
# hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# cluster_model = KMeans(n_clusters=20, random_state=42)
# n_clusters=30 lines up with the community-building cell further down, which
# allocates 30 topic buckets plus one extra bucket.
cluster_model = AgglomerativeClustering(n_clusters=30, linkage='ward')

### Vectorizers

In [None]:
# Keyphrase-based count vectorizer configured with the extended stop-word list built above.
vectorizer_model = KeyphraseCountVectorizer(stop_words=stop_words)

### cTF-IDF

In [None]:
# cTF-IDF transformer created with its default settings.
ctfidf_model = ClassTfidfTransformer()

### Topic Model

In [None]:
# Assemble the BERTopic pipeline from the components configured in the cells above.
topic_model = BERTopic(

    # Pipeline models
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=cluster_model,  # clustering slot; receives the AgglomerativeClustering model
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,  # was built above but never handed to the pipeline

    # Hyperparameters
    top_n_words=30,
    verbose=True
)

In [None]:
# Train model
# fit_transform returns two values, bound here as `topics` and `probs`.
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Fine-tune topic representations after training BERTopic.
# A new KeyphraseCountVectorizer is constructed with the same stop-word list as
# the one passed to BERTopic, and the topic representations are updated with it.
vectorizer_model = KeyphraseCountVectorizer(stop_words=stop_words)
topic_model.update_topics(docs, vectorizer_model=vectorizer_model)

In [None]:
# Show topics: display the topic overview table returned by the fitted model.
topic_model.get_topic_info()

In [None]:
# Render the topic heatmap; uncomment the write_html line to save it to disk.
fig = topic_model.visualize_heatmap()
# fig.write_html("matrix.html")
fig

In [None]:
# Render the topic map; uncomment the write_html line to save it to disk.
fig = topic_model.visualize_topics()
# fig.write_html("map.html")
fig

In [None]:
# Embed every course document once; the same embeddings are reused by the later fit_transform call.
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Plot the documents, passing the course titles in place of the full document texts.
fig = topic_model.visualize_documents(title, embeddings=embeddings, custom_labels=True, title="Course Clustering")
# fig.write_html("course_clustering.html")
fig

In [None]:
# NOTE(review): this fits the topic model a second time (it was already fitted above),
# now on the precomputed embeddings. The first element of the result is read in the
# next cell as the per-course topic id. Confirm the refit is intended, since the
# earlier tables and figures were produced from the first fit.
cluster = topic_model.fit_transform(docs, embeddings=embeddings)

In [None]:
# Map each course name to the topic id assigned to its document
# (documents were built in the same iteration order as course_data).
title_and_cluster = {
    course_data[code]['course_name']: cluster[0][position]
    for position, code in enumerate(course_data)
}

In [None]:
pyLDAvis.enable_notebook()

# Keyphrase document-term matrix -> 20-component LDA -> pyLDAvis panel.
tf_vectorizer = KeyphraseCountVectorizer(stop_words=stop_words)
dtm_tf = tf_vectorizer.fit_transform(docs)
lda_tf = LatentDirichletAllocation(
    n_components=20,
    max_iter=1000,
    random_state=0,
).fit(dtm_tf)
fig = lda.prepare(lda_tf, dtm_tf, tf_vectorizer)
fig

In [None]:
# pyLDAvis.save_html(fig, 'lda.html')

# Keyword Extraction

In [None]:
# Embedding model "BAAI/bge-large-en-v1.5" drives both KeyBERT and the similarity checks below.
# Use the GPU when one is visible and fall back to the CPU otherwise, so this
# section also runs on machines without CUDA (torch is imported in the
# CUDA-check cell near the top).
sentence_model = SentenceTransformer(
    "BAAI/bge-large-en-v1.5",
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
vectorizer = KeyphraseCountVectorizer(stop_words=stop_words)
kw_model = KeyBERT(model=sentence_model)

In [None]:
def get_desc(code):
    """Return the text document for one course.

    The document is three newline-separated parts: the lower-cased course
    name, the lower-cased description, and the lower-cased learning outcomes
    joined by spaces. In the last two parts every occurrence of "course" is
    replaced by the course name as stored in ``course_data``.

    Args:
        code: Key of the course in the module-level ``course_data``.

    Returns:
        The assembled description string.
    """
    course = course_data[code]
    name = course['course_name']
    description = course['description'].lower().replace('course', name)
    outcomes = " ".join(course['ilos']).lower().replace('course', name)
    return "\n".join((name.lower(), description, outcomes))

In [None]:
# Extract up to 30 candidate keyphrases per course with KeyBERT.
keywords_data = {}
progress_bar = tqdm(total=len(course_data))
for code in course_data:
    keywords_data[code] = kw_model.extract_keywords(
        get_desc(code),
        vectorizer=vectorizer,
        top_n=30,
    )
    progress_bar.update(1)

In [None]:
# Keep confident keyphrases only, spell-check them, and reduce them to lemmas.
new_keyword_data = {}
nlp = spacy.load('en_core_web_trf')
contextualSpellCheck.add_to_pipe(nlp)
progress_bar = tqdm(total=len(keywords_data))
for code, candidates in keywords_data.items():
    cleaned = []
    new_keyword_data[code] = cleaned
    for candidate in candidates:
        phrase, score = candidate[0], candidate[1]
        # Skip bare stop words and anything scoring at or below the 0.55 cut-off.
        if phrase in stop_words or not score > 0.55:
            continue
        checked = nlp(phrase)
        # Use the corrected text only when the spell checker actually changed something.
        corrected = checked._.outcome_spellCheck if checked._.performed_spellCheck else phrase
        lemmas = [token.lemma_ for token in nlp(corrected)]
        cleaned.append((" ".join(lemmas), score))
    progress_bar.update(1)

In [None]:
# Load previously saved keywords from a pickle file.
# NOTE(review): this rebinds new_keyword_data, replacing whatever the cell above computed.
# NOTE(review): the path starts with a backslash and uses Windows separators — confirm it resolves as intended.
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
import pickle
with open('\\knowledge_graph\\keywords.pickle', 'rb') as handle:
    new_keyword_data = pickle.load(handle)

In [None]:
# Remove stop words from inside each keyphrase.
# A phrase made up only of stop words collapses to an empty string; such
# entries are dropped here so that "" never becomes a keyword or graph node.
stop_word_set = set(stop_words)  # set membership instead of scanning the list per word
for code in tqdm(new_keyword_data):
    filtered = []
    for phrase, score in new_keyword_data[code]:
        kept = " ".join(word for word in phrase.split() if word not in stop_word_set)
        if kept:
            filtered.append((kept, score))
    new_keyword_data[code] = filtered

In [None]:
# Collapse the (phrase, score) pairs to the distinct phrases of each course.
final_keywords = {}
progress_bar = tqdm(total=len(new_keyword_data))
for code in new_keyword_data:
    phrases = [entry[0] for entry in new_keyword_data[code]]
    progress_bar.update(1)
    final_keywords[code] = list(set(phrases))

In [None]:
# Distinct keyphrases across all courses.
keywords_list = list({
    keyword
    for course_keywords in final_keywords.values()
    for keyword in course_keywords
})
print(len(keywords_list))

In [None]:
# Embed every keyphrase and every course description; rows of full_desc_emb
# follow the iteration order of final_keywords.
keywords_list_emb = sentence_model.encode(keywords_list)
full_desc = [get_desc(code) for code in final_keywords]
full_desc_emb = sentence_model.encode(full_desc)

In [None]:
# Give each course every keyphrase whose embedding has cosine similarity above
# 0.70 with the course description (duplicates are removed in the next cell).
# All similarities come from one matrix call instead of one sklearn call per
# (course, keyword) pair; rows follow final_keywords, columns follow keywords_list.
if len(final_keywords) and len(keywords_list):
    desc_keyword_sim = cosine_similarity(full_desc_emb, keywords_list_emb)
    for row, code in enumerate(tqdm(final_keywords)):
        for col in np.flatnonzero(desc_keyword_sim[row] > 0.70):
            final_keywords[code].append(keywords_list[col])


In [None]:
# Drop duplicate keyphrases per course.
for code, course_keywords in final_keywords.items():
    final_keywords[code] = list(set(course_keywords))

In [None]:
keywords_key = {}
progress_bar = tqdm(range(len(keywords_list)))
for prog, i in enumerate(keywords_list):
    keywords_key[i] = []
    emb = keywords_list_emb[prog].reshape(1, -1)
    for index, k in enumerate(keywords_list):
        emb2 = keywords_list_emb[index].reshape(1, -1)
        results = cosine_similarity(emb, emb2)[0][0]
        if results > 0.70 and k != i:
            keywords_key[i].append(k)
    progress_bar.update(1)


In [None]:
# Empty edge table: two endpoint columns, their similarity, and a 'course' flag.
df = pd.DataFrame(columns=['node_1', 'node_2', 'sim', 'course'])

In [None]:
# Build the course -> keyphrase edges and keep those with similarity above 0.50.
#
# Changes from the earlier version of this cell:
#   * the course name is held in `course_title`, so the global `title` list used
#     by the document plot is no longer overwritten with a string;
#   * rows are collected in a list and the frame is built once, rather than
#     growing it one `.loc` row at a time;
#   * keyphrase positions come from a dict rather than a linear list search.
# The frame is rebuilt from scratch, so the cell gives the same result when re-run.
keyword_position = {keyword: pos for pos, keyword in enumerate(keywords_list)}
edge_rows = []
for row, code in enumerate(tqdm(final_keywords)):
    course_title = course_data[code]['course_name']
    course_keywords = final_keywords[code]
    if not course_keywords:
        continue  # nothing to compare; an empty selection cannot be passed to cosine_similarity
    cols = [keyword_position[keyword] for keyword in course_keywords]
    sims = cosine_similarity(full_desc_emb[row].reshape(1, -1), keywords_list_emb[cols])[0]
    edge_rows.extend(
        [course_title, keyword, sim, True]
        for keyword, sim in zip(course_keywords, sims)
    )

# `count` stays the next free row label; the course-to-course cell continues from it.
count = len(edge_rows)
df = pd.DataFrame(edge_rows, columns=['node_1', 'node_2', 'sim', 'course'])
df = df[df['sim'] > 0.50]

In [None]:
# Disabled: adding keyword-to-keyword edges produces a lot of connections.
# progress_bar = tqdm(range(len(keywords_key)))
# for i in keywords_key:
#     index_i = keywords_list.index(i)
#     for j in keywords_key[i]:
#         index_j = keywords_list.index(j)
#         sim = cosine_similarity(keywords_list_emb[index_i].reshape(1, -1), keywords_list_emb[index_j].reshape(1, -1))[0][0]
#         df.loc[count] = [i, j, sim, False]
#         count += 1
#     progress_bar.update(1)

In [None]:
# Add course -> course edges for every ordered pair of distinct courses whose
# description embeddings have cosine similarity above 0.66.
# The similarity matrix is computed in one call instead of one sklearn call per
# pair, and the new rows are appended in a single concat. Row labels continue
# from `count`, as they did when rows were added one at a time.
course_titles = [course_data[code]['course_name'] for code in final_keywords]
course_edges = []
if course_titles:
    course_sim = cosine_similarity(full_desc_emb)
    # np.nonzero walks the matrix row by row, matching the nested-loop order.
    for x, y in zip(*np.nonzero(course_sim > 0.66)):
        if x != y:
            course_edges.append([course_titles[x], course_titles[y], course_sim[x, y], True])

if course_edges:
    new_rows = pd.DataFrame(
        course_edges,
        columns=df.columns,
        index=range(count, count + len(course_edges)),
    )
    df = pd.concat([df, new_rows]) if len(df) else new_rows
    count += len(course_edges)

In [None]:
# Load the edge table from a CSV file.
# NOTE(review): this rebinds df, replacing the edge table built in the cells above.
# NOTE(review): the path starts with a backslash and uses Windows separators — confirm it resolves as intended.
df = pd.read_csv('\\knowledge_graph\\keywords_small.csv')

# Knowledge Graph

In [None]:
NUM_COLORS = 30

# Preview the palette: one line per colour sampled evenly from 'gist_rainbow'.
cm = plt.get_cmap('gist_rainbow')
fig, ax = plt.subplots()
preview_colors = [cm(1. * step / NUM_COLORS) for step in range(NUM_COLORS)]
ax.set_prop_cycle(color=preview_colors)
for slope in range(1, NUM_COLORS + 1):
    ax.plot(np.arange(10) * slope)

plt.show()

In [None]:
# Unique node labels across both edge endpoints; the cell output shows how many there are.
nodes = pd.concat([df['node_1'], df['node_2']], axis=0).unique()
nodes.shape

In [None]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph (labels are stored as strings)
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph; the similarity is stored as both weight and title
for row_idx, row in df.iterrows():
    similarity = row['sim']
    G.add_edge(str(row["node_1"]), str(row["node_2"]), weight=similarity, title=similarity)

In [None]:
# Inspect the course-name -> topic-id mapping built earlier.
title_and_cluster

In [None]:
# Create communities: buckets 0-29 are indexed by topic id, and bucket 30
# collects every node that has no entry in title_and_cluster.
communities = [set() for _ in range(31)]
for row_idx, row in df.iterrows():
    for node in (row['node_1'], row['node_2']):
        bucket = title_and_cluster.get(node, 30)
        communities[bucket].add(node)
communities

In [None]:
# Number of community buckets (the cell above creates 31).
len(communities)

In [None]:
## Now add these colors to communities and make another dataframe
def colors2Community(communities, NUM_COLORS) -> pd.DataFrame:
    ## Define a color palette
    p = [cm(1.*i/NUM_COLORS) for i in range(NUM_COLORS)]
    rows = []
    group = 0
    for community in communities:
        if group == 30:
            color = "lightgrey"
        else:
            color = p.pop()
        group += 1
        print(group)
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


# Color table for the graph nodes, built with a 30-color palette.
colors = colors2Community(communities=communities, NUM_COLORS=30)
colors

In [None]:
# Copy group, color and degree-based size onto the graph nodes.
# Graph nodes were added under str(label), so the lookup key is converted the
# same way; a non-string label would otherwise raise KeyError here.
for row_idx, row in colors.iterrows():
    node = str(row['node'])
    G.nodes[node]['group'] = row['group']
    G.nodes[node]['color'] = row['color']
    G.nodes[node]['size'] = G.degree[node]

In [None]:
# Export the networkx graph as a pyvis HTML page.
from pyvis.network import Network

# Despite the variable name, this is the output HTML file name passed to net.show below.
graph_output_directory = "index_x.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

# Load nodes, edges and their attributes from the graph built above.
net.from_nx(G)
net.repulsion(node_distance=150, spring_length=400)
# Alternative layout settings kept for reference:
# net.force_atlas_2based(central_gravity=-1, gravity=-51)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])
net.toggle_physics(False)

net.show(graph_output_directory, local=False)