In [None]:
import re, pickle, os, torch, csv
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from bertopic import BERTopic
import datetime
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import math

import statsmodels.api as sm
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import minmax_scale
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics.pairwise import cosine_similarity
from scipy.interpolate import interp1d
from sklearn.utils.extmath import safe_sparse_dot
import plotly.figure_factory as ff
import plotly.graph_objects as go

from cuml.preprocessing import Normalizer


## Notes
- 5/8/24
    - Created file
    - added *topics_over_time* function:
        - function that takes the document dataframe (with topic assignments and bin_period) and the label representations, and transforms the data into a matrix of min-max normalized relative frequencies per bin period for each topic
        - additionally displays information on a heatmap
    - added *gpu_cosine_similarity* function:
        - uses cuml's normalizer to speed up cosine similarity calculation
    - **Will need to address label representations eventually**
    - added **topic comparisons**
        - takes topic embeddings and calculates cosine similarity between them 


### Functions

In [None]:
#Takes docs and transforms into a relative topic frequency matrix based on bin period
def topics_over_time(documents, topic_labels, save_img=None, title=None, index=None):
    """Build and plot a min-max normalised topic-by-bin frequency matrix.

    For every topic the share of its documents in each ``bin_period`` is
    computed, then rescaled per topic so the least frequent bin is 0 and the
    most frequent bin is 1. Topics are ordered by the position of their peak
    bin and shown as a heatmap.

    Parameters
    ----------
    documents : pandas.DataFrame
        Needs a ``Topic`` column and a ``bin_period`` column.
    topic_labels : sequence
        Labels looked up by the row position of each topic in the grouped
        frame (groupby sorts topics, so position 0 is the smallest topic id).
    save_img : path-like, optional
        If given, the figure is also written there with ``fig.write_image``.
    title : str, optional
        Figure title.
    index : array-like of int, optional
        Row positions of the topics to keep.

    Returns
    -------
    pandas.DataFrame
        Columns: ``index`` (row position before sorting), ``Topic``, one
        column per bin period, and ``largest`` (position of the peak bin).
    """
    topic_frequency_norm = (
        documents.groupby('Topic')['bin_period']
        .value_counts(normalize=True)
        .unstack(fill_value=0)
    )
    # in case bins were not split correctly
    if len(topic_frequency_norm.columns) == 21:
        df = topic_frequency_norm.drop(20, axis=1)
    else:
        df = topic_frequency_norm

    # Select bins by name from here on; fixed positional slices only work
    # when exactly 20 bins are present.
    bin_columns = list(df.columns)

    # min max normalization (row-wise); a topic with identical frequency in
    # every bin has zero range and therefore yields a NaN row.
    row_min = df.min(axis=1)
    row_range = df.max(axis=1) - row_min
    df_norm = df.sub(row_min, axis=0).div(row_range, axis=0)
    df_norm = df_norm.reset_index()
    if index is not None:
        df_norm = np.take(df_norm, index, axis=0)

    # extra column: position of the bin where the min-max score peaks, used
    # to order the heatmap rows
    df_norm['largest'] = df_norm[bin_columns].values.argmax(axis=1)

    df_norm = df_norm.sort_values('largest')
    # keeps the pre-sort row position in an 'index' column for label lookup
    df_norm = df_norm.reset_index()
    sorted_index = [topic_labels[i] for i in df_norm['index']]

    # plotting matrix
    fig = px.imshow(
        df_norm[bin_columns],
        width=1400,
        height=2200,
        aspect="auto",
        color_continuous_scale='deep',
    )
    fig.update_layout(
        yaxis=dict(
            tickvals=list(range(len(df_norm['index']))),
            ticktext=sorted_index,
            tickfont=dict(size=13),
            title=dict(font_size=18, text="<i><b>Topic Labels</b></i>"),
        ),
        xaxis=dict(title=dict(font_size=18, text="<i><b>Bin Period</b></i>")),
        margin=dict(l=600),
    )
    if title is not None:
        fig.update_layout(
            title=dict(text=title, y=0.97, x=0.5, xanchor='center')
        )
    fig.show()
    if save_img is not None:
        fig.write_image(save_img)

    return df_norm

In [None]:
# replication of scikit-learn's cosine similarity, with cuml's normalizer for speed
def gpu_cosine_similarity(matrix, matrix2=None, max=None):
    """Cosine similarity between rows, using cuml's Normalizer for the scaling.

    Parameters
    ----------
    matrix : array-like, shape (n_rows, n_features)
    matrix2 : array-like, shape (m_rows, n_features), optional
        If omitted, the rows of ``matrix`` are compared with each other.
    max : int, optional
        Approximate number of similarities to sample. ``round(sqrt(max))``
        row indices are drawn (with replacement) on each side and every pair
        of drawn rows is compared. If omitted, the full matrix is computed.

    Returns
    -------
    The full similarity matrix when ``max`` is None, otherwise a flat list of
    sampled similarity values.
    """
    norm = Normalizer()
    norm_matrix = norm.transform(matrix)
    if matrix2 is None:
        norm_other = norm_matrix
        n_other = matrix.shape[0]
    else:
        norm_other = norm.transform(matrix2)
        n_other = matrix2.shape[0]

    if max is None:
        return safe_sparse_dot(norm_matrix, norm_other.T, dense_output=True)

    num_samples = round(math.sqrt(max))
    rows = np.random.randint(0, matrix.shape[0], num_samples)
    # the second side is sampled from its own row count
    cols = np.random.randint(0, n_other, num_samples)
    # One matrix product for all sampled pairs; row-major flattening gives
    # the order "for i in rows: for j in cols".
    # NOTE(review): when matrix2 is None the sample can contain self-pairs
    # (similarity 1.0) because both sides are drawn independently.
    sampled = safe_sparse_dot(norm_matrix[rows], norm_other[cols].T, dense_output=True)
    return np.asarray(sampled).ravel().tolist()
                


In [None]:
def _assign_bin_periods(dates, bin_num):
    """Return, for each date, the index of the equal-frequency time bin it falls in."""
    sorted_dates = dates.sort_values(ignore_index=True)
    # A bin holds at least one document; with fewer than bin_num documents
    # the integer division would otherwise give a step of 0.
    bin_size = max(1, len(sorted_dates) // bin_num)
    # Bin k starts at the date of the (k * bin_size)-th document in date order.
    bin_starts = sorted_dates.iloc[::bin_size].reset_index(drop=True)
    # A bin ends one day before the next bin starts ...
    bin_ends = bin_starts.shift(-1) - pd.Timedelta(days=1)
    # ... and the last bin is closed by a sentinel one day past the latest date.
    bin_ends.iloc[-1] = sorted_dates.iloc[-1] + pd.Timedelta(days=1)
    # side='left' counts the end dates strictly earlier than each date, i.e.
    # the first bin whose end date is not before the document's date.
    return np.searchsorted(bin_ends.to_numpy(), dates.to_numpy(), side='left')


def expand_docs(documents, data, topic_model, save_dir=None, bin_num=20):
    """Add ``Year``, ``Journal``, ``Date`` and ``bin_period`` columns to ``documents``.

    Documents are ordered by date and cut into consecutive bins of
    ``len(documents) // bin_num`` documents each; ``bin_period`` is the index
    of the bin a document's date falls into. When the document count is not a
    multiple of ``bin_num`` the leftover documents form one extra trailing bin
    (index ``bin_num``).

    Parameters
    ----------
    documents : pandas.DataFrame
        Modified in place and also returned.
    data : pandas.DataFrame
        Needs ``Date`` and ``Journal`` columns; values are aligned to
        ``documents`` by index.
    topic_model : object
        Not used by this function; kept for interface compatibility.
    save_dir : path-like, optional
        If given, the result is pickled to ``save_dir / 'new_docs.pickle'``.
    bin_num : int, default 20
        Target number of equal-frequency bins.

    Returns
    -------
    pandas.DataFrame
        The same object as ``documents``.
    """
    docs = documents
    dates = pd.to_datetime(data['Date'])
    docs['Year'] = dates.dt.year
    docs['Journal'] = data['Journal']
    # docs = docs.drop(columns='ID')
    docs['Date'] = dates

    if len(docs) == 0:
        # nothing to bin
        docs['bin_period'] = 0
    else:
        # assumes every document has a valid (non-NaT) Date -- TODO confirm
        docs['bin_period'] = _assign_bin_periods(docs['Date'], bin_num)

    if save_dir is not None:
        with open(Path(save_dir) / 'new_docs.pickle', "wb") as f:
            pickle.dump(docs, f)
        print(f"Docs saved at: {save_dir}")

    return docs

### loading info

In [None]:
# Load the fitted BERTopic model for journals 0-264 from the model-output directory.
# NOTE(review): absolute, user-specific path; `dir_0full` is rebound to a different
# directory in later cells, so its value depends on which cell ran last.

dir_0full = Path('/mnt/scratch/ande2472/model_output/topic_modeling/0_to_264/')

topic_model_0 = BERTopic.load(dir_0full/'model_outliers_reduced')
# Loading the document table is disabled here; `docs_0` is assigned in later cells.
# with open(dir_0full/'new_docs.pickle', "rb") as f:
#   docs_0 = pickle.load(f)

In [None]:
# Display the model's per-topic summary table.
topic_model_0.get_topic_info()

In [None]:
# Lists the attributes of the loaded model.
# NOTE(review): interactive exploration leftover; nothing below uses this output.
dir(topic_model_0)

In [None]:
# Two candidate label sets for the topics (which one to keep is still undecided):
#   topic_labels2 - 4 words per label, 15-character cap, "MMR" aspect
#   topic_labels  - 3 words per label, 20-character cap, "KeyBERT" aspect
# Both join the words with '|'.

topic_labels2 = topic_model_0.generate_topic_labels(nr_words=4, word_length=15, aspect="MMR", separator='|')
topic_labels = topic_model_0.generate_topic_labels(nr_words=3, word_length=20, aspect="KeyBERT", separator='|')

### heatmap

In [None]:
# Draw the topic-by-bin heatmap; `matrix` receives the sorted frame returned by topics_over_time.
# NOTE(review): `docs_0` is first assigned in a later cell, so this cell fails on a
# fresh top-to-bottom run.
matrix = topics_over_time(docs_0, topic_labels, title=None)

### topic comparisons

In [None]:
# Topic-by-topic similarity matrix: pairwise cosine similarity of the model's
# topic embeddings (full matrix, since `max` is not passed).
topic_embeddings = topic_model_0.topic_embeddings_
sim_matrix = gpu_cosine_similarity(topic_embeddings)

In [None]:
# View the similarity distributions of the first 20 topics to each other.
# The slice end is exclusive, so [:20] is what selects 20 rows / labels.
fig = ff.create_distplot(sim_matrix[:20], topic_labels2[:20], bin_size=.025)
fig.show()

### within topic similarity

In [None]:
# Load a precomputed within-topic document similarity matrix (produced by
# topicsimilarity.py); this example file is for topic 1 of journals 0-264.
# NOTE(review): rebinds `dir_0full` to a different directory than the model-loading
# cell uses, and `save_dir` is actually a file path that is read, not written.
# pickle.load executes arbitrary code from the file -- only load trusted files.


dir_0full = Path('/mnt/scratch/ande2472/sjrouts/0to264_full/')
save_dir = dir_0full/'sim_matrix_test.pickle'
with open(save_dir, "rb") as f:
    sim_mat = pickle.load(f)

In [None]:
# Load the pickled document table into `docs_0`.
# pickle.load executes arbitrary code from the file -- only load trusted files.
dir_0full = Path('/mnt/scratch/ande2472/sjrouts/0to264_full/')
with open(dir_0full/'new_docs.pickle', "rb") as f:
        docs_0 = pickle.load(f)

In [None]:
# Index labels of the documents assigned to topic 1, and of all other documents.
topic_list = list(docs_0[docs_0['Topic'] == 1].index)
topic_unlist = list(docs_0[docs_0['Topic'] != 1].index)

In [None]:
# For a topic with 55033 documents the full similarity matrix has ~3 billion
# (55033**2 = 3028631089) entries, so scaling this up to between-topic similarity
# may not be feasible.
# Keep only the entries strictly above the diagonal (k=1): each unordered pair
# once, without self-similarities.
upper_triangle_no_diag = sim_mat[np.triu_indices_from(sim_mat, k=1)]

In [None]:
# Largest pairwise similarity among the off-diagonal entries.
max_val = np.max(upper_triangle_no_diag)

In [None]:
# Smallest pairwise similarity among the off-diagonal entries.
min_val = np.min(upper_triangle_no_diag)

In [None]:
# Histogram bin edges: 0, 0.025, ..., 0.975 (the stop value 1 is excluded).
array = np.arange(0, 1, 0.025)

In [None]:
# NOTE(review): `bin_count` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh run.
bin_count['Count'][40]

In [None]:
# Abandoned attempt at tallying similarity values into bins with a Python loop.
# NOTE(review): this cell cannot work as written:
#   - `bin_count` is never defined in the visible notebook;
#   - `bin > 38` is False for bin = 0, so the while-loop body never runs;
#   - `bin_count +=1` increments the table rather than the `bin` counter;
#   - `bin` shadows the builtin of the same name.
# The np.digitize / np.bincount cells that follow perform the tally instead.
for num in tqdm(upper_triangle_no_diag):
        bin = 0
        while num > bin_count['Count'][bin] and bin > 38:
            bin_count +=1
        bin_count['Count'][bin] += 1

In [None]:
# For each similarity value, the index i with array[i-1] < value <= array[i]
# (right=True makes the right edge inclusive); values above the last edge get len(array).
bin_indices = np.digitize(upper_triangle_no_diag, array, right=True)

In [None]:
# np.digitize yields indices 0..len(array) (len(array) for values above the last
# edge), so reserve len(array) + 1 slots. This keeps the result at len(array)
# entries even when no value exceeds the last edge.
bin_counts = np.bincount(bin_indices, minlength=len(array) + 1)
# Fold the overflow slot (values above the last edge) into the last regular bin.
bin_counts[-2] += bin_counts[-1]
bin_counts = bin_counts[:-1]

In [None]:
# Display the per-bin counts.
bin_counts

In [None]:
# Pair each bin edge with its count; both inputs must have the same length.
bin_count_df = pd.DataFrame({'Bin': array, 'Count': bin_counts})

In [None]:
# Bar chart of the binned similarity counts (a histogram built from the
# pre-aggregated `bin_count_df` instead of the raw values).
fig = go.Figure(data=[
    go.Bar(
        x=bin_count_df['Bin'],
        y=bin_count_df['Count'],
        width=0.02  # Adjust the width to match the bin spacing
    )
])
fig.update_layout(
    title='Histogram of Bin Values and Counts',
    xaxis_title='Bin Value',
    yaxis_title='Count',
    bargap=0.2  # Adjust gap between bars
)

fig.show()

In [None]:
# Rebuilds the topic-similarity distplot from the "topic comparisons" section.
# NOTE(review): the figure is never shown here, and [:19] selects 19 rows/labels.
fig = ff.create_distplot(sim_matrix[:19], topic_labels2[:19],bin_size=.025)

In [None]:
def gen_histogram(array):
    """Return the value range ``(min_val, max_val)`` of ``array``.

    NOTE(review): the name suggests a histogram builder, but only the range
    computation was ever written; this completes that step and returns it.
    """
    max_val = np.max(array)
    min_val = np.min(array)
    return min_val, max_val

In [None]:
# Distplot built directly from every off-diagonal similarity value of topic 1.
# Too slow to be practical on the full array; the pre-binned bar chart is used instead.
fig = ff.create_distplot([upper_triangle_no_diag],['Topic 1'] ,bin_size=.025)
fig.show()

In [None]:
# Bar chart of the binned similarity counts from `bin_count_df`.
# NOTE(review): identical to an earlier cell in this section.
fig = go.Figure(data=[
    go.Bar(
        x=bin_count_df['Bin'],
        y=bin_count_df['Count'],
        width=0.02  # Adjust the width to match the bin spacing
    )
])
fig.update_layout(
    title='Histogram of Bin Values and Counts',
    xaxis_title='Bin Value',
    yaxis_title='Count',
    bargap=0.2  # Adjust gap between bars
)

fig.show()

In [None]:
# Small toy matrices (3x4 and 6x4) for trying out gpu_cosine_similarity by hand.
array1 = np.array([[3,4,5,6],[2,3,2,3],[3,4,5,3]])
array2 = np.array([[3,4,5,6],[2,3,2,3],[2,3,2,3],[3,4,5,3],[3,4,5,3],[3,4,5,3]])

In [None]:
# replication of scikit-learn's cosine similarity, with cuml's normalizer for speed
def gpu_cosine_similarity(matrix, matrix2=None, max=None):
    """Cosine similarity between rows, using cuml's Normalizer for the scaling.

    Parameters
    ----------
    matrix : array-like, shape (n_rows, n_features)
    matrix2 : array-like, shape (m_rows, n_features), optional
        If omitted, the rows of ``matrix`` are compared with each other.
    max : int, optional
        Approximate number of similarities to sample. ``round(sqrt(max))``
        row indices are drawn (with replacement) on each side and every pair
        of drawn rows is compared. If omitted, the full matrix is computed.

    Returns
    -------
    The full similarity matrix when ``max`` is None, otherwise a flat list of
    sampled similarity values.
    """
    norm = Normalizer()
    norm_matrix = norm.transform(matrix)
    if matrix2 is None:
        norm_other = norm_matrix
        n_other = matrix.shape[0]
    else:
        norm_other = norm.transform(matrix2)
        n_other = matrix2.shape[0]

    if max is None:
        return safe_sparse_dot(norm_matrix, norm_other.T, dense_output=True)

    num_samples = round(math.sqrt(max))
    rows = np.random.randint(0, matrix.shape[0], num_samples)
    # The second side is sampled from its own row count (matrix2's when it is
    # given); using matrix's count there either misses rows of matrix2 or
    # indexes past its end.
    cols = np.random.randint(0, n_other, num_samples)
    # One matrix product for all sampled pairs; row-major flattening gives
    # the order "for i in rows: for j in cols".
    # NOTE(review): when matrix2 is None the sample can contain self-pairs
    # (similarity 1.0) because both sides are drawn independently.
    sampled = safe_sparse_dot(norm_matrix[rows], norm_other[cols].T, dense_output=True)
    return np.asarray(sampled).ravel().tolist()
                    


In [None]:
# Display the last (4th) column of the toy matrix.
array1[:,3]

In [None]:
# Sampled mode on the toy matrices: with max=7 the function draws
# round(sqrt(7)) = 3 row indices per side and returns 3 * 3 = 9 similarities.
k = gpu_cosine_similarity(array1,array2, max=7)

In [None]:
# Display the sampled similarities.
k

In [None]:
# Show `array` without the elements at positions 2, 4 and 5.
# Unary minus is not defined for a Python list (array[-i] raises TypeError), and
# NumPy has no "negative index means exclude" shorthand; np.delete returns a copy
# with those positions removed.
# NOTE(review): assumes exclusion was the intent -- confirm.
i = [2,4,5]
np.delete(array, i)

In [None]:
# Load the pickled document embeddings for the top-journals data set.
# pickle.load executes arbitrary code from the file -- only load trusted files.
file_dir = Path('/mnt/scratch/ande2472/data/0_topjournals/')
with open(file_dir/'0_topjournals_embs.pickle', "rb") as f:
    emb = pickle.load(f)

In [None]:
# Read the tab-separated document table that accompanies the embeddings.
docs = pd.read_csv(file_dir/'0_topjournals.csv', sep='\t')


In [None]:
# Drop the leftover index columns and the text columns.
documents = docs.drop(['Unnamed: 0','index','Title','Abstract'], axis=1)

In [None]:
# Attach the model's topic assignment to each document.
# assumes the CSV rows are in the same order as the documents the model was fitted on -- TODO confirm
documents['Topic'] = topic_model_0.topics_

In [None]:
# Persist the document table (with topics) as new_docs.pickle.
# NOTE(review): the target directory is whatever `dir_0full` was last bound to, and
# this overwrites the file an earlier cell loads `docs_0` from.
with open(dir_0full/'new_docs.pickle', "wb") as f:
        pickle.dump(documents,f)

In [None]:
# `docs_0` now refers to the same DataFrame object as `documents` (no copy is made),
# replacing the table loaded from the pickle earlier.
docs_0 = documents

In [None]:
# Sampled similarities among the documents of the selected topic
# (round(sqrt(1000000)) = 1000 draws per side).
# assumes the index labels in `topic_list` are valid row positions in `emb` -- TODO confirm
within_similarity = gpu_cosine_similarity(emb[topic_list],max=1000000)

In [None]:
# Sampled similarities among the documents NOT in the selected topic.
# NOTE(review): both sides of every pair come from outside the topic, so this is a
# background distribution rather than topic-vs-rest; passing emb[topic_list] as
# `matrix` and emb[topic_unlist] as `matrix2` would compare across the two groups
# -- confirm which is intended.
between_similarity = gpu_cosine_similarity(emb[topic_unlist],max=1000000)

In [None]:
# Overlay the two sampled similarity distributions (rug plot disabled).
fig = ff.create_distplot([within_similarity,between_similarity], ['Within','Between'],bin_size=.025, show_rug=False)
fig.show()

In [None]:
# Same within / "between" comparison for topic 38.
# NOTE(review): copy of the preceding cells with the topic id changed;
# generate_within_between (defined further down) wraps this pattern.
topic_list = list(docs_0[docs_0['Topic'] == 38].index)
topic_unlist = list(docs_0[docs_0['Topic'] != 38].index)
within_similarity = gpu_cosine_similarity(emb[topic_list],max=1000000)
between_similarity = gpu_cosine_similarity(emb[topic_unlist],max=1000000)
fig = ff.create_distplot([within_similarity,between_similarity], ['Within','Between'],bin_size=.025, show_rug=False)
fig.update_layout(title_text='Topic 38')
fig.show()

In [None]:
# Same within / "between" comparison for topic 78, with custom colours and axis titles.
# NOTE(review): copy of the preceding cells with the topic id changed;
# generate_within_between (defined further down) wraps this pattern.
topic_list = list(docs_0[docs_0['Topic'] == 78].index)
topic_unlist = list(docs_0[docs_0['Topic'] != 78].index)
within_similarity = gpu_cosine_similarity(emb[topic_list],max=1000000)
between_similarity = gpu_cosine_similarity(emb[topic_unlist],max=1000000)
colors = ['rgb(0, 200, 200)','rgb(0, 0, 100)']
fig = ff.create_distplot([within_similarity,between_similarity], ['Within Topic','Between Topic'],bin_size=.025, show_rug=False, colors=colors)

fig.update_layout(title_text='Document Similarity Distribution for Topic 78')
fig.update_xaxes(title_text='Cosine Similarity Score')
fig.update_yaxes(showgrid=False, title_text='Relative Frequency')
fig.show()

In [None]:
# Display the document table.
# NOTE(review): renders the whole frame; `docs_0.head()` would keep the output small.
docs_0

In [None]:
def generate_within_between(emb, docs, labels, save_dir, max_val=1000000, topics=None):
    """Write one within-vs-between similarity distribution plot per topic.

    For each topic, similarities are sampled among the topic's documents
    ("within") and among all other documents ("between"), drawn as overlaid
    distributions, and saved to
    ``save_dir / 'topic_<topic>_within_between_sim.pdf'``.

    Parameters
    ----------
    emb : array-like
        Document embeddings, indexed with the index labels of ``docs``.
        assumes those labels are valid row positions in ``emb`` -- TODO confirm
    docs : pandas.DataFrame
        Needs a ``Topic`` column.
    labels : dict or sequence
        Topic labels. A dict is looked up by topic id. A sequence with one
        entry per distinct topic in ``docs`` is matched to the sorted topic
        ids (so an outlier topic -1 takes the first label); any other
        sequence is indexed directly by topic id.
    save_dir : pathlib.Path
        Output directory.
    max_val : int, default 1000000
        Passed to ``gpu_cosine_similarity`` as ``max`` (approximate number of
        sampled similarities per distribution).
    topics : iterable, optional
        Topic ids to plot; defaults to every distinct topic in ``docs``.
    """
    if topics is None:
        topics = docs['Topic'].unique()

    # Align a label sequence with the sorted topic ids. Indexing a list with
    # the raw topic id is off by one as soon as topic -1 is present, and
    # labels[-1] silently returns the last label.
    unique_topics = sorted(docs['Topic'].unique())
    if not isinstance(labels, dict) and len(labels) == len(unique_topics):
        label_lookup = dict(zip(unique_topics, labels))
    else:
        label_lookup = labels

    colors = ['rgb(0, 200, 200)', 'rgb(0, 0, 100)']
    for topic in tqdm(topics):
        topic_list = list(docs[docs['Topic'] == topic].index)
        topic_unlist = list(docs[docs['Topic'] != topic].index)
        if len(topic_list) < 2 or len(topic_unlist) < 2:
            # A distribution cannot be estimated from fewer than two documents;
            # skip instead of aborting the remaining topics.
            print(f"Skipping topic {topic}: not enough documents to compare")
            continue

        within_similarity = gpu_cosine_similarity(emb[topic_list], max=max_val)
        # NOTE(review): "between" is sampled among the non-topic documents only,
        # not topic-vs-rest -- confirm this is the intended baseline.
        between_similarity = gpu_cosine_similarity(emb[topic_unlist], max=max_val)

        fig = ff.create_distplot(
            [within_similarity, between_similarity],
            ['Within Topic', 'Between Topic'],
            bin_size=.025,
            show_rug=False,
            colors=colors,
        )
        label = label_lookup[topic]
        fig.update_layout(title_text=f'Document Similarity Distribution for {label}')
        fig.update_xaxes(title_text='Cosine Similarity Score')
        fig.update_yaxes(showgrid=False, title_text='Relative Frequency')

        fig.write_image(save_dir/f"topic_{topic}_within_between_sim.pdf")
           



In [None]:
# Output directory for the per-topic similarity plots.
# NOTE(review): absolute, user-specific path.
plots_dir = Path('/mnt/scratch/ande2472/model_output/topic_modeling/0_to_264/plots')


In [None]:
# Write a within/between similarity plot for every topic in `docs_0`, titled with
# the KeyBERT-based labels (`topic_labels`).
generate_within_between(emb, docs_0, topic_labels, save_dir=plots_dir)