In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models
from nltk.tokenize import word_tokenize
import altair as alt
import os
import re
from nltk.util import ngrams
from itertools import chain

import numpy as np
import warnings
warnings.simplefilter("ignore")

# Read the cleaned publications table from the working directory.
df = pd.read_csv(filepath_or_buffer='Data_mesh_publications_cleaned.csv')
# Independent copy of the loaded table, used later when joining topic columns.
df_sample = df.copy(deep=True)



In [4]:
def find_topic_words_lsi(df, num_topics=36, num_words=10):
    """
    Find the dominant LSI topic, and that topic's top words, for each document.

    Every text in ``df['Content']`` is lower-cased and tokenized, the resulting
    bag-of-words corpus is TF-IDF weighted, and an LSI model is fitted on it.
    A document's dominant topic is the topic with the largest absolute weight
    in that document's LSI projection.

    Parameters:
    - df: DataFrame with a 'Content' column holding one text per row.
      Missing values are treated as empty documents.
    - num_topics: Number of topics to extract.
    - num_words: Number of words to show for each topic.

    Returns:
    - topics_df: DataFrame with one row per document (same order as ``df``,
      default integer index) and the columns 'Dominant_Topic',
      'Topic_Keywords' and 'Word_Probabilities'. 'Word_Probabilities' holds
      ``(word, weight)`` pairs rounded to 4 decimals; the weights are the
      signed values reported by ``show_topic`` and can be negative, so they
      are not probabilities despite the column name. A document whose LSI
      projection is empty gets a missing 'Dominant_Topic', an empty keyword
      string and an empty pair list.
    """
    columns = ['Dominant_Topic', 'Topic_Keywords', 'Word_Probabilities']

    # Missing texts become empty documents instead of failing on `.lower()`.
    contents = df['Content'].fillna('').astype(str)
    tokenize_content = contents.apply(lambda x: word_tokenize(x.lower()))

    dictionary = corpora.Dictionary(tokenize_content)
    corpus = [dictionary.doc2bow(text) for text in tokenize_content]

    # Create the LSI model on top of the TF-IDF weighted corpus
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)

    # Many documents share a dominant topic, so look each topic up only once.
    topic_cache = {}
    rows = []
    for doc in corpus_tfidf:
        doc_topics = lsi_model[doc]
        if not doc_topics:
            # Keep a placeholder so rows stay aligned with `df` row for row.
            rows.append((None, '', []))
            continue

        # First topic with the largest absolute weight (sign is ignored).
        topic_num, _ = max(doc_topics, key=lambda x: abs(x[1]))
        if topic_num not in topic_cache:
            wp = lsi_model.show_topic(topic_num, topn=num_words)
            topic_cache[topic_num] = (
                ", ".join(word for word, _ in wp),
                [(word, round(prop, 4)) for word, prop in wp],
            )
        topic_keywords, word_probs = topic_cache[topic_num]
        # Copy the pair list so rows never share one mutable object.
        rows.append((int(topic_num), topic_keywords, list(word_probs)))

    # Build the frame once; growing it row by row is quadratic and relied on
    # DataFrame.append, which no longer exists in pandas 2.x.
    return pd.DataFrame(rows, columns=columns)



In [6]:

# Fit LSI on the loaded corpus with the function's default settings and
# echo the resulting per-document topic table as plain text.
topics_df = find_topic_words_lsi(df=df)
print(topics_df)



    Dominant_Topic                                     Topic_Keywords  \
0                6  localization, region, india, port, zone, india...   
1               14  port, ds, composition, aligned, ai, dl, cdm, m...   
2                5  localization, zone, port, utc, restriction, pi...   
3                3  dwh, archimate, gartner, vault, paradigm, fabr...   
4                9  dairy, quasi, farm, erential, anonymisation, c...   
5               11  componentization, contribution, netflix, analy...   
6               12  ds, smart, monitoring, modal, dairy, medical, ...   
7                8  gray, dairy, farm, id, cowmesh, cow, semantic,...   
8                0  interviewee, interview, bp, smart, product, or...   
9                4  dairy, farm, quasi, semantic, cowmesh, cow, er...   
10              10  gray, id, composite, layered, rq, dairy, farm,...   
11               1  interviewee, smart, bp, city, motivational, in...   
12              15  ai, ds, dl, cdm, contemporary, 

In [7]:
# Place the topic columns next to the publication metadata (aligned on index).
merged_df = pd.concat(objs=[df_sample, topics_df], axis='columns')
# Accumulator for the long-format (file, word, weight) records.
result_data = list()




In [8]:
# Explode each file's (word, weight) pairs into one record per word.
# The list is rebuilt from scratch here, so re-running this cell can never
# append a second copy of the rows to an accumulator left over from an
# earlier run.
result_data = [
    {'Filename': filename, 'topic_word': word, 'prop': prop}
    for filename, topic_words in zip(merged_df['Filename'], merged_df['Word_Probabilities'])
    for word, prop in topic_words
]

# Explicit columns keep the schema stable even when there are no records.
result_df = pd.DataFrame(result_data, columns=['Filename', 'topic_word', 'prop'])

print(result_df)

                                              Filename    topic_word    prop
0    Breaking Down Data Silos Data Mesh to Achieve ...  localization  0.7619
1    Breaking Down Data Silos Data Mesh to Achieve ...        region  0.1381
2    Breaking Down Data Silos Data Mesh to Achieve ...         india  0.1317
3    Breaking Down Data Silos Data Mesh to Achieve ...          port -0.1008
4    Breaking Down Data Silos Data Mesh to Achieve ...          zone -0.0932
..                                                 ...           ...     ...
205  Utilization of Data Mesh Framework as a Part o...  organization  0.1105
206  Utilization of Data Mesh Framework as a Part o...  motivational  0.1030
207  Utilization of Data Mesh Framework as a Part o...        fabric  0.0966
208  Utilization of Data Mesh Framework as a Part o...     lakehouse  0.0911
209  Utilization of Data Mesh Framework as a Part o...          city  0.0891

[210 rows x 3 columns]


In [10]:
# Keep at most the first 10 (word, weight) rows of every file.
top_terms = result_df.groupby('Filename').head(10)

# Tiny jitter so equal weights still receive distinct ranks in the chart.
# The generator is seeded, so the layout is identical on every run.
jitter = np.random.default_rng(0).random(top_terms.shape[0]) * 0.0001
# `assign` returns a new frame instead of writing into a slice of `result_df`.
# NOTE: the column is called 'idf_value' but holds the jittered topic-word weight.
top_terms = top_terms.assign(idf_value=top_terms['prop'] + jitter)

# Shared encodings: one row per file, words ordered left to right by weight.
base = alt.Chart(top_terms).encode(
    x=alt.X('rank:O', axis=None),  # Use the per-file rank for the x-axis
    y='Filename:N',
    color=alt.Color('idf_value:Q', scale=alt.Scale(scheme='viridis')),
    text='topic_word:N'
).transform_window(
    rank="rank()",
    sort=[alt.SortField("idf_value", order="descending")],
    groupby=["Filename"]
)

# Heatmap layer. It inherits the colour encoding from `base`; re-encoding
# `color` with a bare shorthand here would replace the whole channel and
# silently drop the viridis scale.
heatmap = base.mark_rect()

# Word labels, switching to white text at or above the 0.4 cutoff.
# NOTE(review): viridis is light at the high end, so white labels may be hard
# to read there - confirm the intended contrast.
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.idf_value >= 0.4, alt.value('white'), alt.value('black'))
)

# Layer the labels over the heatmap (the chart name is historical; the
# weights come from the LSI topics computed above).
chart_lda= (heatmap + text).properties(width=1000)





In [11]:
# Display the per-file topic-word heatmap assembled in the previous cell.
chart_lda

In [12]:
# Long-format records: one (publication type, word, weight) entry per topic word.
result_data_publication_type = []

# Walk the two relevant columns in lockstep, document by document.
for pub_type, weighted_words in zip(merged_df['Publication_type'], merged_df['Word_Probabilities']):
    # Each element of the list is a (word, weight) tuple.
    result_data_publication_type.extend(
        {'Publication_type': pub_type, 'topic_word': term, 'probs': weight}
        for term, weight in weighted_words
    )

# Materialise the records with a fixed column order.
result_df_publication_type = pd.DataFrame(
    result_data_publication_type,
    columns=['Publication_type', 'topic_word', 'probs'],
)

# Show the resulting table.
result_df_publication_type

Unnamed: 0,Publication_type,topic_word,probs
0,conference,localization,0.7619
1,conference,region,0.1381
2,conference,india,0.1317
3,conference,port,-0.1008
4,conference,zone,-0.0932
...,...,...,...
205,master thesis,organization,0.1105
206,master thesis,motivational,0.1030
207,master thesis,fabric,0.0966
208,master thesis,lakehouse,0.0911


In [15]:
# Keep at most the first 10 (word, weight) rows of every publication type.
top_terms_df_publication_type = result_df_publication_type.groupby('Publication_type').head(10)

# Tiny jitter so equal weights still receive distinct ranks in the chart.
# The generator is seeded, so the layout is identical on every run.
jitter = np.random.default_rng(0).random(top_terms_df_publication_type.shape[0]) * 0.0001
# `assign` returns a new frame instead of writing into a slice of the source table.
top_terms_df_publication_type = top_terms_df_publication_type.assign(
    probs=top_terms_df_publication_type['probs'] + jitter
)

# Shared encodings: one row per publication type, words ordered by weight.
base = alt.Chart(top_terms_df_publication_type).encode(
    x=alt.X('rank:O', axis=None),  # Use the per-group rank for the x-axis
    y='Publication_type:N',
    color=alt.Color('probs:Q', scale=alt.Scale(scheme='viridis')),
    text='topic_word:N'
).transform_window(
    rank="rank()",
    sort=[alt.SortField("probs", order="descending")],
    groupby=["Publication_type"]
)

# Heatmap layer. It inherits the colour encoding from `base`; re-encoding
# `color` with a bare shorthand here would replace the whole channel and
# silently drop the viridis scale.
heatmap = base.mark_rect()

# Word labels with conditional colour. The test must read `probs`, the column
# that actually exists in this table; the previous `idf_value` field is not
# present here, so the condition could never be true.
# NOTE(review): the 2.0 cutoff is above every weight shown in this notebook's
# output, so labels stay black - confirm the intended cutoff.
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.probs >= 2.0, alt.value('white'), alt.value('black'))
)

# Layer the labels over the heatmap and display the result.
chart_lda_publication_type = (heatmap + text).properties(width=2000)
chart_lda_publication_type

In [16]:
# Long-format records: one (publisher, word, weight) entry per topic word.
result_data_publisher = []

# Walk the two relevant columns in lockstep, document by document.
for publisher_name, weighted_words in zip(merged_df['Publisher'], merged_df['Word_Probabilities']):
    # Each element of the list is a (word, weight) tuple.
    result_data_publisher.extend(
        {'Publisher': publisher_name, 'topic_word': term, 'prob': weight}
        for term, weight in weighted_words
    )

# Column names are taken from the record keys.
result_df_publisher = pd.DataFrame(result_data_publisher)


result_df_publisher

Unnamed: 0,Publisher,topic_word,prob
0,IEEE,localization,0.7619
1,IEEE,region,0.1381
2,IEEE,india,0.1317
3,IEEE,port,-0.1008
4,IEEE,zone,-0.0932
...,...,...,...
205,miscellaneous,organization,0.1105
206,miscellaneous,motivational,0.1030
207,miscellaneous,fabric,0.0966
208,miscellaneous,lakehouse,0.0911


In [17]:
# Keep at most the first 10 (word, weight) rows of every publisher.
top_terms_df_publisher = result_df_publisher.groupby('Publisher').head(10)

# Tiny jitter so equal weights still receive distinct ranks in the chart.
# The generator is seeded, so the layout is identical on every run.
jitter = np.random.default_rng(0).random(top_terms_df_publisher.shape[0]) * 0.0001
# `assign` returns a new frame instead of writing into a slice of the source table.
top_terms_df_publisher = top_terms_df_publisher.assign(
    prob=top_terms_df_publisher['prob'] + jitter
)

# Shared encodings: one row per publisher, words ordered by weight.
base = alt.Chart(top_terms_df_publisher).encode(
    x=alt.X('rank:O', axis=None),  # Use the per-group rank for the x-axis
    y='Publisher:N',
    color=alt.Color('prob:Q', scale=alt.Scale(scheme='viridis')),
    text='topic_word:N'
).transform_window(
    rank="rank()",
    sort=[alt.SortField("prob", order="descending")],
    groupby=["Publisher"]
)

# Heatmap layer. It inherits the colour encoding from `base`; re-encoding
# `color` with a bare shorthand here would replace the whole channel and
# silently drop the viridis scale.
heatmap = base.mark_rect()

# Word labels with conditional colour. The test must read `prob`, the column
# that actually exists in this table; the previous `idf_value` field is not
# present here, so the condition could never be true.
# NOTE(review): the 3.0 cutoff is above every weight shown in this notebook's
# output, so labels stay black - confirm the intended cutoff.
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.prob >= 3.0, alt.value('white'), alt.value('black'))
)

# Layer the labels over the heatmap and display the result.
chart_lda_publisher = (heatmap + text).properties(width=1500)
chart_lda_publisher