In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import altair as alt
from altair_saver import save
import os
import re

import numpy as np
import warnings
warnings.simplefilter("ignore")

## Read csv of Papers and Content

In [2]:
# Load the publication table. The path is relative, so the CSV is looked up
# in the notebook's current working directory.
# NOTE(review): the file name suggests the text in 'Content' was already cleaned upstream - confirm.
df = pd.read_csv('Data_mesh_publications_cleaned.csv')
df

Unnamed: 0,Filename,Content,Publisher,Publication_type
0,Breaking Down Data Silos Data Mesh to Achieve ...,abstract data localization law becoming make h...,IEEE,conference
1,Decentralized Data Governance as Part of a Dat...,abstract data socio technical decentralized an...,IEEE,conference
2,Enterprise Data Strategy A Decentralized Data ...,abstract enterprise experience exponential gro...,IEEE,conference
3,Finding Your Way Through the Jungle of Big Dat...,abstract paper present systematic ofcommon ana...,IEEE,conference
4,01. Evolution of Data Architecture,look back data architecture developed response...,Springer,book chapter
5,02. Terminology Data Fabric and Data Mesh,term data fabric data often viewed different c...,Springer,book chapter
6,03. Data Fabric and Data Mesh Use Case Scenarios,organization realized importance implementing ...,Springer,book chapter
7,04. Data Fabric and Data Mesh Business Benefits,defining data fabric underlying data architect...,Springer,book chapter
8,05. Key Data Fabric and Data Mesh Capabilities,introduces key capability concept knowledge ca...,Springer,book chapter
9,06. Relevant ML and DL Concepts,ai ml dlbroadly speaking ai simulate human per...,Springer,book chapter


## Define Function to Extract Topics

In [3]:
# def generate_custom_token_pattern(ngram_min, ngram_max):
#     # Ensure no word repetition in n-grams
#     word_pattern = r'\b(\w+)\b(?=.*\b\1\b)'
    
#     # Combine word patterns for specified n-gram range
#     ngram_pattern = r'(?:' + word_pattern + r'\s){' + str(ngram_min - 1) + r',' + str(ngram_max - 1) + r'}' + word_pattern
    
#     return ngram_pattern

# token_pattern = generate_custom_token_pattern(1, 4)

In [4]:
def extract_topics(text, tfidf_vectorizer, lda_model, num_top_words=10):
    """Assign one document to its most probable LDA topic and list that topic's keywords.

    Parameters
    ----------
    text : str
        Document text to score.
    tfidf_vectorizer : fitted vectorizer
        Must provide ``transform``, ``get_feature_names_out`` and ``idf_``.
    lda_model : fitted topic model
        Must provide ``transform`` and ``components_``.
    num_top_words : int, default 10
        Number of highest-weighted topic keywords to return.

    Returns
    -------
    tuple
        ``(topic_number, topic_words)`` where ``topic_number`` is 1-based and
        ``topic_words`` is a list of ``(keyword, idf)`` pairs ordered by
        descending topic weight.

    Notes
    -----
    The number paired with each keyword is the vectorizer's corpus-level IDF
    weight, not the TF-IDF score of the keyword inside ``text``. The keywords
    describe the topic, so they are not guaranteed to occur in ``text``.
    """
    # Vectorise only this document; the matrix stays sparse on purpose because
    # a 1- to 4-gram vocabulary is far too wide to densify for every call.
    tfidf_data = tfidf_vectorizer.transform([text])
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Topic distribution of the document; argmax picks the dominant topic.
    doc_topic_prob = lda_model.transform(tfidf_data)
    most_probable_topic = int(np.argmax(doc_topic_prob))

    # Highest-weighted vocabulary entries of that topic, best first.
    topic = lda_model.components_[most_probable_topic]
    top_keywords_idx = np.argsort(topic)[::-1][:num_top_words]

    # Position i in feature_names is also position i in idf_, so the index
    # can be reused directly instead of going through vocabulary_.
    idf = tfidf_vectorizer.idf_
    topic_words = [(feature_names[i], float(idf[i])) for i in top_keywords_idx]

    return most_probable_topic + 1, topic_words

In [5]:
# Fit one TF-IDF vocabulary over the full corpus
# (English stop words removed, 1- to 4-word n-grams).
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 4),
    token_pattern=r'\b\w+\b',
)
tfidf_data = tfidf_vectorizer.fit_transform(df['Content'])

# Fit a 20-topic LDA model on the TF-IDF matrix; the fixed random_state keeps
# the fitted topics the same from run to run.
lda = LatentDirichletAllocation(n_components=20, random_state=42)
lda.fit(tfidf_data)

# Score the text in df['Content'] document by document, then split the
# resulting (topic, topic_words) pairs into two new columns on df.
per_document = [extract_topics(content, tfidf_vectorizer, lda) for content in df['Content']]
df['topic'], df['topic_words'] = zip(*per_document)

In [6]:
df

Unnamed: 0,Filename,Content,Publisher,Publication_type,topic,topic_words
0,Breaking Down Data Silos Data Mesh to Achieve ...,abstract data localization law becoming make h...,IEEE,conference,9,"[(localization, 3.970414465569701), (data loca..."
1,Decentralized Data Governance as Part of a Dat...,abstract data socio technical decentralized an...,IEEE,conference,19,"[(port, 2.717651497074333), (output port, 3.05..."
2,Enterprise Data Strategy A Decentralized Data ...,abstract enterprise experience exponential gro...,IEEE,conference,16,"[(driver, 2.5841201044498106), (key driver, 3...."
3,Finding Your Way Through the Jungle of Big Dat...,abstract paper present systematic ofcommon ana...,IEEE,conference,2,"[(data, 1.0), (product, 1.137201121513485), (a..."
4,01. Evolution of Data Architecture,look back data architecture developed response...,Springer,book chapter,8,"[(edw, 3.0541237336955462), (attribute, 1.9555..."
5,02. Terminology Data Fabric and Data Mesh,term data fabric data often viewed different c...,Springer,book chapter,2,"[(data, 1.0), (product, 1.137201121513485), (a..."
6,03. Data Fabric and Data Mesh Use Case Scenarios,organization realized importance implementing ...,Springer,book chapter,15,"[(fabric data scenario, 3.5649493574615367), (..."
7,04. Data Fabric and Data Mesh Business Benefits,defining data fabric underlying data architect...,Springer,book chapter,16,"[(driver, 2.5841201044498106), (key driver, 3...."
8,05. Key Data Fabric and Data Mesh Capabilities,introduces key capability concept knowledge ca...,Springer,book chapter,5,"[(ai asset, 2.1786549963416464), (data curatio..."
9,06. Relevant ML and DL Concepts,ai ml dlbroadly speaking ai simulate human per...,Springer,book chapter,7,"[(privacy, 1.7731898882334818), (gray, 3.27726..."


# Results with filename

In [7]:
# Flatten df['topic_words'] into long format: one record per (file, keyword) pair.
result_data_filename = [
    {'Filename': doc_name, 'topic_word': keyword, 'idf_value': weight}
    for doc_name, keyword_pairs in zip(df['Filename'], df['topic_words'])
    for keyword, weight in keyword_pairs
]

# Column order is given explicitly so the frame keeps this layout even when
# the record list is empty.
result_df_filename = pd.DataFrame(
    result_data_filename,
    columns=['Filename', 'topic_word', 'idf_value'],
)

result_df_filename

Unnamed: 0,Filename,topic_word,idf_value
0,Breaking Down Data Silos Data Mesh to Achieve ...,localization,3.970414
1,Breaking Down Data Silos Data Mesh to Achieve ...,data localization,3.970414
2,Breaking Down Data Silos Data Mesh to Achieve ...,smart,2.178655
3,Breaking Down Data Silos Data Mesh to Achieve ...,smart monitoring,3.970414
4,Breaking Down Data Silos Data Mesh to Achieve ...,monitoring,2.178655
...,...,...,...
375,Utilization of Data Mesh Framework as a Part o...,id,2.871802
376,Utilization of Data Mesh Framework as a Part o...,identi,2.871802
377,Utilization of Data Mesh Framework as a Part o...,relevant ml dl concept,3.970414
378,Utilization of Data Mesh Framework as a Part o...,relevant ml dl,3.970414


In [8]:
# Keep at most ten keyword rows per file. The frame is only read below, never
# modified, so no assignment into a groupby slice is needed.
top_terms_df_filename = result_df_filename.groupby('Filename').head(10)

# Shared layout: one row per file, one column per rank position.
# row_number() gives every keyword its own position even when IDF values tie,
# which makes the chart reproducible without adding random jitter to the data.
base = alt.Chart(top_terms_df_filename).encode(
    x=alt.X('rank:O', axis=None),  # rank position on the x-axis, axis hidden
    y='Filename:N',
).transform_window(
    rank="row_number()",
    sort=[alt.SortField("idf_value", order="descending")],
    groupby=["Filename"]
)

# Heatmap cells coloured by IDF value.
heatmap = base.mark_rect().encode(
    color=alt.Color('idf_value:Q', scale=alt.Scale(scheme='viridis')),
)

# Keyword labels. Viridis runs from dark (low) to bright (high), so high
# values get black text and low values get white text for contrast.
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.idf_value >= 2.0, alt.value('black'), alt.value('white'))
)

# Layer the labels on top of the heatmap.
chart_lda_filename = (heatmap + text).properties(width=2000)

In [9]:
chart_lda_filename
# save(chart_lda_filename, "lda_idf_heatmap.html")

# Result with publication type

In [12]:
# Flatten df['topic_words'] into long format: one record per
# (publication type, keyword) pair.
result_data_publication_type = [
    {'Publication_type': pub_type, 'topic_word': keyword, 'idf_value': weight}
    for pub_type, keyword_pairs in zip(df['Publication_type'], df['topic_words'])
    for keyword, weight in keyword_pairs
]

# Column order is given explicitly so the frame keeps this layout even when
# the record list is empty.
result_df_publication_type = pd.DataFrame(
    result_data_publication_type,
    columns=['Publication_type', 'topic_word', 'idf_value'],
)

result_df_publication_type

Unnamed: 0,Publication_type,topic_word,idf_value
0,conference,localization,3.970414
1,conference,data localization,3.970414
2,conference,smart,2.178655
3,conference,smart monitoring,3.970414
4,conference,monitoring,2.178655
...,...,...,...
375,master thesis,id,2.871802
376,master thesis,identi,2.871802
377,master thesis,relevant ml dl concept,3.970414
378,master thesis,relevant ml dl,3.970414


In [13]:
# Pick the ten highest-IDF distinct keywords per publication type.
# A plain groupby().head(10) would return the first ten rows of each group,
# i.e. only the keywords of the first document of that type.
top_terms_df_publication_type = (
    result_df_publication_type
    .sort_values('idf_value', ascending=False, kind='stable')
    .drop_duplicates(subset=['Publication_type', 'topic_word'])
    .groupby('Publication_type')
    .head(10)
)

# Shared layout: one row per publication type, one column per rank position.
# row_number() gives every keyword its own position even when IDF values tie,
# which makes the chart reproducible without adding random jitter to the data.
base = alt.Chart(top_terms_df_publication_type).encode(
    x=alt.X('rank:O', axis=None),  # rank position on the x-axis, axis hidden
    y='Publication_type:N',
).transform_window(
    rank="row_number()",
    sort=[alt.SortField("idf_value", order="descending")],
    groupby=["Publication_type"]
)

# Heatmap cells coloured by IDF value.
heatmap = base.mark_rect().encode(
    color=alt.Color('idf_value:Q', scale=alt.Scale(scheme='viridis')),
)

# Keyword labels. Viridis runs from dark (low) to bright (high), so high
# values get black text and low values get white text for contrast.
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.idf_value >= 2.0, alt.value('black'), alt.value('white'))
)

# Layer the labels on top of the heatmap and display the result.
chart_lda_publication_type = (heatmap + text).properties(width=2000)
chart_lda_publication_type

# Result with publisher

In [10]:
# Flatten df['topic_words'] into long format: one record per (publisher, keyword) pair.
result_data_publisher = [
    {'Publisher': publisher_name, 'topic_word': keyword, 'idf_value': weight}
    for publisher_name, keyword_pairs in zip(df['Publisher'], df['topic_words'])
    for keyword, weight in keyword_pairs
]

# Build the long-format frame; columns follow the key order of the records.
result_df_publisher = pd.DataFrame(result_data_publisher)


result_df_publisher

Unnamed: 0,Publisher,topic_word,idf_value
0,IEEE,localization,3.970414
1,IEEE,data localization,3.970414
2,IEEE,smart,2.178655
3,IEEE,smart monitoring,3.970414
4,IEEE,monitoring,2.178655
...,...,...,...
375,miscellaneous,id,2.871802
376,miscellaneous,identi,2.871802
377,miscellaneous,relevant ml dl concept,3.970414
378,miscellaneous,relevant ml dl,3.970414


In [11]:
# Pick the ten highest-IDF distinct keywords per publisher.
# A plain groupby().head(10) would return the first ten rows of each group,
# i.e. only the keywords of the first document of that publisher.
top_terms_df_publisher = (
    result_df_publisher
    .sort_values('idf_value', ascending=False, kind='stable')
    .drop_duplicates(subset=['Publisher', 'topic_word'])
    .groupby('Publisher')
    .head(10)
)

# Shared layout: one row per publisher, one column per rank position.
# row_number() gives every keyword its own position even when IDF values tie,
# which makes the chart reproducible without adding random jitter to the data.
base = alt.Chart(top_terms_df_publisher).encode(
    x=alt.X('rank:O', axis=None),  # rank position on the x-axis, axis hidden
    y='Publisher:N',
).transform_window(
    rank="row_number()",
    sort=[alt.SortField("idf_value", order="descending")],
    groupby=["Publisher"]
)

# Heatmap cells coloured by IDF value.
heatmap = base.mark_rect().encode(
    color=alt.Color('idf_value:Q', scale=alt.Scale(scheme='viridis')),
)

# Keyword labels. Viridis runs from dark (low) to bright (high), so high
# values get black text and low values get white text for contrast.
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.idf_value >= 3.0, alt.value('black'), alt.value('white'))
)

# Layer the labels on top of the heatmap and display the result.
chart_lda_publisher = (heatmap + text).properties(width=1500)
chart_lda_publisher

# USING ONLY TFIDF

In [None]:
# save(chart_lda_publisher, 'lda_idf_heatmap_pub.html')

In [None]:
def find_top_words_for_each_document(df, num_top_words=10):
    """Return the highest TF-IDF terms of every document in ``df``.

    A fresh ``TfidfVectorizer`` (English stop words removed, 1- to 4-grams) is
    fitted on ``df['Content']``; each document's terms are then ranked by
    their TF-IDF score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Filename``, ``Publisher``,
        ``Publication_type`` and ``Content``.
    num_top_words : int, default 10
        Number of top-scoring terms to keep per document.

    Returns
    -------
    pandas.DataFrame
        The four input columns plus ``top_words``, a list of
        ``(term, tfidf_score)`` pairs per document, best first.
    """
    # Create a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 4), token_pattern=r'\b\w+\b')

    # Fit and transform the document content (result is a sparse matrix)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['Content'])

    # Terms in the same order as the matrix columns
    feature_names = tfidf_vectorizer.get_feature_names_out()

    top_words_for_each_document = []
    for doc_idx in range(tfidf_matrix.shape[0]):
        # Densify one row at a time: the full matrix is never expanded, and
        # the scores are a plain NumPy array, so integer positions index it
        # directly instead of relying on positional fallback of a
        # string-labelled Series.
        doc_scores = tfidf_matrix[doc_idx].toarray().ravel()

        # Reverse first, then cut, so num_top_words=0 yields no terms
        # (a [-0:] slice would return every term).
        top_words_index = doc_scores.argsort()[::-1][:num_top_words]
        top_words = [(feature_names[index], float(doc_scores[index])) for index in top_words_index]
        top_words_for_each_document.append(top_words)

    # Metadata and content columns from the input plus the ranked terms
    result_df = pd.DataFrame({'Filename': df['Filename'],'Publisher': df['Publisher'],'Publication_type': df['Publication_type'], 'Content': df['Content'], 'top_words': top_words_for_each_document})

    return result_df

In [None]:
# Call the function
result_df_tfidf = find_top_words_for_each_document(df)

# Display the result DataFrame
# result_df_tfidf['Publisher'] = df['Publisher']
# result_df_tfidf['Publication_type'] = df['Publication_type']
result_df_tfidf

# Results with filename

In [None]:
# Flatten result_df_tfidf['top_words'] into long format: one record per
# (file, term) pair.
result_data_tfidf_filename = [
    {'Filename': doc_name, 'topic_word': term, 'tfidf_value': score}
    for doc_name, term_pairs in zip(result_df_tfidf['Filename'], result_df_tfidf['top_words'])
    for term, score in term_pairs
]

# Build the long-format frame; columns follow the key order of the records.
result_df_tfidf_filename = pd.DataFrame(result_data_tfidf_filename)

result_df_tfidf_filename

In [None]:
# Print the complete long-format TF-IDF table without truncation.
# option_context restores the previous display limits when the block exits,
# so later cells are not switched to unlimited output as a side effect.
with pd.option_context(
    'display.max_columns', None,   # show all columns
    'display.max_rows', None,      # show all rows
    'display.max_colwidth', None,  # do not shorten long cell text
):
    print(result_df_tfidf_filename)


In [None]:
# Keep at most ten term rows per file. The frame is only read below, never
# modified, so no assignment into a groupby slice is needed.
top_terms_df_tfidf = result_df_tfidf_filename.groupby('Filename').head(10)

# Shared layout: one row per file, one column per rank position.
# row_number() gives every term its own position even when TF-IDF values tie,
# which makes the chart reproducible without adding random jitter to the data.
base = alt.Chart(top_terms_df_tfidf).encode(
    x=alt.X('rank:O', axis=None),  # rank position on the x-axis, axis hidden
    y='Filename:N',
).transform_window(
    rank="row_number()",
    sort=[alt.SortField("tfidf_value", order="descending")],
    groupby=["Filename"]
)

# Heatmap cells coloured by TF-IDF value.
heatmap = base.mark_rect().encode(
    color=alt.Color('tfidf_value:Q', scale=alt.Scale(scheme='viridis')),
)

# Term labels. Viridis runs from dark (low) to bright (high), so high values
# get black text and low values get white text for contrast.
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.tfidf_value >= 0.4, alt.value('black'), alt.value('white'))
)

# Layer the labels on top of the heatmap.
chart_tfidf = (heatmap + text).properties(width=2000)


In [None]:
# save(chart_tfidf, 'tfidf_heatmap.html')
chart_tfidf

# Result with publisher

In [None]:
# Flatten result_df_tfidf['top_words'] into long format: one record per
# (publisher, term) pair.
result_data_tfidf_publisher = [
    {'Publisher': publisher_name, 'top_word': term, 'tf_idf_value': score}
    for publisher_name, term_pairs in zip(result_df_tfidf['Publisher'], result_df_tfidf['top_words'])
    for term, score in term_pairs
]

# Build the long-format frame; columns follow the key order of the records.
result_df_tfidf_publisher = pd.DataFrame(result_data_tfidf_publisher)


result_df_tfidf_publisher

In [None]:
result_df_tfidf_publisher.describe()

In [None]:
# Pick the ten highest-TF-IDF distinct terms per publisher.
# A plain groupby().head(10) would return the first ten rows of each group,
# i.e. only the terms of the first document of that publisher.
top_terms_df_publisher = (
    result_df_tfidf_publisher
    .sort_values('tf_idf_value', ascending=False, kind='stable')
    .drop_duplicates(subset=['Publisher', 'top_word'])
    .groupby('Publisher')
    .head(10)
)

# Shared layout: one row per publisher, one column per rank position.
# row_number() gives every term its own position even when TF-IDF values tie,
# which makes the chart reproducible without adding random jitter to the data.
base = alt.Chart(top_terms_df_publisher).encode(
    x=alt.X('rank:O', axis=None),  # rank position on the x-axis, axis hidden
    y='Publisher:N',
).transform_window(
    rank="row_number()",
    sort=[alt.SortField("tf_idf_value", order="descending")],
    groupby=["Publisher"]
)

# Heatmap cells coloured by TF-IDF value.
heatmap = base.mark_rect().encode(
    color=alt.Color('tf_idf_value:Q', scale=alt.Scale(scheme='viridis')),
)

# Term labels. Viridis runs from dark (low) to bright (high), so high values
# get black text and low values get white text for contrast.
text = base.mark_text(baseline='middle').encode(
    text='top_word:N',
    color=alt.condition(alt.datum.tf_idf_value >= 0.4, alt.value('black'), alt.value('white'))
)

# Layer the labels on top of the heatmap and display the result.
chart_publisher = (heatmap + text).properties(width=1500)
chart_publisher

In [None]:
# save(chart_publisher, 'tfidf_heatmap_publisher.html')

# Results with publication type

In [None]:
# Flatten result_df_tfidf['top_words'] into long format: one record per
# (publication type, term) pair.
result_data_tfidf_publication_type = [
    {'Publication_type': pub_type, 'top_word': term, 'tf_idf_value': score}
    for pub_type, term_pairs in zip(result_df_tfidf['Publication_type'], result_df_tfidf['top_words'])
    for term, score in term_pairs
]

# Build the long-format frame; columns follow the key order of the records.
result_df_tfidf_publication_type = pd.DataFrame(result_data_tfidf_publication_type)


result_df_tfidf_publication_type

In [None]:
# Pick the ten highest-TF-IDF distinct terms per publication type.
# A plain groupby().head(10) would return the first ten rows of each group,
# i.e. only the terms of the first document of that type.
top_terms_df_publication_type = (
    result_df_tfidf_publication_type
    .sort_values('tf_idf_value', ascending=False, kind='stable')
    .drop_duplicates(subset=['Publication_type', 'top_word'])
    .groupby('Publication_type')
    .head(10)
)

# Shared layout: one row per publication type, one column per rank position.
# row_number() gives every term its own position even when TF-IDF values tie,
# which makes the chart reproducible without adding random jitter to the data.
base = alt.Chart(top_terms_df_publication_type).encode(
    x=alt.X('rank:O', axis=None),  # rank position on the x-axis, axis hidden
    y='Publication_type:N',
).transform_window(
    rank="row_number()",
    sort=[alt.SortField("tf_idf_value", order="descending")],
    groupby=["Publication_type"]
)

# Heatmap cells coloured by TF-IDF value.
heatmap = base.mark_rect().encode(
    color=alt.Color('tf_idf_value:Q', scale=alt.Scale(scheme='viridis')),
)

# Term labels. Viridis runs from dark (low) to bright (high), so high values
# get black text and low values get white text for contrast.
text = base.mark_text(baseline='middle').encode(
    text='top_word:N',
    color=alt.condition(alt.datum.tf_idf_value >= 0.4, alt.value('black'), alt.value('white'))
)

# Layer the labels on top of the heatmap and display the result.
chart_publication_type = (heatmap + text).properties(width=1500)
chart_publication_type