In [34]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import altair as alt
import os

import numpy as np
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" can also hide pandas deprecation and
# SettingWithCopy warnings — consider narrowing the filter.
warnings.simplefilter("ignore")

# Load the corpus; later cells read its 'Filename' and 'Content' columns.
df = pd.read_csv('Data_mesh_cleaned.csv')


In [8]:
# Display the loaded corpus table
df

Unnamed: 0,Filename,Content,topic,topic_words
0,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,localization law becoming make hard company ma...,9,"[(localization, 3.5257286443082556), (region, ..."
1,Decentralized_Data_Governance_as_Part_of_a_Dat...,socio technical decentralized analytics manage...,7,"[(integration, 2.139434283188365), (learning, ..."
2,Enterprise_Data_Strategy_A_Decentralized_Data_...,enterprise experience exponential growth centr...,2,"[(cloud, 2.83258146374831), (hybrid, 3.5257286..."
3,Finding_Your_Way_Through_the_Jungle_of_Big_Dat...,paper present systematic analytical architectu...,4,"[(warehouse, 2.83258146374831), (dama, 3.12026..."
4,978-1-4842-9253-2_1,look back architecture developed response pain...,5,"[(new, 1.8209805520698303), (new trend, 3.5257..."
5,978-1-4842-9253-2_10,provide high overview fabric evolution elabora...,10,"[(ai, 1.3856624808119848), (fabric, 1.32850406..."
6,978-1-4842-9253-2_11,saw fabric evolution previous architecture inf...,1,"[(fabric, 1.328504066972036), (ai, 1.385662480..."
7,978-1-4842-9253-2_12,look intersection two initiative digital trans...,2,"[(cloud, 2.83258146374831), (hybrid, 3.5257286..."
8,978-1-4842-9253-2_13,metadata digital era enterprise know aspect mu...,8,"[(fabric, 1.328504066972036), (digital era, 3...."
9,978-1-4842-9253-2_14,applying ai metadata intelligent cataloging qu...,6,"[(metadata, 2.2729656758128876), (intelligent,..."


In [5]:
def extract_topics(text, tfidf_vectorizer, lda_model, num_topics=10, num_top_words=10):
    """Return the dominant LDA topic of one document and that topic's top terms.

    Parameters
    ----------
    text : str
        Document content; vectorized with the already-fitted ``tfidf_vectorizer``.
    tfidf_vectorizer : fitted vectorizer
        Must provide ``transform``, ``get_feature_names_out`` and ``idf_``.
    lda_model : fitted topic model
        Must provide ``transform`` and ``components_``.
    num_topics : int, default 10
        Unused; kept only so existing callers that pass it keep working.
    num_top_words : int, default 10
        Number of highest-weighted terms of the dominant topic to return.

    Returns
    -------
    (topic_number, topic_words)
        ``topic_number`` is the 1-based index of the most probable topic.
        ``topic_words`` is a list of ``(term, idf)`` tuples ordered by
        descending topic weight. The second element is the term's corpus-level
        IDF weight, not the document's TF-IDF score.

    Raises
    ------
    ValueError
        If ``num_top_words`` is negative.
    """
    if num_top_words < 0:
        raise ValueError("num_top_words must be non-negative")

    # Vectorize this single document; the sparse row goes straight to the LDA model.
    tfidf_data = tfidf_vectorizer.transform([text])
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Topic distribution for the document; only one row, so a flat argmax is the topic index.
    doc_topic_prob = lda_model.transform(tfidf_data)
    most_probable_topic = np.argmax(doc_topic_prob)

    # Feature indices of the dominant topic, heaviest first.
    topic_weights = lda_model.components_[most_probable_topic]
    top_keywords_idx = np.argsort(topic_weights)[::-1][:num_top_words]

    # Feature index i addresses both the term name and its IDF weight.
    idf = tfidf_vectorizer.idf_
    topic_words = [(feature_names[i], float(idf[i])) for i in top_keywords_idx]

    return most_probable_topic + 1, topic_words


In [9]:
# Build the corpus-wide TF-IDF matrix (uni- to tri-grams, English stop words removed)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
tfidf_data = tfidf_vectorizer.fit_transform(df['Content'])

# Fit a 10-topic LDA model on that matrix (fit() returns the fitted estimator)
lda = LatentDirichletAllocation(n_components=10, random_state=42).fit(tfidf_data)

# Label each document in df['Content'] with its dominant topic and that topic's top words
per_document = [extract_topics(content, tfidf_vectorizer, lda) for content in df['Content']]
df['topic'], df['topic_words'] = zip(*per_document)

In [13]:
# Flatten df['topic_words'] into one row per (Filename, topic_word, idf_value).
# The records are collected first and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
topic_word_records = [
    {'Filename': filename, 'topic_word': word, 'idf_value': idf_value}
    for filename, topic_words in zip(df['Filename'], df['topic_words'])
    for word, idf_value in topic_words
]

# Passing `columns` keeps the expected schema even when there are no records.
result_df = pd.DataFrame(topic_word_records, columns=['Filename', 'topic_word', 'idf_value'])

# Print the result DataFrame


In [14]:
# Display the long-format table (one row per Filename / topic_word / idf_value)
result_df

Unnamed: 0,Filename,topic_word,idf_value
0,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,localization,3.525729
1,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,region,3.525729
2,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,regulation,3.525729
3,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,india,3.525729
4,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,problem,2.427116
...,...,...,...
235,978-3-031-45021-1_23,microservices,3.120264
236,978-3-031-45021-1_23,application,2.021651
237,978-3-031-45021-1_23,control,2.832581
238,978-3-031-45021-1_23,driven,2.139434


In [22]:


# Select the top terms for each PDF (at most 10 rows per Filename)
top_terms_df = result_df.groupby('Filename').head(10)

# Add a tiny jitter to 'idf_value' so that tied scores do not share the same
# rank() position in the chart below. The generator is seeded so the figure is
# reproducible, and .assign() returns a new frame instead of writing into a
# slice of result_df.
jitter_rng = np.random.default_rng(42)
top_terms_df = top_terms_df.assign(
    idf_value=top_terms_df['idf_value'] + jitter_rng.random(len(top_terms_df)) * 0.0001
)

# Shared layout: one row per Filename, terms ordered left-to-right by descending idf_value
base = alt.Chart(top_terms_df).encode(
    x=alt.X('rank:O', axis=None),  # Use rank for x-axis
    y='Filename:N',
).transform_window(
    rank="rank()",
    sort=[alt.SortField("idf_value", order="descending")],
    groupby=["Filename"]
)

# Heatmap layer; the viridis scale is declared here because each layer sets its own colour encoding
heatmap = base.mark_rect().encode(
    color=alt.Color('idf_value:Q', scale=alt.Scale(scheme='viridis')),
)

# Text layer: the term itself, white when idf_value >= 2.0 and black otherwise
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.idf_value >= 2.0, alt.value('white'), alt.value('black'))
)

# Combine the heatmap and text labels
chart_lda = (heatmap + text).properties(width=1500)


In [None]:
# `output_folder` is not defined in the visible cells; fall back to the working
# directory so this cell does not raise NameError on a fresh kernel.
output_folder = globals().get('output_folder', '.')
os.makedirs(output_folder, exist_ok=True)

# Save under an LDA-specific name so the file is not confused with (or
# overwritten by) the TF-IDF heatmap saved as 'heatmap_chart_tfidf.pdf'.
# NOTE(review): PDF export needs a working Altair conversion backend in this environment.
chart_lda.save(os.path.join(output_folder, 'heatmap_chart_lda.pdf'))

USING ONLY TFIDF

In [27]:
def find_top_words_for_each_document(df, num_top_words=10):
    """Find the highest-scoring TF-IDF terms of every document in ``df``.

    A new TfidfVectorizer (English stop words, 1- to 3-grams) is fitted on
    ``df['Content']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Filename' and 'Content'.
    num_top_words : int, default 10
        Number of terms to keep per document.

    Returns
    -------
    pandas.DataFrame
        Columns 'Filename', 'Content' and 'top_words'. Each 'top_words' entry
        is a list of ``(term, tfidf_score)`` tuples ordered by descending score.

    Raises
    ------
    ValueError
        If ``num_top_words`` is negative.
    """
    if num_top_words < 0:
        raise ValueError("num_top_words must be non-negative")

    # Create a TF-IDF vectorizer and fit it on the document contents
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['Content'])

    # Get feature names (words / n-grams) from the TF-IDF vectorizer
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Densify one row at a time rather than the whole matrix, and index plain
    # numpy arrays so no label-vs-position Series lookup is involved.
    top_words_for_each_document = []
    for doc_idx in range(tfidf_matrix.shape[0]):
        scores = tfidf_matrix[doc_idx].toarray().ravel()
        # Reverse first, then truncate: num_top_words=0 yields an empty list
        # (a `[-0:]` slice would select every feature instead).
        top_words_index = np.argsort(scores)[::-1][:num_top_words]
        top_words = [(feature_names[index], float(scores[index])) for index in top_words_index]
        top_words_for_each_document.append(top_words)

    # Assemble the result alongside the original 'Filename' and 'Content' columns
    result_df = pd.DataFrame({'Filename': df['Filename'], 'Content': df['Content'], 'top_words': top_words_for_each_document})

    return result_df

In [28]:
# Call the function
result_df_2 = find_top_words_for_each_document(df)

# Display the result DataFrame
result_df_2

Unnamed: 0,Filename,Content,top_words
0,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,localization law becoming make hard company ma...,"[(localization, 0.39445298996091116), (region,..."
1,Decentralized_Data_Governance_as_Part_of_a_Dat...,socio technical decentralized analytics manage...,"[(described, 0.12276664647116566), (example, 0..."
2,Enterprise_Data_Strategy_A_Decentralized_Data_...,enterprise experience exponential growth centr...,"[(lake, 0.26723160490133363), (ownership, 0.10..."
3,Finding_Your_Way_Through_the_Jungle_of_Big_Dat...,paper present systematic analytical architectu...,"[(warehouse, 0.19212001181825478), (dama, 0.18..."
4,978-1-4842-9253-2_1,look back architecture developed response pain...,"[(new, 0.18503785660208386), (new trend, 0.179..."
5,978-1-4842-9253-2_10,provide high overview fabric evolution elabora...,"[(fabric, 0.19719401418951754), (fabric patter..."
6,978-1-4842-9253-2_11,saw fabric evolution previous architecture inf...,"[(requirement, 0.12160028945123409), (methodol..."
7,978-1-4842-9253-2_12,look intersection two initiative digital trans...,"[(cloud, 0.45885820534265315), (hybrid cloud, ..."
8,978-1-4842-9253-2_13,metadata digital era enterprise know aspect mu...,"[(metadata digital era, 0.2034487518976551), (..."
9,978-1-4842-9253-2_14,applying ai metadata intelligent cataloging qu...,"[(metadata, 0.1949305969947279), (quality asse..."


In [31]:
# Flatten result_df_2['top_words'] into one row per (Filename, topic_word, tfidf_value).
# The records are collected first and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
top_word_records = [
    {'Filename': filename, 'topic_word': word, 'tfidf_value': tfidf}
    for filename, top_words in zip(result_df_2['Filename'], result_df_2['top_words'])
    for word, tfidf in top_words
]

# Passing `columns` keeps the expected schema even when there are no records.
result_df_3 = pd.DataFrame(top_word_records, columns=['Filename', 'topic_word', 'tfidf_value'])

# Display the result DataFrame (rich notebook rendering instead of print)
result_df_3

                                              Filename        topic_word  \
0    Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...      localization   
1    Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...            region   
2    Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...             india   
3    Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...        regulation   
4    Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...           problem   
..                                                 ...               ...   
235                               978-3-031-45021-1_23             speci   
236                               978-3-031-45021-1_23           quantum   
237                               978-3-031-45021-1_23  decentralisation   
238                               978-3-031-45021-1_23         converged   
239                               978-3-031-45021-1_23         equipment   

     tfidf_value  
0       0.394453  
1       0.208828  
2       0.162422  
3       0.1

In [40]:


# Select the top terms for each PDF (at most 10 rows per Filename)
top_terms_df = result_df_3.groupby('Filename').head(10)

# Add a tiny jitter to 'tfidf_value' so that tied scores do not share the same
# rank() position in the chart below. The generator is seeded so the figure is
# reproducible, and .assign() returns a new frame instead of writing into a
# slice of result_df_3.
jitter_rng = np.random.default_rng(42)
top_terms_df = top_terms_df.assign(
    tfidf_value=top_terms_df['tfidf_value'] + jitter_rng.random(len(top_terms_df)) * 0.0001
)

# Shared layout: one row per Filename, terms ordered left-to-right by descending tfidf_value
base = alt.Chart(top_terms_df).encode(
    x=alt.X('rank:O', axis=None),  # Use rank for x-axis
    y='Filename:N',
).transform_window(
    rank="rank()",
    sort=[alt.SortField("tfidf_value", order="descending")],
    groupby=["Filename"]
)

# Heatmap layer; the viridis scale is declared here because each layer sets its own colour encoding
heatmap = base.mark_rect().encode(
    color=alt.Color('tfidf_value:Q', scale=alt.Scale(scheme='viridis')),
)

# Text layer: the term itself, white when tfidf_value >= 0.2 and black otherwise
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.tfidf_value >= 0.2, alt.value('white'), alt.value('black'))
)

# Combine the heatmap and text labels
chart_tfidf = (heatmap + text).properties(width=1500)


In [43]:
# Render the TF-IDF heatmap inline
chart_tfidf

In [44]:
# Export the TF-IDF heatmap to PDF in the working directory at 2x scale.
# NOTE(review): the recorded run of this cell failed with CalledProcessError from
# `npm bin --global`; PDF export appears to depend on an external conversion
# backend being installed — confirm the environment before relying on this file.
chart_tfidf.save('heatmap_chart_tfidf.pdf', scale_factor=2.0)

CalledProcessError: Command '['C:\\Program Files\\nodejs\\npm.CMD', 'bin', '--global']' returned non-zero exit status 1.