In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import altair as alt
import os
import re

import numpy as np
import warnings
warnings.simplefilter("ignore")

In [3]:
# Load the pre-cleaned corpus (one row per document) and display it.
df = pd.read_csv(filepath_or_buffer='Data_mesh_cleaned.csv')
df

Unnamed: 0,Filename,Content,Publisher
0,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,chieve effective aggregation data localization...,IEEE
1,Decentralized_Data_Governance_as_Part_of_a_Dat...,pproaches arif wider sumedha verma atif akhtar...,IEEE
2,Enterprise_Data_Strategy_A_Decentralized_Data_...,decentralized data vijay kumar butte sujata bu...,IEEE
3,Finding_Your_Way_Through_the_Jungle_of_Big_Dat...,rchitectures torsten priebe sebastian neumaier...,IEEE
4,978-1-4842-9253-2_1,look back data architecture developed response...,Springer
5,978-1-4842-9253-2_10,provide high overview data fabric data evoluti...,Springer
6,978-1-4842-9253-2_11,saw data fabric architecture evolution previou...,Springer
7,978-1-4842-9253-2_12,look intersection two initiative digital trans...,Springer
8,978-1-4842-9253-2_13,metadata digital era enterprise know aspect da...,Springer
9,978-1-4842-9253-2_14,applying ai metadata intelligent cataloging da...,Springer


In [23]:
# NOTE: disabled experiment kept for reference only; no cell below defines or uses
# `token_pattern` from here (the vectorizers pass token_pattern=r'\b\w+\b' directly).
# NOTE(review): the lookahead (?=.*\b\1\b) requires the captured word to occur again
# later in the text, so the pattern appears to select repeated words rather than
# exclude them as the inline comment says -- confirm before re-enabling.
# def generate_custom_token_pattern(ngram_min, ngram_max):
#     # Ensure no word repetition in n-grams
#     word_pattern = r'\b(\w+)\b(?=.*\b\1\b)'
    
#     # Combine word patterns for specified n-gram range
#     ngram_pattern = r'(?:' + word_pattern + r'\s){' + str(ngram_min - 1) + r',' + str(ngram_max - 1) + r'}' + word_pattern
    
#     return ngram_pattern

# token_pattern = generate_custom_token_pattern(1, 4)

In [24]:
def extract_topics(text, tfidf_vectorizer, lda_model, num_top_words=10):
    """Assign one document to its most probable LDA topic and list that topic's top words.

    Parameters
    ----------
    text : str
        Document text; it is vectorised with the already-fitted ``tfidf_vectorizer``.
    tfidf_vectorizer : fitted vectorizer
        Must provide ``transform``, ``get_feature_names_out`` and ``idf_``.
    lda_model : fitted topic model
        Must provide ``transform`` and ``components_`` and be fitted on the
        vectorizer's vocabulary.
    num_top_words : int, default 10
        Number of words to report for the selected topic.

    Returns
    -------
    tuple
        ``(topic_number, topic_words)`` where ``topic_number`` is 1-based and
        ``topic_words`` is a list of ``(word, idf)`` pairs ordered by descending
        weight of the word in the topic. The second element of each pair is the
        corpus-level IDF weight of the word, not a per-document TF-IDF score.
    """
    # Keep the document vector as returned by the vectorizer; the model consumes it
    # directly, so no dense copy of the full n-gram vocabulary is needed.
    doc_vector = tfidf_vectorizer.transform([text])
    doc_topic_prob = lda_model.transform(doc_vector)

    # Only one document is transformed, so the flat argmax is the topic index.
    most_probable_topic = np.argmax(doc_topic_prob)

    # Feature indices of the heaviest words in that topic, heaviest first.
    topic_weights = lda_model.components_[most_probable_topic]
    top_keywords_idx = topic_weights.argsort()[::-1][:num_top_words]

    # Feature names and idf_ share the vectorizer's column order, so both can be
    # indexed with the same feature index.
    feature_names = np.asarray(tfidf_vectorizer.get_feature_names_out())
    idf_weights = tfidf_vectorizer.idf_
    topic_words = [(feature_names[i], idf_weights[i]) for i in top_keywords_idx]

    return most_probable_topic + 1, topic_words


In [26]:
# Vectorise the corpus: English stop words removed, 1- to 4-grams, and a token
# pattern that also keeps single-character tokens.
tfidf_vectorizer = TfidfVectorizer(
    token_pattern=r'\b\w+\b',
    ngram_range=(1, 4),
    stop_words='english',
)
tfidf_data = tfidf_vectorizer.fit_transform(df['Content'])

# Fit a 20-topic LDA model on the TF-IDF matrix (fixed seed for repeatable topics).
lda = LatentDirichletAllocation(n_components=20, random_state=42)
lda.fit(tfidf_data)

# Label every document in the 'Content' column with its dominant topic and that
# topic's top words, stored as two new columns on df.
topic_numbers, topic_word_lists = zip(
    *(extract_topics(content, tfidf_vectorizer, lda) for content in df['Content'])
)
df['topic'] = topic_numbers
df['topic_words'] = topic_word_lists

In [27]:
# Show df again, now including the 'topic' and 'topic_words' columns added above.
df

Unnamed: 0,Filename,Content,Publisher,topic,topic_words
0,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,chieve effective aggregation data localization...,IEEE,10,"[(localization, 4.044522437723423), (data loca..."
1,Decentralized_Data_Governance_as_Part_of_a_Dat...,pproaches arif wider sumedha verma atif akhtar...,IEEE,17,"[(port, 2.6582280766035327), (city, 2.65822807..."
2,Enterprise_Data_Strategy_A_Decentralized_Data_...,decentralized data vijay kumar butte sujata bu...,IEEE,9,"[(principle, 1.5187937934151674), (paper, 1.69..."
3,Finding_Your_Way_Through_the_Jungle_of_Big_Dat...,rchitectures torsten priebe sebastian neumaier...,IEEE,2,"[(logical data warehouse, 3.6390573296152584),..."
4,978-1-4842-9253-2_1,look back data architecture developed response...,Springer,1,"[(dwh, 3.128231705849268), (edw, 3.12823170584..."
5,978-1-4842-9253-2_10,provide high overview data fabric data evoluti...,Springer,11,"[(id, 2.9459101490553135), (intelligent catalo..."
6,978-1-4842-9253-2_11,saw data fabric architecture evolution previou...,Springer,9,"[(principle, 1.5187937934151674), (paper, 1.69..."
7,978-1-4842-9253-2_12,look intersection two initiative digital trans...,Springer,16,"[(hybrid cloud, 2.791759469228055), (hybrid, 2..."
8,978-1-4842-9253-2_13,metadata digital era enterprise know aspect da...,Springer,11,"[(id, 2.9459101490553135), (intelligent catalo..."
9,978-1-4842-9253-2_14,applying ai metadata intelligent cataloging da...,Springer,8,"[(data, 1.0), (product, 1.1267517056391438), (..."


In [28]:
# Flatten the per-document keyword lists into long format:
# one record per (document, keyword) pair.
result_data = [
    {'Filename': filename, 'topic_word': keyword, 'idf_value': keyword_idf}
    for filename, keyword_pairs in zip(df['Filename'], df['topic_words'])
    for keyword, keyword_idf in keyword_pairs
]

# Fixing the column list keeps the schema stable even if no records were produced.
result_df = pd.DataFrame(result_data, columns=['Filename', 'topic_word', 'idf_value'])

# Display the long-format table
result_df

Unnamed: 0,Filename,topic_word,idf_value
0,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,localization,4.044522
1,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,data localization,4.044522
2,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,ai engineering,4.044522
3,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,ai lifecycle,2.791759
4,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,terminology data fabric,4.044522
...,...,...,...
405,urn_nbn_fi_uef-20211359,pak,2.945910
406,urn_nbn_fi_uef-20211359,pak data,2.945910
407,urn_nbn_fi_uef-20211359,cloud pak data,2.945910
408,urn_nbn_fi_uef-20211359,cloud pak,2.945910


In [29]:
# Select the top terms for each PDF
top_terms_df = result_df.groupby('Filename').head(10)

# Add a small random value to 'idf_value' for better visualization
top_terms_df['idf_value'] = top_terms_df['idf_value'] + np.random.rand(top_terms_df.shape[0]) * 0.0001

# Create a base chart
base = alt.Chart(top_terms_df).encode(
    x=alt.X('rank:O', axis=None),  # Use rank for x-axis
    y='Filename:N',
    color=alt.Color('idf_value:Q', scale=alt.Scale(scheme='viridis')),
    text='topic_word:N'  # Corrected column name to 'topic_word'
).transform_window(
    rank="rank()",
    sort=[alt.SortField("idf_value", order="descending")],
    groupby=["Filename"]
)

# Create a heatmap
heatmap = base.mark_rect().encode(
    color='idf_value:Q',
)

# Create text labels with conditional color
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.idf_value >= 2.0, alt.value('white'), alt.value('black'))
)

# Display the heatmap and text labels
chart_lda = (heatmap + text).properties(width=2000)


In [30]:
# Render the per-document LDA keyword heat map.
chart_lda

In [35]:
# Long format keyed by publisher: one record per (publisher, keyword) pair, taken
# from every document's topic_words list.
result_data_publisher = [
    {'Publisher': publisher, 'topic_word': keyword, 'idf_value': keyword_idf}
    for publisher, keyword_pairs in zip(df['Publisher'], df['topic_words'])
    for keyword, keyword_idf in keyword_pairs
]

result_df_publisher = pd.DataFrame(result_data_publisher)


result_df_publisher

Unnamed: 0,Publisher,topic_word,idf_value
0,IEEE,localization,4.044522
1,IEEE,data localization,4.044522
2,IEEE,ai engineering,4.044522
3,IEEE,ai lifecycle,2.791759
4,IEEE,terminology data fabric,4.044522
...,...,...,...
405,miscellaneous,pak,2.945910
406,miscellaneous,pak data,2.945910
407,miscellaneous,cloud pak data,2.945910
408,miscellaneous,cloud pak,2.945910


In [36]:
# Select the top terms for each PDF
top_terms_df = result_df_publisher.groupby('Publisher').head(10)

# Add a small random value to 'idf_value' for better visualization
top_terms_df['idf_value'] = top_terms_df['idf_value'] + np.random.rand(top_terms_df.shape[0]) * 0.0001

# Create a base chart
base = alt.Chart(top_terms_df).encode(
    x=alt.X('rank:O', axis=None),  # Use rank for x-axis
    y='Publisher:N',
    color=alt.Color('idf_value:Q', scale=alt.Scale(scheme='viridis')),
    text='topic_word:N'  # Corrected column name to 'topic_word'
).transform_window(
    rank="rank()",
    sort=[alt.SortField("idf_value", order="descending")],
    groupby=["Publisher"]
)

# Create a heatmap
heatmap = base.mark_rect().encode(
    color='idf_value:Q',
)

# Create text labels with conditional color
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.idf_value >= 3.0, alt.value('white'), alt.value('black'))
)

# Display the heatmap and text labels
chart_lda_publisher = (heatmap + text).properties(width=1500)
chart_lda_publisher

Using only TF-IDF scores (no LDA)

In [44]:
def find_top_words_for_each_document(df, num_top_words=10):
    """Return the highest-scoring TF-IDF n-grams for every document.

    A fresh ``TfidfVectorizer`` (English stop words, 2- to 4-grams) is fitted on
    ``df['Content']`` and each document's n-grams are ranked by TF-IDF score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Filename'`` and ``'Content'``.
    num_top_words : int, default 10
        Number of n-grams to keep per document; ``0`` yields empty lists.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Filename'``, ``'Content'`` and ``'top_words'``; each
        ``'top_words'`` entry is a list of ``(ngram, tfidf_score)`` pairs ordered by
        descending score.
    """
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2, 4), token_pattern=r'\b\w+\b')
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['Content'])
    feature_names = tfidf_vectorizer.get_feature_names_out()

    top_words_for_each_document = []
    for doc_position in range(tfidf_matrix.shape[0]):
        # Densify one document at a time instead of the whole docs x n-grams matrix.
        doc_scores = tfidf_matrix[doc_position].toarray().ravel()

        # Reverse the ascending argsort, then truncate; unlike a [-n:] slice this
        # also behaves for num_top_words == 0.
        top_positions = doc_scores.argsort()[::-1][:num_top_words]

        # Plain positional indexing on arrays; no label/position ambiguity.
        top_words_for_each_document.append(
            [(feature_names[position], doc_scores[position]) for position in top_positions]
        )

    return pd.DataFrame({
        'Filename': df['Filename'],
        'Content': df['Content'],
        'top_words': top_words_for_each_document,
    })

In [50]:
# Call the function
result_df_2 = find_top_words_for_each_document(df)

# Display the result DataFrame
result_df_2['Publisher'] = df['Publisher']
result_df_2

Unnamed: 0,Filename,Content,top_words,Publisher
0,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,chieve effective aggregation data localization...,"[(data localization, 0.6461519865416517), (dat...",IEEE
1,Decentralized_Data_Governance_as_Part_of_a_Dat...,pproaches arif wider sumedha verma atif akhtar...,"[(data product, 0.3474289493730131), (output p...",IEEE
2,Enterprise_Data_Strategy_A_Decentralized_Data_...,decentralized data vijay kumar butte sujata bu...,"[(data product, 0.14960855571902104), (landing...",IEEE
3,Finding_Your_Way_Through_the_Jungle_of_Big_Dat...,rchitectures torsten priebe sebastian neumaier...,"[(logical data warehouse, 0.19271604639305134)...",IEEE
4,978-1-4842-9253-2_1,look back data architecture developed response...,"[(data architecture, 0.1593842444893316), (evo...",Springer
5,978-1-4842-9253-2_10,provide high overview data fabric data evoluti...,"[(data fabric, 0.2457513522492156), (fabric da...",Springer
6,978-1-4842-9253-2_11,saw data fabric architecture evolution previou...,"[(application architecture, 0.1647607467120480...",Springer
7,978-1-4842-9253-2_12,look intersection two initiative digital trans...,"[(hybrid cloud, 0.38900951209618134), (data hy...",Springer
8,978-1-4842-9253-2_13,metadata digital era enterprise know aspect da...,"[(intelligent cataloging, 0.19566048410964015)...",Springer
9,978-1-4842-9253-2_14,applying ai metadata intelligent cataloging da...,"[(fabric data aspect, 0.17310408189191503), (a...",Springer


In [46]:
# Long format: one record per (document, n-gram) pair with its TF-IDF score.
result_data_3 = [
    {'Filename': filename, 'topic_word': ngram, 'tfidf_value': score}
    for filename, scored_ngrams in zip(result_df_2['Filename'], result_df_2['top_words'])
    for ngram, score in scored_ngrams
]

result_df_3 = pd.DataFrame(result_data_3)

result_df_3

Unnamed: 0,Filename,topic_word,tfidf_value
0,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,data localization,0.646152
1,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,data aggregation,0.079440
2,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,potential benefit,0.066134
3,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,localization data,0.064777
4,Breaking_Down_Data_Silos_Data_Mesh_to_Achieve_...,data localization data,0.062186
...,...,...,...
405,urn_nbn_fi_uef-20211359,data architecture,0.050306
406,urn_nbn_fi_uef-20211359,ubiquitous language,0.049491
407,urn_nbn_fi_uef-20211359,data framework,0.048799
408,urn_nbn_fi_uef-20211359,conformed dimension,0.047457


In [47]:
# Select the top terms for each PDF
top_terms_df = result_df_3.groupby('Filename').head(10)

# Add a small random value to 'idf_value' for better visualization
top_terms_df['tfidf_value'] = top_terms_df['tfidf_value'] + np.random.rand(top_terms_df.shape[0]) * 0.0001

# Create a base chart
base = alt.Chart(top_terms_df).encode(
    x=alt.X('rank:O', axis=None),  # Use rank for x-axis
    y='Filename:N',
    color=alt.Color('tfidf_value:Q', scale=alt.Scale(scheme='viridis')),
    text='topic_word:N'  # Corrected column name to 'topic_word'
).transform_window(
    rank="rank()",
    sort=[alt.SortField("tfidf_value", order="descending")],
    groupby=["Filename"]
)

# Create a heatmap
heatmap = base.mark_rect().encode(
    color='tfidf_value:Q',
)

# Create text labels with conditional color
text = base.mark_text(baseline='middle').encode(
    text='topic_word:N',
    color=alt.condition(alt.datum.tfidf_value >= 0.2, alt.value('white'), alt.value('black'))
)

# Display the heatmap and text labels
chart_tfidf = (heatmap + text).properties(width=2000)


In [48]:
# Render the per-document TF-IDF heat map.
chart_tfidf

In [54]:
# Long format keyed by publisher: one record per (publisher, n-gram) pair.
# Despite its name, the 'idf_value' key holds the TF-IDF score from 'top_words';
# the key is kept as-is because the plotting cell reads it.
result_data_publisher_2 = [
    {'Publisher': publisher, 'top_word': ngram, 'idf_value': score}
    for publisher, scored_ngrams in zip(result_df_2['Publisher'], result_df_2['top_words'])
    for ngram, score in scored_ngrams
]

result_df_publisher_2 = pd.DataFrame(result_data_publisher_2)


result_df_publisher_2

Unnamed: 0,Publisher,top_word,idf_value
0,IEEE,data localization,0.646152
1,IEEE,data aggregation,0.079440
2,IEEE,potential benefit,0.066134
3,IEEE,localization data,0.064777
4,IEEE,data localization data,0.062186
...,...,...,...
405,miscellaneous,data architecture,0.050306
406,miscellaneous,ubiquitous language,0.049491
407,miscellaneous,data framework,0.048799
408,miscellaneous,conformed dimension,0.047457


In [72]:
# Select the top terms for each PDF
top_terms_df_2 = result_df_publisher_2.groupby('Publisher').head(10)

# Add a small random value to 'idf_value' for better visualization
top_terms_df_2['idf_value'] = top_terms_df_2['idf_value'] + np.random.rand(top_terms_df_2.shape[0]) * 0.0001

# Create a base chart
base = alt.Chart(top_terms_df_2).encode(
    x=alt.X('rank:O', axis=None),  # Use rank for x-axis
    y='Publisher:N',
    color=alt.Color('idf_value:Q', scale=alt.Scale(scheme='viridis')),
    text='top_word:N'  # Corrected column name to 'topic_word'
).transform_window(
    rank="rank()",
    sort=[alt.SortField("idf_value", order="descending")],
    groupby=["Publisher"]
)

# Create a heatmap
heatmap = base.mark_rect().encode(
    color='idf_value:Q',
)

# Create text labels with conditional color
text = base.mark_text(baseline='middle').encode(
    text='top_word:N',
    color=alt.condition(alt.datum.idf_value >= 0.4, alt.value('white'), alt.value('black'))
)

# Display the heatmap and text labels
chart_lda_publisher_2 = (heatmap + text).properties(width=1500)
chart_lda_publisher_2