In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including pandas deprecation / SettingWithCopy warnings raised by later cells.
warnings.simplefilter("ignore")

# Load the corpus; later cells read its 'pdf_name' and 'pdf_content' columns.
df = pd.read_csv('Data_mesh_cleaned.csv')


c:\Users\sitas\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\sitas\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [14]:
def extract_topics(text, tfidf_vectorizer, lda_model, num_topics=10, num_top_words=10):
    """Assign one document to its most probable LDA topic and list that topic's top terms.

    Parameters
    ----------
    text : str
        Document text; it is vectorized with ``tfidf_vectorizer.transform``.
    tfidf_vectorizer : fitted TF-IDF vectorizer
        Must provide ``transform``, ``get_feature_names_out`` and ``idf_``.
    lda_model : fitted LDA model
        Must provide ``transform`` and ``components_``.
    num_topics : int, default 10
        Not used by the computation; kept only so existing calls keep working.
    num_top_words : int, default 10
        How many of the topic's highest-weighted terms to return.

    Returns
    -------
    tuple
        ``(topic_number, topic_words)``. ``topic_number`` is 1-based.
        ``topic_words`` is a list of ``(term, idf)`` pairs ordered by decreasing
        topic weight. The second element of each pair is the vectorizer's
        corpus-level IDF for that term, not the document's own TF-IDF score.
    """
    # Vectorize the single document. The result is passed on as-is; no dense
    # copy of the full n-gram vocabulary is created.
    doc_vector = tfidf_vectorizer.transform([text])

    # Topic distribution for the document (one row), then its strongest topic.
    doc_topic_prob = lda_model.transform(doc_vector)
    most_probable_topic = np.argmax(doc_topic_prob)

    # Indices of the topic's heaviest terms, largest weight first.
    topic_weights = lda_model.components_[most_probable_topic]
    top_keywords_idx = np.argsort(topic_weights)[::-1][:num_top_words]

    # Column i of the vectorizer output is feature_names[i] and has IDF idf_[i],
    # so both can be read by position without a vocabulary lookup.
    feature_names = tfidf_vectorizer.get_feature_names_out()
    topic_words = [(feature_names[i], tfidf_vectorizer.idf_[i]) for i in top_keywords_idx]

    return most_probable_topic + 1, topic_words


In [15]:
# Fit a TF-IDF model over the uni-, bi- and tri-grams of every document's text.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')
tfidf_data = tfidf_vectorizer.fit_transform(df['pdf_content'])

# Train a 10-component LDA model on the TF-IDF matrix, with a fixed random_state.
lda = LatentDirichletAllocation(random_state=42, n_components=10)
lda.fit(tfidf_data)


def _topics_for_row(row):
    """Run extract_topics on one row's 'pdf_content' with the fitted models."""
    return extract_topics(row['pdf_content'], tfidf_vectorizer, lda)


# One (topic_number, topic_words) tuple per row, unzipped into two new columns.
per_row_topics = df.apply(_topics_for_row, axis=1)
df['topic'], df['topic_words'] = zip(*per_row_topics)

In [4]:
# Flatten df['topic_words'] into long format: one row per (document, topic term).
# The rows are gathered in a list and the frame is built once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call when used inside a loop.
topic_word_records = [
    {'pdf_name': pdf_name, 'topic_word': word, 'idf_value': idf_value}
    for pdf_name, topic_words in zip(df['pdf_name'], df['topic_words'])
    for word, idf_value in topic_words
]

# Naming the columns explicitly keeps the schema even when there are no records.
result_df = pd.DataFrame(topic_word_records, columns=['pdf_name', 'topic_word', 'idf_value'])

# Print the result DataFrame


                        pdf_name        topic_word  idf_value
0                       0670.pdf    term condition   3.233592
1                       0670.pdf      digital twin   3.233592
2                       0670.pdf      numer method   3.639057
3                       0670.pdf        method eng   3.639057
4                       0670.pdf         int numer   3.233592
..                           ...               ...        ...
265  urn_nbn_fi_uef-20211359.pdf  lakehouse fabric   2.540445
266  urn_nbn_fi_uef-20211359.pdf    lake lakehouse   2.386294
267  urn_nbn_fi_uef-20211359.pdf    icdabi limited   3.639057
268  urn_nbn_fi_uef-20211359.pdf     existing lake   3.233592
269  urn_nbn_fi_uef-20211359.pdf      overall need   3.233592

[270 rows x 3 columns]


In [16]:
import altair as alt
import pandas as pd
import numpy as np



# Select the top terms for each PDF (the first ten rows per document, in their
# existing order). .copy() gives an independent frame so the column assignment
# below does not write into a slice of result_df.
top_terms_df = result_df.groupby('pdf_name').head(10).copy()

# Add a tiny jitter to 'idf_value': rank() gives equal values the same rank, so
# tied terms would otherwise share one x position. The generator is seeded so
# the chart comes out the same on every run.
jitter_rng = np.random.default_rng(42)
top_terms_df['idf_value'] = top_terms_df['idf_value'] + jitter_rng.random(top_terms_df.shape[0]) * 0.0001

# Cut-off for the label colour: the midpoint of the plotted value range.
# The previous fixed cut-off of 0.23 lies below every IDF value (IDF is at
# least 1), so the condition was always true and every label got one colour.
text_threshold = float((top_terms_df['idf_value'].min() + top_terms_df['idf_value'].max()) / 2)

# Create a base chart: one cell per term, ordered within each PDF by rank.
base = alt.Chart(top_terms_df).encode(
    x=alt.X('rank:O', axis=None),  # Use rank for x-axis
    y='pdf_name:N',
    color=alt.Color('idf_value:Q', scale=alt.Scale(scheme='viridis')),
    text='topic_word:N'
).transform_window(
    rank="rank()",
    sort=[alt.SortField("idf_value", order="descending")],
    groupby=["pdf_name"]
)

# Create a heatmap; it uses the base colour encoding (viridis) unchanged.
heatmap = base.mark_rect()

# Create text labels. viridis runs from dark (low) to light (high), so cells at
# or above the cut-off get black text and the darker cells get white text.
text = base.mark_text(baseline='middle').encode(
    color=alt.condition(alt.datum.idf_value >= text_threshold, alt.value('black'), alt.value('white'))
)

# Display the heatmap and text labels
(heatmap + text).properties(width=1500)


In [12]:
USING ONLY TFIDF

Unnamed: 0,pdf_name,pdf_content
0,0670.pdf,introducing city platform design marcin kryste...
1,1-s2.0-S1877050921022365-main.pdf,author http license system projman system tech...
2,1-s2.0-S1877050923006099-main.pdf,author http license program chair procs procs ...
3,2302.01713.pdf,practice avoid mess jan bode germany niklas kü...
4,2304.01062.pdf,systematic gray abel goedegebuure indika kumar...
5,978-1-4842-9253-2.pdf,fabric approach ai guide ai cataloging integra...
6,978-3-031-12423-5_7.pdf,cok survey privacy challenge relation mesh nik...
7,978-3-031-36118-0.pdf,lecture note engineering communication technol...
8,978-3-031-39847-6_1.pdf,integration revitalized warehouse lake robert ...
9,978-3-031-45021-1_23.pdf,converging microservice principle uni logical ...


USING ONLY TFIDF

In [45]:
def find_top_words_for_each_document(df, num_top_words=10):
    """Return each document's highest-scoring TF-IDF terms.

    A TF-IDF vectorizer (English stop words removed, 1- to 3-grams) is fitted on
    ``df['pdf_content']`` and every document's terms are ranked by score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'pdf_name' and 'pdf_content'.
    num_top_words : int, default 10
        Number of terms to keep per document; 0 yields empty lists.

    Returns
    -------
    pandas.DataFrame
        Columns 'pdf_name', 'pdf_content' and 'top_words', where 'top_words'
        holds a list of ``(term, tfidf_score)`` tuples in descending score order.
    """
    # Create a TF-IDF vectorizer and fit it on the PDF content
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['pdf_content'])

    # Get feature names (words) from the TF-IDF vectorizer
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Rank one document at a time: only a single row is densified, so the full
    # documents x n-grams matrix is never materialized as a dense DataFrame.
    top_words_for_each_document = []
    for doc_idx in range(tfidf_matrix.shape[0]):
        doc_scores = tfidf_matrix[doc_idx].toarray().ravel()
        # Reverse first, then truncate: a trailing slice such as [-0:] would
        # return every term when num_top_words is 0.
        top_words_index = np.argsort(doc_scores)[::-1][:num_top_words]
        top_words = [(feature_names[index], float(doc_scores[index])) for index in top_words_index]
        top_words_for_each_document.append(top_words)

    # Create a new DataFrame with 'pdf_name', 'pdf_content' and 'top_words'
    result_df = pd.DataFrame({'pdf_name': df['pdf_name'], 'pdf_content': df['pdf_content'], 'top_words': top_words_for_each_document})

    return result_df

In [46]:
# Build the per-document table of top TF-IDF terms (ten terms per document)
result_df_2 = find_top_words_for_each_document(df, num_top_words=10)

# Leave the frame as the cell's last expression so the notebook renders it
result_df_2

Unnamed: 0,pdf_name,pdf_content,top_words
0,0670.pdf,introducing city platform design marcin kryste...,"[(city, 0.518498642926999), (digital twin, 0.1..."
1,1-s2.0-S1877050921022365-main.pdf,author http license system projman system tech...,"[(big, 0.15409979475208258), (lake, 0.12813794..."
2,1-s2.0-S1877050923006099-main.pdf,author http license program chair procs procs ...,"[(monitoring, 0.3140537607465791), (community,..."
3,2302.01713.pdf,practice avoid mess jan bode germany niklas kü...,"[(interviewee, 0.1905634896488923), (bp, 0.158..."
4,2304.01062.pdf,systematic gray abel goedegebuure indika kumar...,"[(product, 0.3323737537582746), (gray, 0.23490..."
5,978-1-4842-9253-2.pdf,fabric approach ai guide ai cataloging integra...,"[(ai, 0.40894699924108674), (fabric, 0.2851429..."
6,978-3-031-12423-5_7.pdf,cok survey privacy challenge relation mesh nik...,"[(privacy, 0.27180647075917264), (identi, 0.23..."
7,978-3-031-36118-0.pdf,lecture note engineering communication technol...,"[(teaching, 0.18206801824247285), (student, 0...."
8,978-3-031-39847-6_1.pdf,integration revitalized warehouse lake robert ...,"[(wrembel, 0.19419032760539323), (optimization..."
9,978-3-031-45021-1_23.pdf,converging microservice principle uni logical ...,"[(microservice, 0.1802468623006914), (service,..."


In [47]:
# Flatten result_df_2['top_words'] into long format: one row per (document, term).
# The rows are gathered in a list and the frame is built once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call when used inside a loop.
top_word_records = [
    {'pdf_name': pdf_name, 'topic_word': word, 'tfidf_value': tfidf_value}
    for pdf_name, top_words in zip(result_df_2['pdf_name'], result_df_2['top_words'])
    for word, tfidf_value in top_words
]

# Naming the columns explicitly keeps the schema even when there are no records.
result_df_3 = pd.DataFrame(top_word_records, columns=['pdf_name', 'topic_word', 'tfidf_value'])

# Display the result DataFrame
print(result_df_3)

                        pdf_name    topic_word  tfidf_value
0                       0670.pdf          city     0.518499
1                       0670.pdf  digital twin     0.108710
2                       0670.pdf        poznan     0.106657
3                       0670.pdf      scenario     0.105048
4                       0670.pdf          twin     0.099039
..                           ...           ...          ...
265  urn_nbn_fi_uef-20211359.pdf      software     0.105699
266  urn_nbn_fi_uef-20211359.pdf     different     0.103203
267  urn_nbn_fi_uef-20211359.pdf        domain     0.093232
268  urn_nbn_fi_uef-20211359.pdf         theme     0.086539
269  urn_nbn_fi_uef-20211359.pdf           new     0.083845

[270 rows x 3 columns]


In [48]:
import altair as alt
import pandas as pd
import numpy as np



# Select the top terms for each PDF (the first ten rows per document, in their
# existing order). .copy() gives an independent frame so the column assignment
# below does not write into a slice of result_df_3.
top_terms_df = result_df_3.groupby('pdf_name').head(10).copy()

# Add a tiny jitter to 'tfidf_value': rank() gives equal values the same rank,
# so tied terms would otherwise share one x position. The generator is seeded
# so the chart comes out the same on every run.
jitter_rng = np.random.default_rng(42)
top_terms_df['tfidf_value'] = top_terms_df['tfidf_value'] + jitter_rng.random(top_terms_df.shape[0]) * 0.0001

# Create a base chart: one cell per term, ordered within each PDF by rank.
base = alt.Chart(top_terms_df).encode(
    x=alt.X('rank:O', axis=None),  # Use rank for x-axis
    y='pdf_name:N',
    color=alt.Color('tfidf_value:Q', scale=alt.Scale(scheme='viridis')),
    text='topic_word:N'
).transform_window(
    rank="rank()",
    sort=[alt.SortField("tfidf_value", order="descending")],
    groupby=["pdf_name"]
)

# Create a heatmap; it uses the base colour encoding (viridis) unchanged.
heatmap = base.mark_rect()

# Create text labels. The condition must read 'tfidf_value': this frame has no
# 'idf_value' column, so the old test never matched and every label was black.
# viridis runs from dark (low) to light (high), so cells at or above the
# cut-off get black text and the darker cells get white text.
text = base.mark_text(baseline='middle').encode(
    color=alt.condition(alt.datum.tfidf_value >= 0.23, alt.value('black'), alt.value('white'))
)

# Display the heatmap and text labels
(heatmap + text).properties(width=1500)
