In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import warnings
# Install a catch-all "ignore" filter so no warning is shown by later cells.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings
# raised further down; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Load the corpus table; later cells read its 'pdf_name' and 'pdf_content' columns.
df = pd.read_csv('Data_mesh_cleaned.csv')


c:\Users\sitas\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\sitas\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [14]:
def extract_topics(text, tfidf_vectorizer, lda_model, num_topics=10, num_top_words=10):
    """Assign one document to its most probable topic and list that topic's top terms.

    Parameters
    ----------
    text : str
        Document text; it is vectorized with the already-fitted ``tfidf_vectorizer``.
    tfidf_vectorizer : fitted vectorizer
        Must expose ``transform``, ``get_feature_names_out`` and ``idf_``.
    lda_model : fitted topic model
        Must expose ``transform`` and ``components_``. It is assumed to have been
        fitted on features produced by ``tfidf_vectorizer``.
    num_topics : int, optional
        Unused. Retained only so existing callers keep working; the number of
        topics is whatever ``lda_model`` was fitted with.
    num_top_words : int, optional
        Number of highest-weighted topic terms to return (default 10).

    Returns
    -------
    tuple
        ``(topic_number, topic_words)``. ``topic_number`` is 1-based.
        ``topic_words`` is a list of ``(term, idf)`` pairs ordered by descending
        topic weight. The second element is the corpus-level IDF weight of the
        term, not the TF-IDF score of the term inside ``text``.

    Raises
    ------
    ValueError
        If ``num_top_words`` is negative.
    """
    if num_top_words < 0:
        raise ValueError("num_top_words must be non-negative")

    # Vectorize just this document; the result stays sparse (no dense copy needed).
    doc_vector = tfidf_vectorizer.transform([text])
    feature_names = np.asarray(tfidf_vectorizer.get_feature_names_out())

    # Topic distribution for the single document, then the index of the largest entry.
    doc_topic_prob = lda_model.transform(doc_vector)
    most_probable_topic = np.argmax(doc_topic_prob)

    # Highest-weighted terms of that topic, strongest first.
    topic_weights = lda_model.components_[most_probable_topic]
    top_keywords_idx = topic_weights.argsort()[::-1][:num_top_words]
    top_keywords = [feature_names[i] for i in top_keywords_idx]

    # Feature names and idf_ share the same column order, so the column index
    # can be used directly instead of a round trip through vocabulary_.
    top_idf_values = [tfidf_vectorizer.idf_[i] for i in top_keywords_idx]
    topic_words = list(zip(top_keywords, top_idf_values))

    return most_probable_topic + 1, topic_words


In [15]:
# Bigram-only TF-IDF features over every document, English stop words removed
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')
tfidf_data = tfidf_vectorizer.fit_transform(df['pdf_content'])

# Fit a 10-component LDA model on the TF-IDF matrix (fixed seed)
lda = LatentDirichletAllocation(random_state=42, n_components=10)
lda.fit(tfidf_data)

# Run extract_topics on each document's text, then split the resulting
# (topic, topic_words) pairs into two new columns of 'df'
per_document = df['pdf_content'].apply(
    lambda content: extract_topics(content, tfidf_vectorizer, lda)
)
topic_numbers, topic_word_lists = zip(*per_document)
df['topic'] = topic_numbers
df['topic_words'] = topic_word_lists

In [4]:
# Flatten the per-document 'topic_words' lists into one row per (pdf, term) pair.
# The records are collected first and the frame is built once: DataFrame.append
# no longer exists in pandas 2.x, and on older versions it copied the whole
# frame on every iteration.
topic_word_records = [
    {'pdf_name': pdf_name, 'topic_word': word, 'idf_value': idf_value}
    for pdf_name, topic_words in zip(df['pdf_name'], df['topic_words'])
    for word, idf_value in topic_words
]

# Passing the columns explicitly keeps the expected schema even when there are no records.
result_df = pd.DataFrame(topic_word_records, columns=['pdf_name', 'topic_word', 'idf_value'])

# Show the result DataFrame
result_df


                        pdf_name        topic_word  idf_value
0                       0670.pdf    term condition   3.233592
1                       0670.pdf      digital twin   3.233592
2                       0670.pdf      numer method   3.639057
3                       0670.pdf        method eng   3.639057
4                       0670.pdf         int numer   3.233592
..                           ...               ...        ...
265  urn_nbn_fi_uef-20211359.pdf  lakehouse fabric   2.540445
266  urn_nbn_fi_uef-20211359.pdf    lake lakehouse   2.386294
267  urn_nbn_fi_uef-20211359.pdf    icdabi limited   3.639057
268  urn_nbn_fi_uef-20211359.pdf     existing lake   3.233592
269  urn_nbn_fi_uef-20211359.pdf      overall need   3.233592

[270 rows x 3 columns]


In [16]:
import altair as alt
import pandas as pd
import numpy as np

# Assuming you already have the 'result_df' DataFrame from the previous code

# Select the top terms for each PDF. The explicit copy makes this an independent
# frame, so the column assignment below never writes into a slice of 'result_df'.
top_terms_df = result_df.groupby('pdf_name').head(10).copy()

# Add a tiny jitter to 'idf_value' (presumably so that equal values do not share
# a rank and end up drawn on top of each other). The generator is seeded so the
# chart is identical on every run.
jitter_rng = np.random.default_rng(42)
top_terms_df['idf_value'] = (
    top_terms_df['idf_value'].astype(float)
    + jitter_rng.random(top_terms_df.shape[0]) * 0.0001
)

# Label colour switches at the middle of the observed value range. The previous
# fixed cut-off of 0.23 was below every value in the data, so it never switched.
if top_terms_df.empty:
    idf_midpoint = 0.0
else:
    idf_midpoint = float(
        (top_terms_df['idf_value'].min() + top_terms_df['idf_value'].max()) / 2
    )

# Create a base chart: one row per PDF, terms placed left-to-right by descending value
base = alt.Chart(top_terms_df).encode(
    x=alt.X('rank:O', axis=None),  # Use rank for x-axis
    y='pdf_name:N',
    text='topic_word:N'
).transform_window(
    rank="rank()",
    sort=[alt.SortField("idf_value", order="descending")],
    groupby=["pdf_name"]
)

# Create a heatmap. The viridis scale is declared here, on the layer that uses
# it; re-encoding 'color' with a bare shorthand would replace the channel and
# drop the scale.
heatmap = base.mark_rect().encode(
    color=alt.Color('idf_value:Q', scale=alt.Scale(scheme='viridis')),
)

# Create text labels. Viridis runs from dark (low) to light (high), so high
# values get black text and low values get white text.
text = base.mark_text(baseline='middle').encode(
    color=alt.condition(alt.datum.idf_value >= idf_midpoint, alt.value('black'), alt.value('white'))
)

# Display the heatmap and text labels
(heatmap + text).properties(width=1500)


In [17]:
# Clear the warnings filter list, undoing the blanket "ignore" installed in the first cell.
# NOTE(review): per the warnings module docs this discards every filter, not just
# the one added by this notebook, so it does not restore the earlier filter state.
warnings.resetwarnings()