In [None]:
from collections import defaultdict

import gensim
import numpy as np
import pandas as pd
from gensim import corpora


# Location of the preprocessed corpus; point this at your own copy of the file
file_path = '/content/drive/MyDrive/combined_texts_preprocessed.json'

# The file is read with lines=True, i.e. one JSON record per line
combined_texts_preprocessed = pd.read_json(path_or_buf=file_path, lines=True)

# Show the loaded frame as the cell output
combined_texts_preprocessed

In [None]:
# Step 1: Turn every bigram into a single underscore-joined token
def flatten_bigrams(bigrams):
    """Return one ``"first_second"`` string per bigram in ``bigrams``.

    Only the first two items of each bigram are read; the result is a new
    list with one string per input bigram, in the same order.
    """
    joined = []
    for pair in bigrams:
        first, second = pair[0], pair[1]
        joined.append(f"{first}_{second}")
    return joined

# Build, per document, the list of underscore-joined bigram tokens
combined_texts_preprocessed['flattened_bigrams'] = (
    combined_texts_preprocessed['preprocessed_text_bigrams'].map(flatten_bigrams)
)

# Merge the unigram tokens with the flattened bigram tokens
def combine_tokens(unigrams, bigrams):
    """Concatenate ``unigrams`` and ``bigrams`` with ``+``, unigrams first."""
    merged = unigrams + bigrams
    return merged

# One token list per document: its unigrams followed by its flattened bigrams
combined_texts_preprocessed['aggregated_tokens'] = [
    combine_tokens(unigrams, bigrams)
    for unigrams, bigrams in zip(
        combined_texts_preprocessed['preprocessed_text'],
        combined_texts_preprocessed['flattened_bigrams'],
    )
]

# Step 2: Gensim vocabulary over all aggregated tokens
dictionary = corpora.Dictionary(combined_texts_preprocessed['aggregated_tokens'])

# Remove every token whose document frequency is 1 or less
low_df_ids = [
    token_id for token_id in dictionary.dfs if dictionary.dfs[token_id] <= 1
]
dictionary.filter_tokens(bad_ids=low_df_ids)

# Bag-of-words vectors built after filtering, so removed tokens are left out
bow_corpus = list(map(dictionary.doc2bow, combined_texts_preprocessed['aggregated_tokens']))

# Step 3: Fit TF-IDF on the bag-of-words corpus and wrap the corpus with it
tfidf = gensim.models.TfidfModel(corpus=bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

# `dictionary`, `bow_corpus`, and `tfidf_corpus` are now available to later cells.

In [None]:
from gensim.models import CoherenceModel

from gensim.models import LdaModel, CoherenceModel
import matplotlib.pyplot as plt

# Function to train LDA model
def train_lda_model(corpus, dictionary, num_topics, chunksize=2000, passes=20,
                    iterations=400, eval_every=None, random_state=None):
    """Train and return a gensim ``LdaModel``.

    Parameters
    ----------
    corpus : bag-of-words corpus passed to ``LdaModel`` as ``corpus``.
    dictionary : id-to-word mapping passed to ``LdaModel`` as ``id2word``.
    num_topics : number of topics to fit.
    chunksize, passes, iterations, eval_every : forwarded unchanged to
        ``LdaModel``.
    random_state : optional seed forwarded to ``LdaModel``. The default
        ``None`` keeps the previous unseeded behaviour; pass an int to make
        repeated runs comparable.

    ``alpha`` and ``eta`` are always passed as ``'auto'``.
    """
    return LdaModel(
        corpus=corpus,
        id2word=dictionary,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every,
        random_state=random_state,
    )

# Function to compute coherence score
def compute_coherence_values(model, corpus, dictionary, texts, coherence='c_v'):
    """Return the coherence of ``model`` under the measure named by ``coherence``.

    All arguments are forwarded by keyword to ``CoherenceModel``; the value
    returned is the result of its ``get_coherence()`` call.
    """
    scorer = CoherenceModel(
        model=model,
        corpus=corpus,
        dictionary=dictionary,
        texts=texts,
        coherence=coherence,
    )
    score = scorer.get_coherence()
    return score

# Per-model results, appended in the same order as `topic_range`
models = []
coherence_values_cv = []
coherence_values_umass = []

# Candidate topic counts: 2 through 10 inclusive
topic_range = range(2, 11)

for num_topics in topic_range:
    print(f"Training LDA model with {num_topics} topics...")

    # Fit one model for this topic count and keep it for later inspection
    model = train_lda_model(corpus=bow_corpus, dictionary=dictionary, num_topics=num_topics)
    models.append(model)

    # Score the same model with both coherence measures
    coherence_cv = compute_coherence_values(
        model, bow_corpus, dictionary,
        texts=combined_texts_preprocessed['aggregated_tokens'],
        coherence='c_v',
    )
    coherence_values_cv.append(coherence_cv)

    coherence_umass = compute_coherence_values(
        model, bow_corpus, dictionary,
        texts=combined_texts_preprocessed['aggregated_tokens'],
        coherence='u_mass',
    )
    coherence_values_umass.append(coherence_umass)

    print(f"Coherence (c_v): {coherence_cv}, Coherence (u_mass): {coherence_umass}")

# Both coherence curves on one figure, against the number of topics
plt.figure(figsize=(10, 5))
plt.plot(topic_range, coherence_values_cv, marker='o', label="c_v Coherence")
plt.plot(topic_range, coherence_values_umass, marker='o', label="u_mass Coherence")

plt.title("Coherence Scores by Number of Topics")
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.legend(("c_v Coherence", "u_mass Coherence"), loc='best')
plt.show()


In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Hyperparameters of the final model.
num_topics = 3
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Skip perplexity evaluation during training; it takes too much time.

# Look up one entry first so the dictionary is "loaded" before its id -> token
# mapping is read (presumably the mapping is built lazily; confirm against gensim).
temp = dictionary[0]
id2word = dictionary.id2token

# Fit the LDA model on the bag-of-words corpus with the settings above.
model = LdaModel(
    corpus=bow_corpus,
    id2word=id2word,
    num_topics=num_topics,
    alpha='auto',
    eta='auto',
    chunksize=chunksize,
    passes=passes,
    iterations=iterations,
    eval_every=eval_every,
)

In [None]:
from pprint import pprint

# Topics of the trained model as returned by top_topics for the corpus
top_topics = model.top_topics(bow_corpus)

# Mean coherence: the second item of every entry, summed, over the topic count.
avg_topic_coherence = sum(entry[1] for entry in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

In [None]:
# For every topic, its five highest-ranked words as returned by show_topic
num_words = 5
top_words_per_topic = [
    model.show_topic(topic_id, topn=num_words) for topic_id in range(num_topics)
]


In [None]:
# Same top-words lookup as the previous cell, recomputed here so that the
# plotting code below has its input in the same cell.
num_words = 5
top_words_per_topic = list(
    map(lambda topic_id: model.show_topic(topic_id, topn=num_words), range(num_topics))
)

def plot_topic_barcharts(top_words_per_topic):
    """Draw one panel per topic with corpus word counts and topic weights.

    Parameters
    ----------
    top_words_per_topic : sequence with one entry per topic; each entry is a
        sequence of ``(word, weight)`` pairs.

    Each panel shows two bar series on twin y-axes: the total count of every
    word in the global ``bow_corpus`` (left axis) and the word's weight in the
    topic (right axis). Word ids are looked up in the global ``dictionary``.

    The grid has two columns and as many rows as the number of topics needs,
    so any topic count works; panels left over in the last row are hidden and
    the three-colour palette is reused cyclically.
    """
    import math

    topics = list(top_words_per_topic)
    if not topics:
        # Nothing to draw.
        return

    n_cols = 2
    n_rows = math.ceil(len(topics) / n_cols)
    # 8 inches of height per row keeps the original 12x16 size for a 2x2 grid.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(12, 8 * n_rows), squeeze=False)
    axs = axs.flatten()  # Flatten the array to easily iterate over it

    colors = ['#1f77b4', '#ff7f0e', '#2ca02c']  # Colors for the bars

    # Total corpus count of every plotted word, gathered in a single pass over
    # the corpus instead of one full scan per word.
    wanted_ids = {dictionary.token2id[word] for topic in topics for word, _ in topic}
    totals = dict.fromkeys(wanted_ids, 0)
    for doc in bow_corpus:
        for token_id, freq in doc:
            if token_id in totals:
                totals[token_id] += freq

    for topic_id, topic in enumerate(topics):
        words, weights = zip(*topic)
        word_counts = [totals[dictionary.token2id[word]] for word in words]

        color = colors[topic_id % len(colors)]
        positions = np.arange(len(words))

        ax_words = axs[topic_id]
        ax_weights = ax_words.twinx()  # Twin axes sharing the x-axis

        # Bar chart for word counts (left axis)
        ax_words.bar(positions, word_counts, color=color, alpha=0.9, width=0.2, label='Word Count')
        ax_words.set_ylabel('Word Count', color=color)
        ax_words.tick_params(axis='y', labelcolor=color)

        # Bar chart for weights (right axis), shifted so the bars sit side by side
        ax_weights.bar(positions + 0.4, weights, color=color, alpha=0.4, width=0.3, label='Weights')
        ax_weights.set_ylabel('Weights', color=color)
        ax_weights.tick_params(axis='y', labelcolor=color)

        ax_words.set_xticks(positions + 0.2)
        ax_words.set_xticklabels(words, rotation=45, ha='right')
        ax_words.set_title(f'Topic {topic_id}', fontsize=16, color=color)

        ax_words.legend(loc='upper left')
        ax_weights.legend(loc='upper right')

    # Hide grid cells that have no topic (e.g. the fourth cell for three topics).
    for spare_ax in axs[len(topics):]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Plot the bar charts
plot_topic_barcharts(top_words_per_topic)

In [None]:
#Topic distribution over campo_analisi


# Step 1: Build one dense topic-probability vector per document
topic_distributions = []
for doc_bow in bow_corpus:
    # Get the topic distribution for the document as (topic_id, probability) pairs
    doc_topics = model.get_document_topics(doc_bow, minimum_probability=0)

    # Write each probability at its own topic id. This keeps every vector at
    # length num_topics even if gensim leaves a topic out of the returned list
    # (it can omit topics with a vanishingly small probability).
    doc_topics_array = np.zeros(num_topics)
    for topic_id, topic_prob in doc_topics:
        doc_topics_array[topic_id] = topic_prob

    # Set contributions less than 1% to zero (the vector is NOT renormalized,
    # so a row may sum to slightly less than 1)
    doc_topics_array[doc_topics_array < 0.01] = 0

    topic_distributions.append(doc_topics_array)

# Step 2: Create a DataFrame from the topic distributions.
df_topic_distribution = pd.DataFrame(topic_distributions, columns=[f"Topic {i}" for i in range(num_topics)])

# Step 3: Attach each document's field. bow_corpus was built row by row from
# combined_texts_preprocessed, so the values are assigned by position rather
# than by index label.
df_topic_distribution['Campo_analisi'] = combined_texts_preprocessed['Campo_analisi'].to_numpy()

# Step 4: Aggregate topic distributions by field (Campo_analisi)
df_field_distribution = df_topic_distribution.groupby('Campo_analisi').mean()

# Step 5: Find the dominant topic for each field
df_field_distribution['Dominant Topic'] = df_field_distribution.idxmax(axis=1)

# Step 6: Turn the field index back into a regular column
df_field_distribution = df_field_distribution.reset_index()

# Display the final table
df_field_distribution

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


# Step 1: Keep only the columns whose name starts with 'Topic'; this leaves out
# 'Campo_analisi' and 'Dominant Topic'
topic_columns = [
    column for column in df_field_distribution.columns if column.startswith('Topic')
]
df_topic_only = df_field_distribution.loc[:, topic_columns].copy()

# Step 2: Cosine similarity between the rows (fields) of the topic table
cosine_sim_matrix = cosine_similarity(df_topic_only)

# Step 3: Label both axes of the similarity matrix with the field names
df_cosine_similarity = pd.DataFrame(
    data=cosine_sim_matrix,
    index=df_field_distribution['Campo_analisi'],
    columns=df_field_distribution['Campo_analisi'],
)

# Display the cosine similarity matrix
df_cosine_similarity

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained BERT model and tokenizer.
# The encoder is bound to `bert_model` rather than `model`, so it does not
# overwrite the trained LDA model that the topic-model cells refer to as `model`.
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)


# Function to compute the BERT embeddings for a text
def get_bert_embedding(text):
    """Return the embedding of ``text`` as a NumPy array.

    The text is tokenized with truncation at 512 tokens and run through
    ``bert_model`` without gradient tracking. The vector at position 0 of the
    last hidden state (the [CLS] token) is used as the text embedding.
    """
    # Tokenize and encode the input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Use the [CLS] token's representation as the sentence embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()


# Embed the 'original_text_partial' column of every document
combined_texts_preprocessed['bert_embedding'] = combined_texts_preprocessed['original_text_partial'].apply(get_bert_embedding)

# Mean embedding per 'Campo_analisi': stack each group's vectors into a 2-D
# array and average over documents, instead of relying on a reduction over an
# object-dtype column of arrays.
campo_analisi_embeddings = combined_texts_preprocessed.groupby('Campo_analisi')['bert_embedding'].apply(
    lambda vectors: np.mean(np.stack(vectors.to_list()), axis=0)
)

# Convert to a DataFrame for cosine similarity computation
campo_analisi_embeddings_df = pd.DataFrame(campo_analisi_embeddings.tolist(), index=campo_analisi_embeddings.index)

# Compute cosine similarity
cosine_sim_matrix = cosine_similarity(campo_analisi_embeddings_df)

# Convert to DataFrame for better readability
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=campo_analisi_embeddings.index, columns=campo_analisi_embeddings.index)

# Display the cosine similarity matrix
cosine_sim_df

In [None]:
from scipy.spatial.distance import pdist, squareform

# Step 1: Rescale an embedding so it resembles a probability distribution
def normalize_embeddings(embedding):
    """Return the absolute values of ``embedding`` divided by their L1 norm."""
    magnitudes = np.abs(embedding)  # drop the signs so every entry is non-negative
    l1_norm = np.linalg.norm(magnitudes, ord=1)
    return magnitudes / l1_norm

# Step 2: Hellinger distance between two probability distributions
def hellinger_distance(p, q):
    """Return ``sqrt(0.5 * sum((sqrt(p) - sqrt(q)) ** 2))`` for ``p`` and ``q``."""
    root_gap = np.sqrt(p) - np.sqrt(q)
    half_squared_sum = 0.5 * np.sum(root_gap ** 2)
    return np.sqrt(half_squared_sum)

# Step 3: Apply the normalization to the mean embedding of every campo_analisi
campo_analisi_normalized_embeddings = campo_analisi_embeddings.map(normalize_embeddings)

# One row per field, one column per embedding dimension
campo_analisi_normalized_embeddings_df = pd.DataFrame(
    list(campo_analisi_normalized_embeddings),
    index=campo_analisi_normalized_embeddings.index,
)

# Step 4: Hellinger distance for every pair of fields, in condensed form
pairwise_hellinger_distances = pdist(
    campo_analisi_normalized_embeddings_df.to_numpy(), metric=hellinger_distance
)

# Expand the condensed distances into a square matrix
hellinger_distance_matrix = squareform(pairwise_hellinger_distances)

# Step 5: Label rows and columns with the field names
hellinger_distance_df = pd.DataFrame(
    hellinger_distance_matrix,
    index=campo_analisi_normalized_embeddings.index,
    columns=campo_analisi_normalized_embeddings.index,
)

# Display the Hellinger distance matrix
print("Mean Pairwise Hellinger Distance Between Fields:")
hellinger_distance_df

In [None]:
# Similarity is defined here as one minus the Hellinger distance
hellinger_similarity_matrix = hellinger_distance_df.rsub(1)

# Show the resulting similarity table
print("Pairwise Similarity Between Fields (1 - Hellinger Distance):")
hellinger_similarity_matrix

In [None]:
# The element-wise mean below works on raw arrays, so the three matrices must
# have the same shape and list the fields in the same order. Fail with a clear
# message here instead of a NameError on `mean_matrix_df` further down.
if not (cosine_sim_df.shape == hellinger_similarity_matrix.shape == df_cosine_similarity.shape):
    raise ValueError(
        "Similarity matrices must share one shape, got "
        f"{cosine_sim_df.shape}, {hellinger_similarity_matrix.shape}, {df_cosine_similarity.shape}"
    )
if not (
    list(cosine_sim_df.index) == list(hellinger_similarity_matrix.index) == list(df_cosine_similarity.index)
):
    raise ValueError("Similarity matrices must list the fields in the same order")

# Convert DataFrames to NumPy arrays for computation
cosine_sim_array = cosine_sim_df.values
hellinger_similarity_matrix_array = hellinger_similarity_matrix.values
df_cosine_similarity_array = df_cosine_similarity.values

# Compute the element-wise mean of the three matrices
mean_matrix_array = (cosine_sim_array + hellinger_similarity_matrix_array + df_cosine_similarity_array) / 3

# Convert back to DataFrame, labelled like the BERT cosine-similarity matrix
mean_matrix_df = pd.DataFrame(mean_matrix_array, index=cosine_sim_df.index, columns=cosine_sim_df.columns)

# Display the resulting mean matrix
print("Mean Matrix:")
mean_matrix_df

In [None]:
# Distances below this value are treated as exactly zero
threshold = 1e-7

# Distance is the complement of the averaged similarity
distance_matrix_array = 1 - mean_matrix_array

# Every field is at distance zero from itself
np.fill_diagonal(distance_matrix_array, 0)

# Clamp everything below the threshold, negatives included, to zero
distance_matrix_array = np.where(distance_matrix_array < threshold, 0, distance_matrix_array)

# Label rows and columns like the mean similarity matrix
distance_matrix_df = pd.DataFrame(
    distance_matrix_array,
    index=mean_matrix_df.index,
    columns=mean_matrix_df.columns,
)

# Display the corrected distance matrix
print("Corrected Distance Matrix:")
distance_matrix_df


In [None]:
# NOTE: this cell recomputes the same distance matrix as the preceding cell.

# Cut-off under which a distance counts as zero
threshold = 1e-7

# Turn the averaged similarity into a distance (1 - similarity)
distance_matrix_array = np.subtract(1, mean_matrix_array)

# Force the self-distances on the diagonal to exactly zero
np.fill_diagonal(distance_matrix_array, 0)

# Zero out tiny positive values and any negative values
below_threshold = distance_matrix_array < threshold
distance_matrix_array[below_threshold] = 0

# Wrap the array with the field labels of the mean similarity matrix
distance_matrix_df = pd.DataFrame(
    distance_matrix_array,
    index=mean_matrix_df.index,
    columns=mean_matrix_df.columns,
)

# Display the corrected distance matrix
print("Corrected Distance Matrix:")
distance_matrix_df


In [None]:
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform
from sklearn.metrics import silhouette_score

num_clusters = 3  # Choose the appropriate number of clusters

# NOTE(review): no cell in this notebook builds `linkage_matrix`, so it is
# computed here from the distance matrix. The linkage method is an assumption
# ('average' is valid for arbitrary precomputed distances); confirm it matches
# the method used in the original analysis.
LINKAGE_METHOD = 'average'

# Work on a private copy so the DataFrame is not modified through `.values`,
# and make sure the diagonal is exactly zero as the precomputed metric requires.
distances = distance_matrix_df.to_numpy(dtype=float, copy=True)
np.fill_diagonal(distances, 0)

# linkage expects condensed distances; averaging with the transpose removes any
# floating-point asymmetry before the conversion.
condensed_distances = squareform((distances + distances.T) / 2, checks=False)
linkage_matrix = linkage(condensed_distances, method=LINKAGE_METHOD)
cluster_labels = fcluster(linkage_matrix, num_clusters, criterion='maxclust')

# Now, calculate the Silhouette Score on the precomputed distances
silhouette_avg = silhouette_score(distances, cluster_labels, metric='precomputed')

# Print the Silhouette Score
print(f'Silhouette Score for {num_clusters} clusters: {silhouette_avg}')


In [None]:

# Raw arrays behind the three similarity matrices and their mean
hellinger_array = hellinger_similarity_matrix.to_numpy()
df_cosine_array = df_cosine_similarity.to_numpy()
cosine_array = cosine_sim_df.to_numpy()
mean_array = mean_matrix_df.to_numpy()

# Matrix dimensions, taken from the Hellinger array (all four are assumed to match)
rows, cols = hellinger_array.shape

# Object array with two slots per cell: the three individual values and their mean
combined_matrix = np.empty((rows, cols, 2), dtype=object)

# Display table with the same labels as the mean matrix
combined_df = pd.DataFrame(index=mean_matrix_df.index, columns=mean_matrix_df.columns)

for i, j in np.ndindex(rows, cols):
    measures = (hellinger_array[i, j], df_cosine_array[i, j], cosine_array[i, j])

    # Unrounded values go into the object array
    combined_matrix[i, j, 0] = measures
    combined_matrix[i, j, 1] = mean_array[i, j]

    if i == j:
        # Diagonal cells hold a pair of empty strings
        combined_df.iat[i, j] = ('', '')
    else:
        # Off-diagonal cells hold ((three values), mean), rounded to 2 decimals
        combined_df.iat[i, j] = (
            tuple(round(value, 2) for value in measures),
            round(mean_array[i, j], 2),
        )

# Display the combined DataFrame
print(combined_df)
