# Topic Modeling for Narrative Generation

This notebook is part of a master's thesis project in Digital Interaction Design at Politecnico di Milano, by Federico Denni.

In [76]:
from IPython.display import clear_output
!pip3 install --upgrade pip
!pip install torchaudio==2.2.2
!pip install -U torch torchvision
!pip install bertopic
!pip install huggingface_hub
!pip install -q -U bitsandbytes
!pip install accelerate
!pip install xformers
!pip install adjustText
!pip install ipywidgets
!pip install spacy
!python -m spacy download xx_sent_ud_sm
clear_output()
print("all installed!")

all installed!


In order to use mistral we need to first identify ourselves using the hugginface token

In [77]:
from huggingface_hub import login, logout
from kaggle_secrets import UserSecretsClient  #import HF token from secrets

user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HUGGINFACE_TOKEN")
login(secret_value_0)

Let's now import the other necessary libraries

In [78]:
# Let's load the preprocessing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

Let's now import the csv file

In [79]:
try:
    #Change this directory for other dataset
    #if file not csv, use spaCy and pandas to convert it
    data_research = "/kaggle/input/probes/probe_survey.csv"
    df = pd.read_csv(data_research,encoding="utf-8")
    #extract text column
    text_column = df[['text']]
    text_column.to_csv(r'/kaggle/working/just_text_column.csv',index=False, encoding="utf-8")
    df = pd.read_csv(r'/kaggle/working/just_text_column.csv',encoding="utf-8")
    df.head(10)
    
except FileNotFoundError:
      print("Error: file not found. Please upload the file or provide the correct path.")

We make sure the text that you want load is all in a column called "text", we will also eliminate whitespaces

In [80]:
#remove whitespaces
def remove_whitespaces(text):
    return " ".join(text.split())

df['text'] = df['text'].apply(remove_whitespaces)

# Display the first 12 rows of the updated DataFrame
df.head(50)

Unnamed: 0,text
0,Quello delle carte da gioco è un patrimonio sa...
1,Il gioco della scopa non è nulla senza il best...
2,Ritengo che sia da salvaguardare l’atto del gi...
3,Salvaguardare un patrimonio culturale è sempre...
4,"It is the games, the traditions and the time s..."
5,That’s people should play cards because it bri...
6,"Le persone anziane sono ricche di esperienze, ..."
7,"entrambe le opzioni sono valide, le carte parl..."
8,Che le carte siano una parte importante per la...
9,That’s people should play cards because it bri...


Let's import the llm model (mistral) and hope it doesn't break

In [81]:
from torch import cuda

model_id = 'mistralai/Mistral-7B-Instruct-v0.1'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

print(device)

cuda:0


Let's quantize so we don't burn anymore trees than necessary

In [82]:
from torch import bfloat16
import transformers

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

# Mistral Tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Mistral Model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
model.eval()

print("quantized!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

quantized!


In [83]:
#Let's setup the role and pipeline of the model
# Our text generator
generator = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

Device set to use cuda:0


In [84]:
#Let's now test the model
prompt = "Scrivimi un haiku di 3 linee su una rana in un pozzo, il testo deve essere evocativo e creativo. Per favore, riporta solo il testo del poema"
res = generator(prompt)
print(res[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Scrivimi un haiku di 3 linee su una rana in un pozzo, il testo deve essere evocativo e creativo. Per favore, riporta solo il testo del poema.

La rana nel pozzo
Silenziosamente si muove
Natura's secret


Now let's compose the prompt, so that we instruct the mistral model to do exactly as we want

In [85]:
# System prompt describes information given to all conversations
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics, you follow the instruction closely.
<</SYS>>
"""

In [86]:
# Example prompt demonstrating the output we are looking for, maybe also we could add multiple language examples?
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- La carne, nello specifico il bovino, è il cibo con più alto impatto sull'ambiente.
- Mangiare la carne rappresentava il tuo status sociale di persona più ricca

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic with at most 5 words. Make sure you to only return the label and nothing more.

[/INST] Environmental impacts of eating meat
"""

In [87]:
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic with at most 5 words. Make sure you to only return the label and nothing more.
[/INST]
"""

There are two BERTopic-specific tags that are of interest, namely `[DOCUMENTS]` and `[KEYWORDS]`:

* `[DOCUMENTS]` contain the top 5 most relevant documents to the topic
* `[KEYWORDS]` contain the top 10 most relevant keywords to the topic as generated through c-TF-IDF

This template will be filled accordingly to each topic. And finally, we can combine this into our final prompt:

In [88]:
prompt = system_prompt + example_prompt + main_prompt

---

## BERTopic and Embeddings

let's prepare the embeddings, we will use a multilingual embedding model

In [89]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("BAAI/bge-m3") #multilanguage embedding 
embeddings = embedding_model.encode(df['text'], show_progress_bar=True)

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

In [90]:
from umap import UMAP
from hdbscan import HDBSCAN

umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42) #augment neighbors when you will have the correct amount of data
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 

In [91]:
# Pre-reduce embeddings for visualization purposes
reduced_embeddings = UMAP(n_neighbors=5, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)  #augment neighbors when you will have the correct amount of data

Let's now add additional rapresentations method so that we can have multiple points of view on our subject

In [92]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# KeyBERT
keybert = KeyBERTInspired()

# MMR
mmr = MaximalMarginalRelevance(diversity=0.5) #modify this value to get more different representative_docs

# Text generation with mistral
mistral = TextGeneration(generator, prompt=prompt)

# candidate_topics from research
candidate_topics = [
    'Family and Games', 
    'Rules of Games',
    'Games Collection',
    'Traditional Games',
    'Memories'
]

# All representation models
representation_model = {
    "Keywords": keybert,
    "Labels": mistral,
    "MMR": mmr,
}

Let's now train our topic model with BERTopic

In [93]:
from bertopic import BERTopic

topic_model = BERTopic(

  language="multilingual",
  zeroshot_topic_list = candidate_topics,
  zeroshot_min_similarity = 0.2,
  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(df['text'], embeddings)

#Show topics
freq = topic_model.get_topic_info() 
freq.to_csv(r'/kaggle/working/topics.csv', index=False)
freq.head(11)

2025-02-13 00:19:54,050 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-13 00:19:54,492 - BERTopic - Dimensionality - Completed ✓
2025-02-13 00:19:54,494 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-02-13 00:19:54,555 - BERTopic - Zeroshot Step 1 - Completed ✓
2025-02-13 00:19:54,559 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 20%|██        | 1/5 [00:15<01:03, 15.97s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 40%|████      | 2/5 [00:25<00:37, 12.40s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 60%|██████    | 3/5 [00:31<00:18,  9.31s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 80%|████████  | 4/5 [00:36<00:07,  7.66s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
100%|██████████| 5/5 [00:41<00:00,  8.37s/it]
2025-02-13 00:20:39,970 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Keywords,Labels,MMR,Representative_Docs
0,0,145,0_di_carte_che_in,"[di, carte, che, in, il, non, gioco, le, un, da]","[playing, cards, carte, play, giocare, bar, fa...","[\nSaving the card game heritage, , , , , , , ...","[carte, non, si, con, anche, una, più, famigli...",[ritengo che il gioco di carte sia un patrimon...
1,1,41,1_che_di_carte_le,"[che, di, carte, le, in, il, gioco, un, del, non]","[playing, games, carte, tradizione, interessan...","[Traditional card games as social bonding, , ,...","[carte, del, non, una, si, giochi, ritengo, pa...",[Non saprei come rappresentarlo visivamente; è...
2,2,34,2_che_di_mi_carte,"[che, di, mi, carte, una, le, un, ho, per, la]","[anziane, condividere, esperienze, qualcosa, r...","[Memories of childhood and Italian culture, , ...","[carte, per, delle, non, ogni, me, più, qualco...",[Ho queste carte milanesi nello zaino (e in og...
3,3,25,3_il_the_non_le,"[il, the, non, le, si, che, la, sempre, del, c...","[game, rules, osservando, partita, bisogna, lo...","[Respectful card game tips, , , , , , , , , ]","[the, non, sempre, bluff, carta, rules, bisogn...",[Durante le partite qualche giocatore se ne es...
4,4,5,4_uno_tutte_carte_senza,"[uno, tutte, carte, senza, per, di, trattate, ...","[game, , , accumulare, changer, community, car...","[Collectible card games and community, , , , ,...","[carte, unicorns, solitamente, sconnesse, scam...",[Adoro portare caos durante le partite di Unst...


In [94]:
topic_model.get_topic(1, full=True)["Keywords"] #let's check some of the keywords of the second item in the list

[('playing', 0.5887516),
 ('games', 0.5326551),
 ('carte', 0.5223361),
 ('tradizione', 0.5206987),
 ('interessante', 0.5173243),
 ('giocare', 0.5112589),
 ('carta', 0.48521823),
 ('diverse', 0.47723716),
 ('tradizionali', 0.4758815),
 ('interessanti', 0.47557938)]

In [95]:
mistral_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Labels"].values()]

mistral_labels = [label[0][0].strip().split("\n")[0] for label in topic_model.get_topics(full=True)["Labels"].values()]
topic_model.set_topic_labels(mistral_labels)

print(mistral_labels)    
topic_model.set_topic_labels(mistral_labels)

['Saving the card game heritage', 'Traditional card games as social bonding', 'Memories of childhood and Italian culture', 'Respectful card game tips', 'Collectible card games and community']


---

### Visualize

Let's now visualize our topics (hoping that it works with this dataset)

In [96]:
from scipy.cluster import hierarchy as sch

linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)

# Extract hierarchical topics and their representations.
# A dataframe that contains a hierarchy of topics represented by their parents and their children.
hierarchical_topics: pd.DataFrame = topic_model.hierarchical_topics(df['text'], linkage_function=linkage_function)

fig = topic_model.visualize_hierarchy(
    # 'str' the orientation of the figure. Either 'left' or 'bottom'.
    orientation='left',

    # 'list[int]' a selection of topics to visualize.
    topics=None,

    # 'int' only select the top n most frequent topics to visualize.
    top_n_topics=None,

    # 'pd.DataFrame' a dataframe that contains a hierarchy of topics represented by their
    # parents and their children.
    # NOTE: The hierarchical topic names are only visualized if both 'topics' and 'top_n_topics' are not set.
    hierarchical_topics=hierarchical_topics,

    # Whether to use custom topic labels that were defined using 'topic_model.set_topic_labels'.
    custom_labels=True,

    width=1200,
    height=1000,
)

fig.update_layout(
    # Adjust left, right, top, bottom margin of the overall figure.
    margin=dict(l=20, r=20, t=60, b=20),

    title={
        'text': "Hierarchical structure of the topics",
        'y':0.975,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(
            size=22,
            color="#000000"
        )
    },
)

fig.show()

100%|██████████| 4/4 [00:00<00:00, 278.90it/s]


In [97]:
#topic_model.visualize_barchart()

import plotly.express as px

top_n_topics = 9
n_words = 5

# 'fig_barchart' is a plotly figure.
fig_barchart = topic_model.visualize_barchart(
    top_n_topics = top_n_topics,  # Only select the top n most frequent topics.
    n_words = n_words,            # Number of words to show in a topic.
    custom_labels=True,          # Whether to use custom topic labels that were defined using topic_model.set_topic_labels.
    title=f"Top {top_n_topics} topics visualized by the frequency of the top {n_words} words",
    width=300,
    height=300,
)

fig_barchart.update_layout(
    # Adjust left, right, top, bottom margin of the overall figure.
    margin=dict(l=20, r=50, t=80, b=20),

    plot_bgcolor='rgba(0,0,0,0)',         # Set background color (transparent in this example).

    title={
        'text': f"Top {top_n_topics} topics visualized by the frequency of the top {n_words} words/phrases",
        'y':0.975,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(
            family="Sans Serif",
            size=24,
            color="#000000"
        )
    },

    font=dict(
        family="Roboto",
        size=10,
        color="#000000"
    ),
)

color_sequence = px.colors.qualitative.Vivid  # Choose a color sequence.
fig_barchart.update_traces(marker_color=color_sequence)

# Show the updated figure
fig_barchart.show()

In [98]:
topic_model.visualize_heatmap(custom_labels=True)

In [99]:
df = pd.read_csv(data_research, encoding="utf-8")

titles = df['text']
documents = topic_model.visualize_documents(
    titles, 
    reduced_embeddings=reduced_embeddings,
    title=f"Topics visualized",
    hide_annotations=True, 
    hide_document_hover=False, 
    custom_labels=True,
)

documents.update_layout(
    # Adjust left, right, top, bottom margin of the overall figure.
    margin=dict(l=20, r=20, t=60, b=20),

    title={
        'text': "Topics distribution",
        'y':0.975,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(
            size=22,
            color="#000000"
        )
    },
    
    colorway=px.colors.qualitative.G10,

)

documents.write_html(r'/kaggle/working/playing_cards_topic.html')
plt = documents

plt.show()

In [100]:
topic_model.visualize_term_rank(custom_labels=True)

In [101]:
topic_model.visualize_topics(custom_labels=True)

In [102]:
tree=topic_model.get_topic_tree(hierarchical_topics)
print(tree)

.
├─■──uno_tutte_carte_senza_per ── Topic: 4
└─di_che_carte_il_in
     ├─di_carte_che_in_il
     │    ├─■──che_di_mi_carte_una ── Topic: 2
     │    └─di_carte_che_in_il
     │         ├─■──di_carte_che_in_il ── Topic: 0
     │         └─■──che_di_carte_le_in ── Topic: 1
     └─■──il_the_non_le_si ── Topic: 3



---

# Narreme Generation

The narremes are the basic output of nouns (as subjects, characters, concepts, objects...), adjectives (emotional, sentimental values), and verbs (relationships, actions...). By putting these together we obtained a narreme that is tagged as marginal (could be a description of an environment) or central (protagonist actions), depending on the role in the narrative.

## Clean the csv

In [103]:
try:
    #Change this directory for other dataset
    df = pd.read_csv(r'/kaggle/working/topics.csv', encoding="utf-8")
    
    #leave CustomName, Counts, Keywords, Update Topic from -1 to 0 and took off the rest
    # Define the columns to keep
    columns_to_keep = ['Representative_Docs', 'Name', 'Labels','Count', 'Keywords', 'Topic']
    
    # Drop the columns that are not in the list of columns to keep
    df = df[columns_to_keep]
    
    # Update the 'Topic' column to start indexing from 0
    df['Topic'] = df['Topic'] - df['Topic'].min()

except FileNotFoundError:
      print("Error: file not found. Please upload the file or provide the correct path.")

df.head(10)

Unnamed: 0,Representative_Docs,Name,Labels,Count,Keywords,Topic
0,"[""ritengo che il gioco di carte sia un patrimo...",0_di_carte_che_in,"['\nSaving the card game heritage', '', '', ''...",145,"['playing', 'cards', 'carte', 'play', 'giocare...",0
1,['Non saprei come rappresentarlo visivamente; ...,1_che_di_carte_le,"['Traditional card games as social bonding', '...",41,"['playing', 'games', 'carte', 'tradizione', 'i...",1
2,['Ho queste carte milanesi nello zaino (e in o...,2_che_di_mi_carte,"['Memories of childhood and Italian culture', ...",34,"['anziane', 'condividere', 'esperienze', 'qual...",2
3,['Durante le partite qualche giocatore se ne e...,3_il_the_non_le,"['Respectful card game tips', '', '', '', '', ...",25,"['game', 'rules', 'osservando', 'partita', 'bi...",3
4,['Adoro portare caos durante le partite di Uns...,4_uno_tutte_carte_senza,"['Collectible card games and community', '', '...",5,"['game', '', '', 'accumulare', 'changer', 'com...",4


In [104]:
import ast

# Ensure the column name is correct
topics_docs = 'Representative_Docs'

# Check if the column exists
if topics_docs in df.columns:
    # Function to safely evaluate strings containing lists
    def safe_eval(val):
        try:
            return ast.literal_eval(val)
        except:
            return []

    # Apply safe_eval function to 'Representative_Docs' column
    df[topics_docs] = df[topics_docs].apply(safe_eval)

    # Split the 'Representative_Docs' column into three new columns
    df[['docs_1', 'docs_2', 'docs_3']] = pd.DataFrame(df[topics_docs].tolist(), index=df.index)

    # Drop the original 'Representative_Docs' column if you no longer need it
    df.drop(columns=[topics_docs], inplace=True)

    # Display the DataFrame
    #Let's clean the label output
    def process_labels(label):
        if isinstance(label, str):
            label = eval(label)  # Convert string representation of list to actual list
            return [lbl.strip().split("\n")[0] for lbl in label if lbl]
                
        return label
    
    df['Labels'] = df['Labels'].apply(process_labels)
    
    #Let's convert the labels to string
    df['Labels'] = [' '.join(map(str, l)) for l in df['Labels']]
    df['Labels']=df.text.str.replace(r'\b(\w{1,3})\b', '')

else:
    print(f"Column '{topics_docs}' does not exist. Available columns are:", df.columns)

AttributeError: 'DataFrame' object has no attribute 'text'

In [None]:
# Save the updated DataFrame to a new CSV file
df.to_csv(r'/kaggle/working/split_docs.csv', index=False)
df.head(10)

## Sentiment Analysis with RoBERTa

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment" #multi-lingual process
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
import math

def normalize(score, alpha=15):
    """
    Normalize the score to be between -1 and 1 using an alpha that
    approximates the max expected value.
    """
    norm_score = score / math.sqrt((score * score) + alpha)
    return norm_score

def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    
    # Calculate sum of sentiment scores
    sum_s = scores[2] - scores[0]

    # Normalize the compound score
    compound_score = normalize(sum_s)
    
    scores_dict = {
        'Negative' : scores[0],
        'Neutral' : scores[1],
        'Positive' : scores[2], #add compound
        'Compound': compound_score  # Add normalized compound score
    }
    return scores_dict

In [None]:
from tqdm import tqdm

df = pd.read_csv(r'/kaggle/working/split_docs.csv', encoding="utf-8")

# Iterate over each row in the DataFrame
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        # Iterate over the three document columns
        for doc_col in ['docs_1', 'docs_2', 'docs_3']:
            text = row[doc_col]
            if pd.notnull(text):
                scores = polarity_scores_roberta(text)
                
                # Determine the sentiment with the highest score (excluding 'Compound')
                sentiment_label = max(['Negative', 'Neutral', 'Positive'], key=lambda k: scores[k])
                
                # Add sentiment label and compound score to the DataFrame
                df.at[i, f'{doc_col} Sentiment'] = sentiment_label
                df.at[i, f'{doc_col} Compound Score'] = scores['Compound']
                
    except RuntimeError:
        print(f'Broke for index {i}')

df.to_csv(r'/kaggle/working/ready_data.csv',index=False, encoding="utf-8")
df.head(10)

In [None]:
# Drop unnecessary columns for visualization
pair_viz = df.drop(columns=['Count', 'docs_1 Sentiment', 'docs_2 Sentiment', 'docs_3 Sentiment'])

# Create the pairplot
pairplot = sns.pairplot(data=pair_viz, hue='Topic', palette='tab10')

# Add KDE plots to the diagonal subplots with differentiation by color for each topic
for ax in pairplot.diag_axes:
    sns.kdeplot(data=pair_viz, x=ax.get_xlabel(), hue='Topic', ax=ax, fill=True, alpha=.5, linewidth=0)
    
# Save the plot as an image
pairplot.savefig(r'/kaggle/working/pairplot_visualization.png')

# Display the plot
pairplot.show()