In [14]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords


from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

nltk.download("stopwords")
nltk.download("punkt")

SEED = 42
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens

In [7]:
df_raw = pd.read_json("/Users/daniel/Desktop/STATAPP/Statap-panels/temp_dataset.json")
df = df_raw.copy()
for col in text_columns:
    df[col] = df[col].astype(str).fillna("")

# Combine all text fields into a single text column
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)
df.head()

Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text
0,"[{'role': 'user', 'content': 'What are the mai...",What other sectors are they involved in?,What other sectors is Bouygues involved in bes...,"Besides construction, real estate, media, and ...","Besides construction and real estate, Bouygues...","[{'role': 'user', 'content': 'What are the mai..."
1,"[{'role': 'user', 'content': 'What is employee...",How did it help against Vincent Bolloré's atte...,How did employee shareholding at Bouygues help...,"In 1998, Bouygues' employee shareholding helpe...",The employee shareholding helped counter Vince...,"[{'role': 'user', 'content': 'What is employee..."
2,"[{'role': 'user', 'content': 'Where is the hea...",And what about Bouygues Construction?,Where is the headquarters of Bouygues Construc...,"The headquarters of Bouygues Construction, kno...","The headquarters of Bouygues Construction, kno...","[{'role': 'user', 'content': 'Where is the hea..."
3,"[{'role': 'user', 'content': 'What happens whe...",How much money does this generate in France?,How much money do search engines earn from spo...,Search engines in France earn approximately 2....,"In France, this generates about 2.4 billion eu...","[{'role': 'user', 'content': 'What happens whe..."
4,"[{'role': 'user', 'content': 'How can I instal...",Is there any cost involved?,Is there any cost involved in installing Lilo ...,Not enough information is available about the ...,"No, using Lilo in a business setting is simple...","[{'role': 'user', 'content': 'How can I instal..."


In [15]:
# Define stopwords (use sklearn's built-in stopwords)
custom_stopwords = set(ENGLISH_STOP_WORDS) | {"history", "question", "query", "answer"}

# Tokenization and cleaning function (without NLTK)
def clean_and_tokenize(text):
    tokens = text.lower().split()  # Lowercase & split on whitespace
    tokens = [word for word in tokens if word.isalnum() and word not in custom_stopwords]  # Remove punctuation & stopwords
    return tokens

# Apply tokenization without using nltk
df["tokens"] = df["text"].astype(str).apply(clean_and_tokenize)

# Print dataset sizes
print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

Original dataframe: (144, 5)
Pre-processed dataframe: (144, 7)


In [None]:
w2v_model = Word2Vec(sentences=df["tokens"], vector_size=100, window=5, min_count=2, workers=4)

# Convert documents to feature vectors by averaging word embeddings
def document_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Create document embeddings
df["doc_vector"] = df["tokens"].map(lambda x: document_vector(x, w2v_model))

# Stack document vectors into a matrix
X = np.vstack(df["doc_vector"].values)

# Apply K-Means clustering
num_clusters = 3 # Adjust based on your dataset
kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(X)

# Print results
print(df[["text", "cluster"]])

# Save clustered dataset
df.to_csv("clustered_data.csv", index=False)


                                                  text  cluster
0    [{'role': 'user', 'content': 'What are the mai...        9
1    [{'role': 'user', 'content': 'What is employee...        6
2    [{'role': 'user', 'content': 'Where is the hea...        0
3    [{'role': 'user', 'content': 'What happens whe...        9
4    [{'role': 'user', 'content': 'How can I instal...        9
..                                                 ...      ...
139  [{'role': 'user', 'content': 'What is the main...        9
140  [{'role': 'user', 'content': 'What actions hav...        0
141  [{'role': 'user', 'content': 'What does Lyon A...        0
142  [{'role': 'user', 'content': "What is Lyon Aér...        0
143  [{'role': 'user', 'content': "What is Lyon air...        9

[144 rows x 2 columns]


In [24]:
# Sort the dataframe by cluster labels
df_sorted = df.sort_values(by="cluster")

# Display a few sample rows from each cluster
for cluster in range(num_clusters):
    print(f"Cluster {cluster}:")
    display(df_sorted[df_sorted["cluster"] == cluster].head(5))
    print("\n")

Cluster 0:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
98,"[{'role': 'user', 'content': 'What should I do...",And if my baggage is damaged?,What should I do if my baggage is damaged at L...,"If your baggage is damaged at Lyon Airport, re...","If your baggage is damaged, you should seek in...","[{'role': 'user', 'content': 'What should I do...","[lose, item, lyon, lose, item, lyon, contact, ...","[3.424577e-05, 0.0043963077, 0.0009705579, -0....",0
100,"[{'role': 'user', 'content': 'Is this service ...",Where is it located?,Where is the service located?,I am unable to provide the location of the ser...,The service is located outside.,"[{'role': 'user', 'content': 'Is this service ...","[service, service, service, unable, provide, l...","[0.0012432816, 0.0045177033, 0.0021609832, -2....",0
91,"[{'role': 'user', 'content': 'How can I get fr...",What about traveling to the Alps?,What are the travel options from Lyon Airport ...,"From Lyon Airport, you can travel to the Alps ...",You can travel to the Alps from Lyon Airport w...,"[{'role': 'user', 'content': 'How can I get fr...","[lyon, airport, city, connects, lyon, center, ...","[0.0008213088, 0.0018248743, 0.0004122288, 0.0...",0
95,"[{'role': 'user', 'content': 'Who manages the ...",Where is their office located?,Where is Alyzia's office for Easyjet baggage s...,Alyzia's office for Easyjet baggage service is...,Alyzia's office is located in the baggage clai...,"[{'role': 'user', 'content': 'Who manages the ...","[manages, baggage, service, baggage, service, ...","[0.0016431212, 0.0035506093, -0.00038029862, -...",0
93,"[{'role': 'user', 'content': 'Where is the Air...",What about the other airlines?,What is the location of baggage services for o...,The baggage service for other airlines is mana...,"For Corsair, Easyjet, Royal Air Maroc, Royal J...","[{'role': 'user', 'content': 'Where is the Air...","[air, france, baggage, service, air, france, b...","[0.0016661871, 0.0027206542, 0.0011027243, -0....",0




Cluster 1:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
82,"[{'role': 'user', 'content': ""What was the rol...",How many ministerial departments were there du...,How many ministerial departments existed durin...,"During the French Revolution, there were gener...","During the Revolution, there were generally si...","[{'role': 'user', 'content': ""What was the rol...","[role, ministers, executors, similar, predeces...","[0.0004062135, 0.0012135122, -0.00035362944, 0...",1
83,"[{'role': 'user', 'content': 'What was the Fre...",Where was the general headquarters established?,Where was the French army's general headquarte...,The French army's general headquarters was est...,Napoleon established his general headquarters ...,"[{'role': 'user', 'content': 'What was the Fre...","[french, army, preparing, french, army, prepar...","[0.00023241593, 0.0024841188, -0.0003751751, -...",1
74,"[{'role': 'user', 'content': 'What did the emp...",What did he order next?,What did the emperor order after spreading the...,Not enough information is available to answer ...,He ordered nine battalions of the imperial gua...,"[{'role': 'user', 'content': 'What did the emp...","[did, emperor, motivate, emperor, spread, rumo...","[0.0016922804, 0.00040191793, 0.0015649986, 0....",1
22,"[{'role': 'user', 'content': 'When was COVID-1...",How did the EU manage to overcome it?,How did the European Union manage to overcome ...,The European Union overcame COVID-19 through c...,The EU successfully overcame the COVID-19 pand...,"[{'role': 'user', 'content': 'When was COVID-1...","[declared, longer, global, health, 5, declared...","[-0.00072067423, 0.00068698835, 0.00018260661,...",1
72,"[{'role': 'user', 'content': 'Who is mentioned...",What did he lose?,What did Napoléon lose?,"Napoléon lost his empire, his dynasty, and his...",Napoléon lost the three scrutins organized dur...,"[{'role': 'user', 'content': 'Who is mentioned...","[mentioned, mentioned, did, did, napoléon, nap...","[-0.0012891497, -0.0036646966, 0.00062188256, ...",1




Cluster 2:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
73,"[{'role': 'user', 'content': 'Where did Wellin...",Where was the rendezvous with Blücher?,Where was the rendezvous between Wellington an...,The rendezvous between Wellington and Blücher ...,The rendezvous with Blücher was at Mont-Saint-...,"[{'role': 'user', 'content': 'Where did Wellin...","[did, wellington, spend, night, 16th, spent, n...","[-0.00011092736, 0.00030861696, 0.0010849738, ...",2
114,"[{'role': 'user', 'content': 'What is done to ...",How often are they done?,How often are water quality analyses conducted?,Not enough information is available on how oft...,The analyses are done regularly to ensure the ...,"[{'role': 'user', 'content': 'What is done to ...","[ensure, water, waters, subject, regular, anal...","[-0.0019102903, 0.00033871067, 0.002851748, 0....",2
62,"[{'role': 'user', 'content': 'Where is the Par...",Who is the head of state in Austria?,Who is the head of state of Austria?,The head of state of Austria is the president.,The president of Austria is the head of state.,"[{'role': 'user', 'content': 'Where is the Par...","[parliament, austria, parliament, austria, loc...","[-0.0010203347, -0.0007945326, -0.0020802882, ...",2
102,"[{'role': 'user', 'content': 'What can you tel...",What can you do there?,"What activities are available in Oslo, Norway?","I'm sorry, I don't have information on activit...","In Oslo, you can walk along the beautiful Karl...","[{'role': 'user', 'content': 'What can you tel...","[tell, capital, modernity, blends, harmoniousl...","[-0.0012343333, 0.0027669105, -0.00042199864, ...",2




Cluster 3:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
20,"[{'role': 'user', 'content': 'Why is research ...",What does the EU aim to create for businesses?,What does the EU aim to create for businesses ...,The EU aims to create a business-friendly mark...,The EU aims to create a business-friendly envi...,"[{'role': 'user', 'content': 'Why is research ...","[research, innovation, important, innovation, ...","[-0.0020379203, 0.0021515177, -0.0028335708, 0...",3
30,"[{'role': 'user', 'content': 'How many EU coun...",Which countries are they?,Which EU countries use the euro?,The EU countries that use the euro are Austria...,"The countries using the euro are Austria, Belg...","[{'role': 'user', 'content': 'How many EU coun...","[eu, countries, use, 20, 27, eu, member, count...","[-0.0021190848, 0.0057171024, 0.0006055456, 0....",3
31,"[{'role': 'user', 'content': 'Are all EU count...",Do they all use the euro?,Do all EU countries in the Economic and Moneta...,"No, not all EU countries in the Economic and M...","No, only 20 of the EU countries have replaced ...","[{'role': 'user', 'content': 'Are all EU count...","[eu, countries, economic, monetary, eu, countr...","[0.0004084427, 0.0048963246, 0.0001506656, -0....",3
13,"[{'role': 'user', 'content': 'What has the EU ...",What percentage of internet users shop online?,What percentage of internet users in the EU sh...,"In the European Union, 74% of internet users a...",74% of internet users aged 16 to 74 in the EU ...,"[{'role': 'user', 'content': 'What has the EU ...","[eu, protect, individual, rights, eu, taken, s...","[-0.0022266307, 0.0014406458, 0.0013517067, -0...",3
18,"[{'role': 'user', 'content': 'What does EU pol...",What is crucial for Europe's economic prosperity?,What is crucial for Europe's economic prosperi...,"According to EU policy, a strong industrial ba...",Europe's long-term economic prosperity will de...,"[{'role': 'user', 'content': 'What does EU pol...","[does, eu, policy, aim, achieve, business, pol...","[0.00012732527, 0.0011787893, -0.0015442024, -...",3




Cluster 4:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
125,"[{'role': 'user', 'content': 'What is the focu...",How many neighborhoods are involved?,How many neighborhoods in Lyon are involved in...,"In Lyon, 18 neighborhoods are involved in urba...",There are 18 neighborhoods in 7 districts of L...,"[{'role': 'user', 'content': 'What is the focu...","[focus, urban, policy, urban, policy, lyon, fo...","[-0.0015331163, 0.0037142036, -0.00024300917, ...",4




Cluster 5:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
63,"[{'role': 'user', 'content': 'What is the offi...",What is its capital?,What is the capital of the Kingdom of Belgium?,The capital of the Kingdom of Belgium is Bruss...,The capital of Belgium is Brussels.,"[{'role': 'user', 'content': 'What is the offi...","[official, official, belgium, kingdom, capital...","[0.0009106346, 0.0072047743, 0.0032897806, -0....",5
28,"[{'role': 'user', 'content': 'What are the ben...",How does it affect businesses?,How does the euro affect businesses?,The euro benefits businesses by saving costs a...,"The euro makes it easier, cheaper, and safer f...","[{'role': 'user', 'content': 'What are the ben...","[benefits, euro, offers, benefits, economies, ...","[-0.0016184072, 0.0020378316, 0.00023083098, -...",5
29,"[{'role': 'user', 'content': 'What are some be...",How does it help with cooperation?,How does the euro zone economy promote coopera...,The euro zone economy promotes cooperation by ...,The euro is a key mechanism for maximizing the...,"[{'role': 'user', 'content': 'What are some be...","[benefits, euro, zone, euro, zone, economy, be...","[-0.0034273386, 0.0020923198, 0.0031368043, -0...",5
33,"[{'role': 'user', 'content': ""What is the euro...",Which regions use it as an official or de fact...,Which regions outside the EU use the euro as a...,Regions outside the EU using the euro as an of...,"Regions like the Azores, Madeira, Canary Islan...","[{'role': 'user', 'content': ""What is the euro...","[significance, outside, euro, euro, second, im...","[0.0006708014, 0.0033122818, 0.00044479186, -0...",5
32,"[{'role': 'user', 'content': 'Which country ha...",What did they keep?,What currency did Denmark keep after opting ou...,Not enough information is available to answer ...,Denmark kept its former currency.,"[{'role': 'user', 'content': 'Which country ha...","[country, euro, kept, currency, member, did, c...","[-0.0011779668, 0.004747417, 0.004969182, 0.00...",5




Cluster 6:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
1,"[{'role': 'user', 'content': 'What is employee...",How did it help against Vincent Bolloré's atte...,How did employee shareholding at Bouygues help...,"In 1998, Bouygues' employee shareholding helpe...",The employee shareholding helped counter Vince...,"[{'role': 'user', 'content': 'What is employee...","[employee, shareholding, bouygues, developed, ...","[0.0019775215, -2.6104195e-05, 0.0016660707, 0...",6
105,"[{'role': 'user', 'content': 'What can you do ...",What about the surrounding areas?,What Halloween activities are available in the...,"In the areas surrounding Bucharest, you can ve...","For a full experience, you can venture to Tran...","[{'role': 'user', 'content': 'What can you do ...","[bucharest, explore, historic, walk, illuminat...","[-0.00041690658, 0.0011111564, 0.00078685425, ...",6
58,"[{'role': 'user', 'content': 'What was the pol...",How did other Greek city-states compare to Ath...,What was the government structure of other Gre...,"During the Archaic period, many Greek city-sta...","While Athens established a democracy, other Gr...","[{'role': 'user', 'content': 'What was the pol...","[political, structure, athens, archaic, archai...","[-0.0020245332, 0.0007417745, 0.0024070898, 0....",6
57,"[{'role': 'user', 'content': 'Who is known as ...",What did he write about?,What did Herodotus write about?,"Herodotus wrote 'The Histories, ' which provid...",Herodotus wrote about historical figures such ...,"[{'role': 'user', 'content': 'Who is known as ...","[known, father, widely, known, father, did, wr...","[0.0024846033, 0.0011428737, 0.0019468028, 0.0...",6
106,"[{'role': 'user', 'content': 'What makes Edinb...",What can you do there during Halloween?,What activities are available in Edinburgh dur...,"In Edinburgh during Halloween, visitors can ex...","In Edinburgh, you can join guided tours at nig...","[{'role': 'user', 'content': 'What makes Edinb...","[makes, edinburgh, special, capital, brings, h...","[0.00038702274, 0.00055652007, 0.00347425, 0.0...",6




Cluster 7:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
133,"[{'role': 'user', 'content': 'How many passeng...",And how many movements were there?,How many aircraft movements were there at Atla...,Not enough information is available to determi...,"There were 8,109 movements at the Atlantic Cit...","[{'role': 'user', 'content': 'How many passeng...","[passengers, did, atlantic, city, internationa...","[0.0010982389, -0.00047543034, 0.0013788722, -...",7
116,"[{'role': 'user', 'content': 'What is the ener...",What was the initial goal?,What was the initial energy savings goal of th...,The initial energy savings goal of the City of...,The initial goal was to achieve 10% energy sav...,"[{'role': 'user', 'content': 'What is the ener...","[energy, saving, plan, city, energy, saving, p...","[0.0016732686, 0.0020316998, 0.0039005126, -0....",7
132,"[{'role': 'user', 'content': 'Who operates the...",When does the concession end?,When does VINCI Airports' concession for opera...,The operating contract for Atlantic City Inter...,The concession for the Atlantic City Internati...,"[{'role': 'user', 'content': 'Who operates the...","[operates, atlantic, city, international, airp...","[0.0010492854, -0.0014025162, 0.0005625584, -0...",7




Cluster 8:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
129,"[{'role': 'user', 'content': 'Can you tell me ...",What about the airports in Portugal?,What are some airports in Portugal?,Some airports in Portugal include Aéroport de ...,"In Portugal, there are several airports such a...","[{'role': 'user', 'content': 'Can you tell me ...","[tell, airports, airports, including, aéroport...","[-0.004776806, 0.004311443, 0.0027643198, 0.00...",8
134,"[{'role': 'user', 'content': 'Can you tell me ...",What about in Brazil?,What are some airports in Brazil?,"Brazil has several airports, including Aéropor...","In Brazil, there are airports like Tefé Airpor...","[{'role': 'user', 'content': 'Can you tell me ...","[tell, airports, united, airports, united, inc...","[-0.0027654788, 0.0039025743, -0.0013099692, 0...",8
135,"[{'role': 'user', 'content': 'Can you tell me ...",What about airports in Portugal?,What are some airports in Portugal?,"Portugal has several airports, including Aérop...","In Portugal, there are airports like Aéroport ...","[{'role': 'user', 'content': 'Can you tell me ...","[tell, airports, airports, france, aéroport, p...","[-0.0048060925, 0.0047085225, 0.00311829, 0.00...",8




Cluster 9:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
97,"[{'role': 'user', 'content': 'What can I bring...",What if I forget to leave something prohibited...,What happens if prohibited items are found in ...,"If prohibited items are found in luggage, Lyon...",If you forget to leave a prohibited item at ho...,"[{'role': 'user', 'content': 'What can I bring...","[bring, check, dgac, website, information, pro...","[-0.0010920127, 0.0023740272, -0.0024049792, -...",9
96,"[{'role': 'user', 'content': 'What should I be...",Where can I find the estimated wait time?,Where can I find the estimated wait time for a...,You can find the estimated wait time for airpo...,You can find the estimated wait time for secur...,"[{'role': 'user', 'content': 'What should I be...","[aware, regarding, security, checks, aware, de...","[0.0016338062, 0.002085438, 0.0025325192, 0.00...",9
101,"[{'role': 'user', 'content': 'Can I recharge m...",What should I bring to use the charging stations?,What is required to use the electric vehicle c...,To use the electric vehicle charging stations ...,You need to bring your own adapter to use the ...,"[{'role': 'user', 'content': 'Can I recharge m...","[recharge, electric, vehicle, lyon, recharge, ...","[0.0010020903, 0.005290419, -0.00027900992, 0....",9
99,"[{'role': 'user', 'content': 'Is the parking o...",Where is it located?,Where is the parking located?,The parking is located in an exterior zone and...,The parking is located at Parking P3 at Lyon A...,"[{'role': 'user', 'content': 'Is the parking o...","[parking, open, parking, open, 7, days, parkin...","[0.0011880384, 0.0034522908, 0.002829211, 0.00...",9
0,"[{'role': 'user', 'content': 'What are the mai...",What other sectors are they involved in?,What other sectors is Bouygues involved in bes...,"Besides construction, real estate, media, and ...","Besides construction and real estate, Bouygues...","[{'role': 'user', 'content': 'What are the mai...","[main, business, activities, main, business, a...","[-0.0015435533, 0.0016477973, -0.0029323648, -...",9




