In [14]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords


from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

nltk.download("stopwords")
nltk.download("punkt")

SEED = 42
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens

In [7]:
df_raw = pd.read_json("/Users/daniel/Desktop/STATAPP/Statap-panels/temp_dataset.json")
df = df_raw.copy()
for col in text_columns:
    df[col] = df[col].astype(str).fillna("")

# Combine all text fields into a single text column
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)
df.head()

Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text
0,"[{'role': 'user', 'content': 'What are the mai...",What other sectors are they involved in?,What other sectors is Bouygues involved in bes...,"Besides construction, real estate, media, and ...","Besides construction and real estate, Bouygues...","[{'role': 'user', 'content': 'What are the mai..."
1,"[{'role': 'user', 'content': 'What is employee...",How did it help against Vincent Bolloré's atte...,How did employee shareholding at Bouygues help...,"In 1998, Bouygues' employee shareholding helpe...",The employee shareholding helped counter Vince...,"[{'role': 'user', 'content': 'What is employee..."
2,"[{'role': 'user', 'content': 'Where is the hea...",And what about Bouygues Construction?,Where is the headquarters of Bouygues Construc...,"The headquarters of Bouygues Construction, kno...","The headquarters of Bouygues Construction, kno...","[{'role': 'user', 'content': 'Where is the hea..."
3,"[{'role': 'user', 'content': 'What happens whe...",How much money does this generate in France?,How much money do search engines earn from spo...,Search engines in France earn approximately 2....,"In France, this generates about 2.4 billion eu...","[{'role': 'user', 'content': 'What happens whe..."
4,"[{'role': 'user', 'content': 'How can I instal...",Is there any cost involved?,Is there any cost involved in installing Lilo ...,Not enough information is available about the ...,"No, using Lilo in a business setting is simple...","[{'role': 'user', 'content': 'How can I instal..."


In [15]:
# Define stopwords (use sklearn's built-in stopwords)
custom_stopwords = set(ENGLISH_STOP_WORDS) | {"history", "question", "query", "answer"}

# Tokenization and cleaning function (without NLTK)
def clean_and_tokenize(text):
    tokens = text.lower().split()  # Lowercase & split on whitespace
    tokens = [word for word in tokens if word.isalnum() and word not in custom_stopwords]  # Remove punctuation & stopwords
    return tokens

# Apply tokenization without using nltk
df["tokens"] = df["text"].astype(str).apply(clean_and_tokenize)

# Print dataset sizes
print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

Original dataframe: (144, 5)
Pre-processed dataframe: (144, 7)


In [36]:
w2v_model = Word2Vec(sentences=df["tokens"], vector_size=100, window=5, min_count=2, workers=4)

# Convert documents to feature vectors by averaging word embeddings
def document_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Create document embeddings
df["doc_vector"] = df["tokens"].map(lambda x: document_vector(x, w2v_model))

# Stack document vectors into a matrix
X = np.vstack(df["doc_vector"].values)

# Apply K-Means clustering
num_clusters = 5 # Number of clusters
kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(X)

# Print results
print(df[["text", "cluster"]])



                                                  text  cluster
0    [{'role': 'user', 'content': 'What are the mai...        1
1    [{'role': 'user', 'content': 'What is employee...        0
2    [{'role': 'user', 'content': 'Where is the hea...        1
3    [{'role': 'user', 'content': 'What happens whe...        0
4    [{'role': 'user', 'content': 'How can I instal...        4
..                                                 ...      ...
139  [{'role': 'user', 'content': 'What is the main...        1
140  [{'role': 'user', 'content': 'What actions hav...        1
141  [{'role': 'user', 'content': 'What does Lyon A...        1
142  [{'role': 'user', 'content': "What is Lyon Aér...        1
143  [{'role': 'user', 'content': "What is Lyon air...        1

[144 rows x 2 columns]


In [37]:
##Example of our clustering

# Sort the dataframe by cluster labels
df_sorted = df.sort_values(by="cluster")

# Display a few sample rows from each cluster
for cluster in range(num_clusters):
    print(f"Cluster {cluster}:")
    display(df_sorted[df_sorted["cluster"] == cluster].head(5))
    print("\n")

Cluster 0:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
85,"[{'role': 'user', 'content': 'What was the ord...",Who held the record for the number of girouettes?,Who held the record for the number of members ...,Not enough information is available to answer ...,The prince of Bienauvent held the record for t...,"[{'role': 'user', 'content': 'What was the ord...","[order, order, girouette, created, le, nain, j...","[-0.0004208272, -0.0012633534, 0.0028575093, 0...",0
120,"[{'role': 'user', 'content': 'What actions are...",What about the simpler actions?,What simpler actions are being taken to addres...,Simpler actions include providing a free infor...,The plan also includes many other simpler acti...,"[{'role': 'user', 'content': 'What actions are...","[actions, taken, address, plan, includes, deve...","[-0.0018059484, 0.0026421517, 0.0007904525, -0...",0
76,"[{'role': 'user', 'content': 'Who was part of ...",Who was supposed to replace the absent general...,Who was supposed to replace the absent general...,The author of the text was supposed to replace...,I was supposed to replace the absent general M...,"[{'role': 'user', 'content': 'Who was part of ...","[provisional, provisional, government, compose...","[1.35186465e-05, 0.0034857558, -0.0006935674, ...",0
36,"[{'role': 'user', 'content': 'What are the pay...",Are there any limitations on cash payments?,Are there any limitations on cash payments at ...,There is no specific information available reg...,"Yes, cash payments are limited to 1000 Euros f...","[{'role': 'user', 'content': 'What are the pay...","[payment, methods, available, galeries, lafaye...","[0.0011299823, 0.0044598384, -0.0024213914, 0....",0
37,"[{'role': 'user', 'content': 'What can I use t...",Where can I leave my car while shopping?,Where can I park my car while shopping at Gale...,I don't have information on parking at Galerie...,You can leave your car in the Galeries Lafayet...,"[{'role': 'user', 'content': 'What can I use t...","[use, make, shopping, easier, galeries, use, n...","[-2.962821e-05, 0.0022653444, -0.0016851396, 0...",0




Cluster 1:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
91,"[{'role': 'user', 'content': 'How can I get fr...",What about traveling to the Alps?,What are the travel options from Lyon Airport ...,"From Lyon Airport, you can travel to the Alps ...",You can travel to the Alps from Lyon Airport w...,"[{'role': 'user', 'content': 'How can I get fr...","[lyon, airport, city, connects, lyon, center, ...","[0.0008213088, 0.0018248743, 0.0004122288, 0.0...",1
99,"[{'role': 'user', 'content': 'Is the parking o...",Where is it located?,Where is the parking located?,The parking is located in an exterior zone and...,The parking is located at Parking P3 at Lyon A...,"[{'role': 'user', 'content': 'Is the parking o...","[parking, open, parking, open, 7, days, parkin...","[0.0011880384, 0.0034522908, 0.002829211, 0.00...",1
88,"[{'role': 'user', 'content': 'What happened to...",Why did Catherine not travel to the baptism?,Why did Catherine not travel to the baptism of...,Catherine did not travel to the baptism becaus...,Catherine did not travel to the baptism becaus...,"[{'role': 'user', 'content': 'What happened to...","[happened, kingdom, westphalia, austria, austr...","[0.0011411353, 0.0010954213, 0.0010203412, -0....",1
101,"[{'role': 'user', 'content': 'Can I recharge m...",What should I bring to use the charging stations?,What is required to use the electric vehicle c...,To use the electric vehicle charging stations ...,You need to bring your own adapter to use the ...,"[{'role': 'user', 'content': 'Can I recharge m...","[recharge, electric, vehicle, lyon, recharge, ...","[0.0010020903, 0.005290419, -0.00027900992, 0....",1
87,"[{'role': 'user', 'content': 'What did Élisa d...",What did she do to stimulate economic activity?,What actions did Élisa take to stimulate econo...,Élisa stimulated economic activity by removing...,Élisa stimulated economic activity by creating...,"[{'role': 'user', 'content': 'What did Élisa d...","[did, élisa, arrived, lucques, arrival, lucque...","[0.0014272716, 0.002243714, 0.00073233014, 0.0...",1




Cluster 2:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
134,"[{'role': 'user', 'content': 'Can you tell me ...",What about in Brazil?,What are some airports in Brazil?,"Brazil has several airports, including Aéropor...","In Brazil, there are airports like Tefé Airpor...","[{'role': 'user', 'content': 'Can you tell me ...","[tell, airports, united, airports, united, inc...","[-0.0027654788, 0.0039025743, -0.0013099692, 0...",2
135,"[{'role': 'user', 'content': 'Can you tell me ...",What about airports in Portugal?,What are some airports in Portugal?,"Portugal has several airports, including Aérop...","In Portugal, there are airports like Aéroport ...","[{'role': 'user', 'content': 'Can you tell me ...","[tell, airports, airports, france, aéroport, p...","[-0.0048060925, 0.0047085225, 0.00311829, 0.00...",2
129,"[{'role': 'user', 'content': 'Can you tell me ...",What about the airports in Portugal?,What are some airports in Portugal?,Some airports in Portugal include Aéroport de ...,"In Portugal, there are several airports such a...","[{'role': 'user', 'content': 'Can you tell me ...","[tell, airports, airports, including, aéroport...","[-0.004776806, 0.004311443, 0.0027643198, 0.00...",2




Cluster 3:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
94,"[{'role': 'user', 'content': 'How can I contac...",And for Avia Partner?,What is the contact number for Avia Partner ba...,You can contact the Avia Partner baggage servi...,You can contact Avia Partner by phone at +33 (...,"[{'role': 'user', 'content': 'How can I contac...","[contact, air, france, baggage, contact, air, ...","[0.0011118106, 0.004495697, 0.0027581975, 0.00...",3
100,"[{'role': 'user', 'content': 'Is this service ...",Where is it located?,Where is the service located?,I am unable to provide the location of the ser...,The service is located outside.,"[{'role': 'user', 'content': 'Is this service ...","[service, service, service, unable, provide, l...","[0.0012432816, 0.0045177033, 0.0021609832, -2....",3
95,"[{'role': 'user', 'content': 'Who manages the ...",Where is their office located?,Where is Alyzia's office for Easyjet baggage s...,Alyzia's office for Easyjet baggage service is...,Alyzia's office is located in the baggage clai...,"[{'role': 'user', 'content': 'Who manages the ...","[manages, baggage, service, baggage, service, ...","[0.0016431212, 0.0035506093, -0.00038029862, -...",3
93,"[{'role': 'user', 'content': 'Where is the Air...",What about the other airlines?,What is the location of baggage services for o...,The baggage service for other airlines is mana...,"For Corsair, Easyjet, Royal Air Maroc, Royal J...","[{'role': 'user', 'content': 'Where is the Air...","[air, france, baggage, service, air, france, b...","[0.0016661871, 0.0027206542, 0.0011027243, -0....",3
98,"[{'role': 'user', 'content': 'What should I do...",And if my baggage is damaged?,What should I do if my baggage is damaged at L...,"If your baggage is damaged at Lyon Airport, re...","If your baggage is damaged, you should seek in...","[{'role': 'user', 'content': 'What should I do...","[lose, item, lyon, lose, item, lyon, contact, ...","[3.424577e-05, 0.0043963077, 0.0009705579, -0....",3




Cluster 4:


Unnamed: 0,history,question,query_modifier_question,generated_answer,answer,text,tokens,doc_vector,cluster
14,"[{'role': 'user', 'content': 'What is the maxi...",How many weeks of paid leave are guaranteed?,How many weeks of paid leave are guaranteed fo...,Workers in the EU are guaranteed at least 4 we...,Each worker is guaranteed a minimum period of ...,"[{'role': 'user', 'content': 'What is the maxi...","[maximum, number, hours, worker, eu, work, max...","[-0.0011307788, 0.0018871715, 0.0011318149, -0...",4
4,"[{'role': 'user', 'content': 'How can I instal...",Is there any cost involved?,Is there any cost involved in installing Lilo ...,Not enough information is available about the ...,"No, using Lilo in a business setting is simple...","[{'role': 'user', 'content': 'How can I instal...","[install, lilo, install, lilo, let, employee, ...","[-0.0046512317, 0.0017406998, 0.0032785838, 0....",4
82,"[{'role': 'user', 'content': ""What was the rol...",How many ministerial departments were there du...,How many ministerial departments existed durin...,"During the French Revolution, there were gener...","During the Revolution, there were generally si...","[{'role': 'user', 'content': ""What was the rol...","[role, ministers, executors, similar, predeces...","[0.0004062135, 0.0012135122, -0.00035362944, 0...",4
133,"[{'role': 'user', 'content': 'How many passeng...",And how many movements were there?,How many aircraft movements were there at Atla...,Not enough information is available to determi...,"There were 8,109 movements at the Atlantic Cit...","[{'role': 'user', 'content': 'How many passeng...","[passengers, did, atlantic, city, internationa...","[0.0010982389, -0.00047543034, 0.0013788722, -...",4
132,"[{'role': 'user', 'content': 'Who operates the...",When does the concession end?,When does VINCI Airports' concession for opera...,The operating contract for Atlantic City Inter...,The concession for the Atlantic City Internati...,"[{'role': 'user', 'content': 'Who operates the...","[operates, atlantic, city, international, airp...","[0.0010492854, -0.0014025162, 0.0005625584, -0...",4




