In [4]:
import os
import sys
from dotenv import load_dotenv

# Make the project's local `src` directory importable from this notebook.
# The path is resolved against the current working directory.
src_path = os.path.abspath('src')

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

# Load variables from a .env file into the environment (later cells read them
# with os.getenv). Kept as the last expression so the cell still displays
# load_dotenv()'s return value.
load_dotenv()

True

### 1 - Dataset loading

I - Load the "seed" dataset from a local folder

In [5]:
import utils

In [None]:
# Seed dataset configuration
#   seed_filename : path of the parquet file holding the labelled SEED examples
#   x_label       : source column with the input text (dialogue)
#   y_label       : source column with the output text (summary)
seed_filename = 'data/seed_sampled_100.parquet'
x_label, y_label = 'dialogue', 'summary'

# Read the seed examples through the project helper.
seed = utils.read_seed(seed_filename, x_label, y_label)


II - Load the "unlabeled" dataset from a local folder

In [None]:
# Unlabeled dataset configuration
#   x_label            : source column with the input text (dialogue)
#   unlabeled_filename : path of the parquet file holding the UNLABELED examples
x_label = 'dialogue'
unlabeled_filename = 'data/unlabeled_sampled_1000.parquet'

# Read the unlabeled examples through the project helper.
unlabeled = utils.read_unlabeled(unlabeled_filename, x_label)

---
### 2 - Text preprocessing module

In [6]:
from preprocessing import *

I - "seed" dataset preprocessing

In [8]:
# Preprocessing functions handed to process_dataset.
func_list = [
    remove_html_tags,
    remove_url,
    chat_conversion,
    remove_stopwords,
    spelling_correction,
    rewrite_emoji,
]

# Run the preprocessing functions over the 'seed' DataFrame.
# NOTE: the second argument is the literal column name 'x_label' (the same
# column later cells read as seed['x_label']), not the x_label variable set in
# the loading cells.
seed = process_dataset(seed, 'x_label', func_list)

II - "unlabeled" dataset preprocessing

In [9]:
# Apply the defined preprocessing functions to the 'unlabeled' dataset.
# Reuses func_list from the seed-preprocessing cell; 'x_label' is the literal
# name of the text column (later read as unlabeled['x_label']), not the
# x_label variable.
unlabeled = process_dataset(unlabeled, 'x_label', func_list)

III - Save to folder (a new folder is created, and both files are saved into it)

In [11]:
# Persist both preprocessed DataFrames. No folder_name is passed here; the
# helper reports the folder it wrote to in the cell output.
utils.save_to_folder(seed, unlabeled)

Saved to cache/2025-03-23_13:38:20


---
### 3 - Embedding Generation Module

I - Load the preprocessed dataset from the data folder: "seed" and "unlabeled"

In [6]:
# Name of the cache folder to reload; it matches the folder reported by the
# earlier save step.
# NOTE(review): the timestamp is hard-coded, so a fresh run that saves into a
# differently named folder will not be picked up here until this value is updated.
folder_name = '2025-03-23_13:38:20'

# Reload the preprocessed 'seed' and 'unlabeled' DataFrames from that folder.
seed, unlabeled = utils.read_from_folder(folder_name)

In [14]:
import torch
from embeddings import EmbeddingGenerator

# Identifier of the embedding model to load.
model = 'BAAI/bge-small-en-v1.5'

# Prefer the GPU, but fall back to the CPU so the notebook does not depend on
# a hard-coded 'cuda' device being available on the current machine.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Project wrapper that loads the tokenizer and model (see the cell output).
# NOTE(review): trust_remote_code=True lets the model repository execute its
# own code at load time — confirm this model actually requires it.
embedding_generator = EmbeddingGenerator(
    model_name=model,
    device=device,
    trust_remote_code=True,
)

  from .autonotebook import tqdm as notebook_tqdm


Loading tokenizer and model from BAAI/bge-small-en-v1.5
Trust remote code: True
Model embedding dimension: 384


II - Check if the loaded data already contains embeddings

In [15]:
# Skip the encoding step when the reloaded 'seed' frame already has embeddings.
if 'embedding' not in seed.columns:
    seed_texts = seed['x_label'].tolist()
    seed_embeddings = embedding_generator.encode_batch(
        texts=seed_texts,
        batch_size=1,
        max_length=512,
    )
    # Store one entry per element of the returned batch.
    seed['embedding'] = [vector for vector in seed_embeddings]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Tokenization statistics (from first 100 texts):
Average length: 103.5 tokens
Maximum length: 360 tokens
Texts will be truncated to 512 tokens if longer



Encoding texts: 100%|██████████| 100/100 [00:03<00:00, 27.99batch/s]


Final embedding shape: (100, 384) (100 samples, 384 dimensions)
Embeddings saved to ../embeddings/batch_bge-small-en-v1.5_20250323_133938.pkl





In [16]:
# Skip the encoding step when the reloaded 'unlabeled' frame already has embeddings.
if 'embedding' not in unlabeled.columns:
    unlabeled_texts = unlabeled['x_label'].tolist()
    unlabeled_embeddings = embedding_generator.encode_batch(
        texts=unlabeled_texts,
        batch_size=1,
        max_length=512,
    )
    # Store one entry per element of the returned batch.
    unlabeled['embedding'] = [vector for vector in unlabeled_embeddings]


Tokenization statistics (from first 100 texts):
Average length: 103.5 tokens
Maximum length: 360 tokens
Texts will be truncated to 512 tokens if longer



Encoding texts: 100%|██████████| 1000/1000 [00:37<00:00, 27.00batch/s]


Final embedding shape: (1000, 384) (1000 samples, 384 dimensions)
Embeddings saved to ../embeddings/batch_bge-small-en-v1.5_20250323_134020.pkl





III - Save 'seed' and 'unlabeled' to the same folder from which the original data was loaded.

In [17]:
# Write 'seed' and 'unlabeled' back after the embedding step, passing
# folder_name explicitly so they go to the folder they were loaded from.
utils.save_to_folder(seed, unlabeled, folder_name = folder_name)

Saved to cache/2025-03-23_13:38:20


---
### 4 - Iteration cycle

I - Connect to the (remote) database

In [10]:
# Remote vector-database option
import database

collection_name = 'collection_2'
# NOTE(review): 'IP' presumably selects inner-product similarity — confirm
# against database.VectorDatabase.
metric_type = 'IP'

# Vector size is taken from the first seed embedding.
embedding_dim = len(seed['embedding'].iloc[0])

# Connection settings are read from the environment (MILVUS_URI / MILVUS_TOKEN).
vector_db = database.VectorDatabase(
    uri=os.getenv('MILVUS_URI'),
    token=os.getenv('MILVUS_TOKEN'),
    collection_name=collection_name,
    embedding_dim=embedding_dim,
    metric_type=metric_type,
)

Collection 'collection_2' does not exist, creating...
Collection 'collection_2' created!


II - Import 'seed' data into the database (if not already done)

In [11]:
# Fetch the embeddings that are already stored in the collection.
data_from_collection = vector_db.get_collection_data(['embedding'])

# Convert the embeddings on both sides to plain Python lists for comparison.
seed_embeddings = [embedding.tolist() for embedding in seed['embedding']]
data_from_collection_embeddings = (
    [] if data_from_collection.empty else data_from_collection['embedding'].tolist()
)

# Lists are unhashable, so Series.isin on a column of lists is fragile.
# Compare hashable tuples through a set instead: the match is still an exact,
# element-by-element comparison, and the converted seed embeddings are reused
# rather than recomputed.
stored_embeddings = {tuple(embedding) for embedding in data_from_collection_embeddings}
is_new = [tuple(embedding) not in stored_embeddings for embedding in seed_embeddings]

# Keep only the seed rows whose embedding is not in the collection yet.
seed_filtered = seed.loc[is_new]

if not seed_filtered.empty:
    print(f'Inserting seed data...({len(seed_filtered)} entities left) ')
    vector_db.bulk_upsert(
        input_texts  = seed_filtered['x_label'].tolist(),
        embeddings   = seed_filtered['embedding'].tolist(),
        output_texts = seed_filtered['y_label'].tolist(),
        batch_size   = 100
    )
else:
    print('Collection already has data. Skipping seed data insertion...')

Inserting seed data...(100 entities left) 
Inserting 100 records into collection: collection_2
Total insert time: 0 seconds
Flushing collection...


III - Get the current 'unlabeled' set in case some of its data has already been processed.

In [23]:
# Fetch the embeddings that are already stored in the collection.
data_from_collection = vector_db.get_collection_data(['embedding'])

# Convert the embeddings on both sides to plain Python lists for comparison.
unlabeled_embeddings = [embedding.tolist() for embedding in unlabeled['embedding']]
data_from_collection_embeddings = (
    [] if data_from_collection.empty else data_from_collection['embedding'].tolist()
)

# Lists are unhashable, so Series.isin on a column of lists is fragile.
# Compare hashable tuples through a set instead: the match is still an exact,
# element-by-element comparison, and the converted embeddings are reused
# rather than recomputed.
stored_embeddings = {tuple(embedding) for embedding in data_from_collection_embeddings}
is_pending = [tuple(embedding) not in stored_embeddings for embedding in unlabeled_embeddings]

# Keep only the unlabeled rows whose embedding is not in the collection yet.
unlabeled_filtered = unlabeled.loc[is_pending]

print(f'{len(unlabeled_filtered)} entities left ')

892 entities left 


IV - Start main cycle

In [None]:
import llminteraction
from tqdm import tqdm

# Loop-invariant settings, resolved once instead of on every iteration.
top_k = 2
llm_model = "google/gemma-3-27b-it"
llm_temperature = 0.3
openrouter_api_key = os.getenv('OPENROUTER_API_KEY')

# Process every unlabeled row that is not in the collection yet; the row index
# itself is not needed.
for _row_index, row in tqdm(unlabeled_filtered.iterrows(), total=len(unlabeled_filtered)):

    # Get the nearest top_k neighbors of the current embedding
    neighbors = vector_db.search_engine(
        query_embedding=row['embedding'],
        top_k=top_k
    )

    # Build a prompt from the neighbors and the new dialogue
    prompt = llminteraction.build_icl_prompt(
        examples=neighbors,
        new_dialogue=row['x_label'],
    )

    # LLM interaction
    response = llminteraction.call_openrouter_llm(
        prompt=prompt,
        model=llm_model,
        openrouter_api_key=openrouter_api_key,
        temperature=llm_temperature
    )

    # Update the database with the generated output, so it can be returned as
    # a neighbor for the rows processed afterwards.
    # NOTE(review): the response is stored without validation — confirm what
    # call_openrouter_llm returns on failure before relying on stored outputs.
    vector_db.upsert(
        input_text=row['x_label'],
        embedding=row['embedding'],
        output_text=response
    )
