**Load training data from Excel into a DataFrame**

In [2]:
import pandas as pd
from tkinter import Tk
from tkinter.filedialog import askopenfilename

# Use tkinter to select the Excel file
def select_file():
    """Show a native "open file" dialog and return the path the user picked.

    A hidden Tk root window is created only to host the dialog and is always
    destroyed afterwards, so repeated runs of the cell do not accumulate
    invisible root windows in the kernel process.

    Returns:
        The selected file path, or an empty (falsy) value when the dialog is
        cancelled.
    """
    root = Tk()
    root.withdraw()  # hide the empty root window; only the dialog should show
    # Raise the (hidden) root so the dialog parented to it is not opened
    # behind the browser / notebook window.
    root.attributes("-topmost", True)
    try:
        file_path = askopenfilename(
            parent=root,
            title="Select Training Excel File",
            filetypes=[("Excel files", "*.xlsx *.xls"), ("All files", "*.*")]
        )
    finally:
        # Release the Tk interpreter even if the dialog raises.
        root.destroy()
    return file_path

# Ask the user which workbook holds the training data
print("Please select the training data file:")
training_excel_file = select_file()

if not training_excel_file:
    # NOTE: df_training is not assigned on this path.
    print("No file selected. Please try again.")
else:
    # header=1 -> the second row of the 'Unique records' sheet supplies the column names
    df_training = pd.read_excel(training_excel_file, sheet_name='Unique records', header=1)

    print("Top 5 rows in the training DataFrame:")
    print(df_training.head())

Please select the training data file:
Top 5 rows in the training DataFrame:
     concatenate     MM/SM no.                                Short Text  \
0   450000721010  3.781023e+09  (CAT 2-0) DELIVERY CHARGES APPLICABLE FO   
1  4500007210570  3.781023e+09  (CAT 5-0) DELIVERY CHARGES APPLICABLE FO   
2  4500007210590  3.789990e+09  (CAT 5-2B)  SOLID FIXED WALL MOUNTING <2   
3  4500007210600  3.789990e+09  (CAT 5-2C) DOUBLE ARM SOLID WALL MOUNT <   
4  4500007210650  3.789990e+09  (CAT 5-3B)  SOLID FIXED WALL MOUNTING <2   

                              Contract Header Text  \
0  Supply of Electronics and Electrical Appliances   
1  Supply of Electronics and Electrical Appliances   
2  Supply of Electronics and Electrical Appliances   
3  Supply of Electronics and Electrical Appliances   
4  Supply of Electronics and Electrical Appliances   

                                  Contract Item Text  \
0  (CAT 2-0) DELIVERY CHARGES APPLICABLE FOR PURC...   
1  (CAT 5-0) DELIVERY CHARGES 

**Data cleansing**

In [4]:
import re

# Columns that must be present in the training sheet
required_columns = ['Contract Header and Contract Item text', 'SM', 'Validated GL']
for col in required_columns:
    if col in df_training.columns:
        continue
    # Fail on the first required column that is absent
    raise ValueError(f"The training Excel file must contain '{col}' column.")

# Data cleansing: discard every row with a null in any required column
df_training = df_training.dropna(subset=required_columns)

# Remove unwanted patterns
def clean_query_text(text):
    """Delete every ``<(><<)>...<(><<)>`` delimited span from ``text``.

    The span between two ``<(><<)>`` markers is matched lazily and without
    crossing line breaks. Leading and trailing whitespace of the result is
    trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, whitespace-trimmed string.
    """
    delimited_span = re.compile(r'<\(><<\)>(.*?)<\(><<\)>')
    without_spans = delimited_span.sub('', text)
    return without_spans.strip()

# One chained transformation:
#   1. clean + trim the combined contract text (column keeps its position),
#   2. rename it to "Combined Description",
#   3. append material_number (trimmed 'SM') and description (the part of
#      'Validated GL' after the first ' - ', trimmed),
#   4. drop the two source columns that are no longer needed.
df_training = (
    df_training
    .assign(**{
        'Contract Header and Contract Item text': lambda frame: (
            frame['Contract Header and Contract Item text']
            .apply(clean_query_text)
            .str.strip()
        )
    })
    .rename(columns={"Contract Header and Contract Item text": "Combined Description"})
    .assign(
        material_number=lambda frame: frame['SM'].str.strip(),
        description=lambda frame: (
            frame['Validated GL'].str.split(' - ', n=1).str[1].str.strip()
        ),
    )
    .drop(columns=['SM', 'Validated GL'])
)

# Preview the cleaned DataFrame
print("\nCleaned training DataFrame:")
print(df_training.loc[:, ['material_number', 'description', 'Combined Description']].head())


Cleaned training DataFrame:
  material_number                            description  \
0            E043  Other Services511699 - Other Services   
1            E043  Other Services511699 - Other Services   
2            E043  Other Services511699 - Other Services   
3            E043  Other Services511699 - Other Services   
4            E043  Other Services511699 - Other Services   

                                Combined Description  
0  Supply of Electronics and Electrical Appliance...  
1  Supply of Electronics and Electrical Appliance...  
2  Supply of Electronics and Electrical Appliance...  
3  Supply of Electronics and Electrical Appliance...  
4  Supply of Electronics and Electrical Appliance...  


**Vector embedding of materials.json**

In [6]:
import sys
import os

# Put the parent directory on the import path so the project-local `src` package resolves
parent_dir = os.path.abspath("..")
sys.path.append(parent_dir)
from src import search

# Build the search engine from the reference data file
json_file = '../data/processed/materials.json'
search_engine = search.RoBERTaSearch(data_file=json_file)

print("\nVector embedding completed for reference data")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Vector embedding completed for reference data


**Split the DataFrame into train and test sets, evaluate model accuracy on the train split, and export the results to JSON**

In [8]:
from sklearn.model_selection import train_test_split
from src.search import evaluate_model
import json

# Seeded split: 90% of the rows go to test_df, the remaining 10% to train_df
train_df, test_df = train_test_split(df_training, random_state=42, test_size=0.9)

# Only the train split is passed to the evaluation
results = evaluate_model(search_engine, train_df, top_k=5)

# Persist the evaluation output as indented JSON for later analysis
output_file = "../data/processed/results.json"
with open(output_file, "w") as json_handle:
    json.dump(results, json_handle, indent=4)

print(f"\nEvaluation completed. Results saved to {output_file}")


Evaluation completed. Results saved to ../data/processed/results.json


**Generic results**

In [10]:
# # Truncate long strings
# def truncate_text(text, max_length=30):
#     return text if len(text) <= max_length else text[:max_length] + "..."

# # Calculate accuracy
# accuracy = sum(1 for result in results if result["is_correct"]) / len(results)
# print(f"Accuracy: {accuracy:.2%}")

# # Display evaluation results
# print("\nEvaluation Results:")
# for idx, result in enumerate(results[:5]):  # Display first 5 results
#     query = truncate_text(result["query"], max_length=40)
#     expected = result["expected"]
#     correct = "Yes" if result["is_correct"] else "No"

#     print(f"{idx + 1}. Query: {query}")
#     print(f"   Expected: {expected}")
#     print(f"   Correct: {correct}")
#     print("   Top 5 Matches:")
    
#     for match in result["retrieved_top_5"]:
#         material_number = match["material_number"]
#         description = truncate_text(match["description"], max_length=40)
#         similarity_score = match["score"]
#         print(f"      {material_number} - {description} (Score: {similarity_score:.2f}%)")
    
#     print("-" * 40)  # Separator between results


In [13]:
# Flatten every evaluation result into one spreadsheet row
additional_data = []

for result in results:  # 'results' is the list of evaluation results
    # Expected material number and its description in a single cell
    expected_combined = f"{result['expected']} - {result['expected_description']}"

    # All retrieved matches (material number + score) joined into a single cell
    top_matches_combined = "; ".join(
        f"{match['material_number']} score:{match['score']}"
        for match in result["retrieved_top_5"]
    )

    additional_data.append({
        "query": result["query"],
        "expected": expected_combined,
        "matches": top_matches_combined,
        # Stored as text so the sheet shows the literal words True / False
        "is_correct": "True" if result["is_correct"] else "False"
    })

# Convert to DataFrame
additional_df = pd.DataFrame(additional_data)

# Specify export path
output_path = "../data/output/evaluation_results.xlsx"

# Name the sheet after the model; fall back to a placeholder when the engine
# exposes no `model_name` attribute.
model_name = getattr(search_engine, "model_name", "Unknown_Model")
# Excel rejects \ / ? * [ ] : in sheet titles and limits them to 31 characters.
invalid_title_chars = str.maketrans({ch: "_" for ch in "\\/?*[]:"})
sheet_name = str(model_name).translate(invalid_title_chars)[:31]

# Ensure folder exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Append mode fails with FileNotFoundError when the workbook does not exist
# yet, and `if_sheet_exists` is only accepted in append mode. So append (and
# replace a same-named sheet) only when the file is already there; otherwise
# create a fresh workbook.
if os.path.isfile(output_path):
    writer_options = {"mode": "a", "if_sheet_exists": "replace"}
else:
    writer_options = {"mode": "w"}

with pd.ExcelWriter(output_path, engine="openpyxl", **writer_options) as writer:
    additional_df.to_excel(writer, index=False, sheet_name=sheet_name)

print(f"Additional results appended to sheet '{sheet_name}' in: {output_path}")

Additional results appended to sheet 'roberta-base' in: ../data/output/evaluation_results.xlsx
