**Load training data from Excel into a DataFrame**

In [4]:
import pandas as pd
from tkinter import Tk
from tkinter.filedialog import askopenfilename

# Use tkinter to select the Excel file
def select_file():
    """Show a file-open dialog and return the path the user picked.

    Returns:
        The selected file path. If the dialog is cancelled the value returned
        by ``askopenfilename`` is passed through unchanged (an empty, falsy
        value), so callers can simply test ``if file_path:``.
    """
    # Keep a handle on the root window so it can be cleaned up afterwards;
    # a bare `Tk().withdraw()` leaves a hidden root alive after every call.
    root = Tk()
    root.withdraw()  # hide the empty root window; only the dialog should show
    # Ask the window manager to keep the dialog above other windows so it does
    # not open hidden behind the notebook/browser window.
    root.attributes("-topmost", True)
    try:
        file_path = askopenfilename(
            title="Select Training Excel File",
            filetypes=[("Excel files", "*.xlsx *.xls"), ("All files", "*.*")]
        )
    finally:
        # Always release the Tk root, even if the dialog raised.
        root.destroy()
    return file_path

# Ask the user to pick the workbook, then load its 'Sheet1' worksheet
print("Please select the training data file:")
training_excel_file = select_file()

if not training_excel_file:
    # Dialog was cancelled: nothing is loaded, so `df_training` stays undefined
    print("No file selected. Please try again.")
else:
    df_training = pd.read_excel(training_excel_file, sheet_name='Sheet1')
    print("Top 5 rows in the training DataFrame:")
    print(df_training.head())

Please select the training data file:
Top 5 rows in the training DataFrame:
   Unnamed: 0                               Combined Description  \
0       93203      PROVISION OF CATERING SERVICES    [Caterin...   
1       41610    PROVISION OF LOGISTICS SERVICES FOR HOME TEA...   
2       95154     WOG Video and Animation Period Contract and...   
3       58051    PROVISION OF LOGISTICS SERVICES FOR HOME TEA...   
4        4906    SUPPLY OF CALL CENTRE SYSTEM WITH MAINTENANC...   

                                     Commitment item  \
0  212901 - Other Assets511999 - Direct Project: ...   
1  212401 - Other Equipment511999 - Direct Projec...   
2     218999 - Other Services511699 - Other Services   
3     218999 - Other Services511199 - Other Manpower   
4  226301 - Maintenance: ICT Hardware Integrated ...   

                                         predictions  confidence  \
0  212901 - Other Assets511999 - Direct Project: ...    0.969243   
1  212401 - Other Equipment511999 - Direct

**Data cleansing**

In [6]:
# Ensure the necessary columns exist
required_columns = ['Combined Description', 'Commitment item']
if any(col not in df_training.columns for col in required_columns):
    raise ValueError("The training Excel file must contain 'Combined Description' and 'Commitment item' columns.")

# NOTE: this cell rebinds `df_training` and drops 'Commitment item', so running it
# a second time without re-running the load cell raises the ValueError above.

# Data cleansing: strip whitespace *before* dropping nulls, so descriptions that
# consist only of whitespace become missing values and are removed as well
# (stripping after dropna would keep them as empty strings).
stripped_description = df_training['Combined Description'].str.strip()
df_training = (
    df_training
    .assign(**{'Combined Description': stripped_description.mask(stripped_description == '')})
    .dropna(subset=required_columns)
)

# Split 'Commitment item' at the first ' - ' into number and description.
# reindex guarantees exactly two columns even when no row contains the
# separator (or the frame is empty), in which case `description` is all-missing
# instead of the assignment failing on a column-count mismatch.
split_commitment = (
    df_training['Commitment item']
    .str.split(' - ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
df_training[['material_number', 'description']] = split_commitment

df_training = df_training.drop(columns=['Commitment item'])

# Display cleaned DataFrame
print("\nCleaned training DataFrame:")
print(df_training[['material_number', 'description', 'Combined Description']].head())


Cleaned training DataFrame:
  material_number                                        description  \
0          212901        Other Assets511999 - Direct Project: Others   
1          212401     Other Equipment511999 - Direct Project: Others   
2          218999              Other Services511699 - Other Services   
3          218999              Other Services511199 - Other Manpower   
4          226301  Maintenance: ICT Hardware Integrated with Soft...   

                                Combined Description  
0  PROVISION OF CATERING SERVICES    [Catering 01...  
1  PROVISION OF LOGISTICS SERVICES FOR HOME TEAM ...  
2  WOG Video and Animation Period Contract and Fr...  
3  PROVISION OF LOGISTICS SERVICES FOR HOME TEAM ...  
4  SUPPLY OF CALL CENTRE SYSTEM WITH MAINTENANCE ...  


**Vector embedding of materials.json**

In [2]:
import sys
import os

# The parent directory must be on sys.path *before* `src` is imported;
# extending the path after the import statement is too late to help that
# import resolve on a fresh kernel.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(parent_dir)

from src import search

json_file = '../data/processed/materials.json'

# Initialize the search engine with reference data
search_engine = search.SentenceTransformerSearch(data_file=json_file)

print("\nVector embedding completed for reference data")


Vector embedding completed for reference data


**Split the DataFrame into train and test sets, and evaluate the model's accuracy on the held-out test split**

In [10]:
from sklearn.model_selection import train_test_split
from src.search import evaluate_model
import json

# Where the evaluation results are written for later analysis
output_file = "../data/processed/results.json"

# Hold out half of the rows; the fixed random_state keeps the split reproducible
train_df, test_df = train_test_split(df_training, random_state=42, test_size=0.5)

# Score the search engine on the held-out test split (train_df is not used here)
results = evaluate_model(search_engine, test_df, top_k=5)

# Persist the results as indented JSON
with open(output_file, "w") as results_fp:
    json.dump(results, results_fp, indent=4)

print(f"\nEvaluation completed. Results saved to {output_file}")


Evaluation completed. Results saved to ../data/processed/results.json


**Generic results**

In [12]:
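# NOTE(review): every line in this cell is commented out, so running it prints nothing.
# Uncomment the cell to print the overall accuracy and the first five evaluation results.
# # Truncate long strings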
# def truncate_text(text, max_length=30):
#     return text if len(text) <= max_length else text[:max_length] + "..."

# # Calculate accuracy
# accuracy = sum(1 for result in results if result["is_correct"]) / len(results)
# print(f"Accuracy: {accuracy:.2%}")

# # Display evaluation results
# print("\nEvaluation Results:")
# for idx, result in enumerate(results[:5]):  # Display first 5 results
#     query = truncate_text(result["query"], max_length=40)
#     expected = result["expected"]
#     correct = "Yes" if result["is_correct"] else "No"

#     print(f"{idx + 1}. Query: {query}")
#     print(f"   Expected: {expected}")
#     print(f"   Correct: {correct}")
#     print("   Top 5 Matches:")
    
#     for match in result["retrieved_top_5"]:
#         material_number = match["material_number"]
#         description = truncate_text(match["description"], max_length=40)
#         similarity_score = match["score"]
#         print(f"      {material_number} - {description} (Score: {similarity_score:.2f}%)")
    
#     print("-" * 40)  # Separator between results
