**Load training data from Excel into a DataFrame**

In [1]:
import os

import pandas as pd

# Load the training data from Excel.
# The path defaults to the original local location, but can be overridden by
# setting the INSUPPLY_TRAINING_XLSX environment variable so the notebook can
# run on machines that do not share this directory layout.
training_excel_file = os.environ.get(
    'INSUPPLY_TRAINING_XLSX',
    'C:/Users/User/Desktop/insupply-main/datasets/2024-04-25validate JLJ.xlsx',
)
df_training = pd.read_excel(training_excel_file, sheet_name='Sheet1')

# Display the top 5 rows
print("Top 5 rows in the training DataFrame:")
print(df_training.head())

Top 5 rows in the training DataFrame:
   Unnamed: 0                               Combined Description  \
0       93203      PROVISION OF CATERING SERVICES    [Caterin...   
1       41610    PROVISION OF LOGISTICS SERVICES FOR HOME TEA...   
2       95154     WOG Video and Animation Period Contract and...   
3       58051    PROVISION OF LOGISTICS SERVICES FOR HOME TEA...   
4        4906    SUPPLY OF CALL CENTRE SYSTEM WITH MAINTENANC...   

                                     Commitment item  \
0  212901 - Other Assets511999 - Direct Project: ...   
1  212401 - Other Equipment511999 - Direct Projec...   
2     218999 - Other Services511699 - Other Services   
3     218999 - Other Services511199 - Other Manpower   
4  226301 - Maintenance: ICT Hardware Integrated ...   

                                         predictions  confidence  \
0  212901 - Other Assets511999 - Direct Project: ...    0.969243   
1  212401 - Other Equipment511999 - Direct Projec...    0.981227   
2     218999

**Data cleansing**

In [2]:
# Ensure the necessary columns exist.
# NOTE: this cell drops 'Commitment item' at the end, so re-running it without
# first reloading df_training raises the ValueError below.
required_columns = ['Combined Description', 'Commitment item']
missing_columns = [col for col in required_columns if col not in df_training.columns]
if missing_columns:
    raise ValueError("The training Excel file must contain 'Combined Description' and 'Commitment item' columns.")

# Data cleansing: remove null values, then strip whitespace.
# Cast to str before using the .str accessor: on an object column, .str.strip()
# returns NaN for non-string cells, which would reintroduce nulls after dropna.
df_training = df_training.dropna(subset=required_columns)
df_training['Combined Description'] = df_training['Combined Description'].astype(str).str.strip()

# Split 'Commitment item' on the first ' - ' into a code and its description.
df_training[['material_number', 'description']] = (
    df_training['Commitment item'].astype(str).str.split(' - ', n=1, expand=True)
)

# Strip the split parts as well, so stray whitespace around the code cannot
# cause a false mismatch when material numbers are compared for equality.
# (description is missing for rows without a ' - ' separator; .str.strip()
# leaves those missing values untouched.)
df_training['material_number'] = df_training['material_number'].str.strip()
df_training['description'] = df_training['description'].str.strip()

df_training = df_training.drop(columns=['Commitment item'])

# Display cleaned DataFrame
print("\nCleaned training DataFrame:")
print(df_training[['material_number', 'description', 'Combined Description']].head())


Cleaned training DataFrame:
  material_number                                        description  \
0          212901        Other Assets511999 - Direct Project: Others   
1          212401     Other Equipment511999 - Direct Project: Others   
2          218999              Other Services511699 - Other Services   
3          218999              Other Services511199 - Other Manpower   
4          226301  Maintenance: ICT Hardware Integrated with Soft...   

                                Combined Description  
0  PROVISION OF CATERING SERVICES    [Catering 01...  
1  PROVISION OF LOGISTICS SERVICES FOR HOME TEAM ...  
2  WOG Video and Animation Period Contract and Fr...  
3  PROVISION OF LOGISTICS SERVICES FOR HOME TEAM ...  
4  SUPPLY OF CALL CENTRE SYSTEM WITH MAINTENANCE ...  


**Vector embedding of materials.json**

In [3]:
from models import search
import numpy as np

# Path (relative to the working directory) of the JSON file handed to the
# project-local search engine as its data file.
json_file = 'models/materials.json'

# Embed materials.json
# NOTE(review): presumably SemanticSearch computes the embeddings for the data
# file at construction time — confirm in models/search.py.
search_engine = search.SemanticSearch(data_file=json_file)

# Printed once the constructor has returned.
print("\nVectors embedding completed")


Vectors embedding completed


**Split the DataFrame into train and test sets, then evaluate the model's accuracy against the training set**

In [4]:
from sklearn.model_selection import train_test_split
import numpy as np
import json

# Split the data in half with a fixed seed. Only train_df is evaluated in this
# cell; test_df is not used here.
train_df, test_df = train_test_split(df_training, test_size=0.5, random_state=42)

# One record per evaluated row: the query text, the expected and retrieved
# material numbers, and whether they are equal.
results = []

# Query the search engine once per row of train_df, asking for the single
# best match, and compare its material number with the expected one.
for description, expected_code in zip(train_df["Combined Description"], train_df["material_number"]):
    # search() returns a JSON string; entry 0 corresponds to our single query.
    response = json.loads(search_engine.search([description], top_k=1))
    top_code = response[0]["matches"][0]["material_number"]

    results.append({
        "query": description,
        "expected": expected_code,
        "retrieved": top_code,
        "is_correct": top_code == expected_code,
    })

print("\nSplitting dataframe, model evaluation completed")


Splitting dataframe, model evaluation completed


**Display results**

In [5]:
# Shorten long strings for display
def truncate_text(text, max_length=50):
    """Return `text` unchanged when it has at most `max_length` characters.

    Otherwise return its first `max_length` characters followed by "...",
    so a truncated result is `max_length + 3` characters long.
    """
    if len(text) > max_length:
        return text[:max_length] + "..."
    return text

# Accuracy = share of evaluated queries whose retrieved material number
# equals the expected one.
num_correct = len([entry for entry in results if entry["is_correct"]])
accuracy = num_correct / len(results)
print(f"Accuracy: {accuracy:.2%}")

# Show the first five evaluation records, one numbered entry each.
print("\nEvaluation Results:")
for position, entry in enumerate(results[:5], start=1):
    shown_query = truncate_text(entry["query"], max_length=50)
    if entry["is_correct"]:
        verdict = "Yes"
    else:
        verdict = "No"
    print(f"{position}. Query: {shown_query}")
    print(f"   Expected: {entry['expected']}")
    print(f"   Retrieved: {entry['retrieved']}")
    print(f"   Correct: {verdict}")
    print("-" * 40)

Accuracy: 12.39%

Evaluation Results:
1. Query: Provision of Generic Application Support (GAS) for...
   Expected: 217301
   Retrieved: 226201
   Correct: No
----------------------------------------
2. Query: PROVISION OF A BULK TENDER ON CYBERSECURITY AND AU...
   Expected: 217501
   Retrieved: 217501
   Correct: Yes
----------------------------------------
3. Query: Provision of Direct Mailing Services   DM2-084 - A...
   Expected: 218401
   Retrieved: 218401
   Correct: Yes
----------------------------------------
4. Query: PROVISION OF LOGISTICS SERVICES FOR HOME TEAM RETA...
   Expected: 212401
   Retrieved: 217601
   Correct: No
----------------------------------------
5. Query: Five-Year Leasing of Digital Copiers [Ricoh]   [5-...
   Expected: 212301
   Retrieved: 275201
   Correct: No
----------------------------------------
