**Load training data from Excel into a DataFrame**

In [1]:
import pandas as pd

# Read sheet 'Sheet1' of the training workbook into a DataFrame
training_excel_file = 'C:/Users/User/Desktop/2024-04-25validate JLJ.xlsx'
df_training = pd.read_excel(training_excel_file, sheet_name='Sheet1')

# Preview the first five rows as a quick sanity check of the load
print("Top 5 rows in the training DataFrame:", df_training.head(), sep="\n")

Top 5 rows in the training DataFrame:
   Unnamed: 0                               Combined Description  \
0       93203      PROVISION OF CATERING SERVICES    [Caterin...   
1       41610    PROVISION OF LOGISTICS SERVICES FOR HOME TEA...   
2       95154     WOG Video and Animation Period Contract and...   
3       58051    PROVISION OF LOGISTICS SERVICES FOR HOME TEA...   
4        4906    SUPPLY OF CALL CENTRE SYSTEM WITH MAINTENANC...   

                                     Commitment item  \
0  212901 - Other Assets511999 - Direct Project: ...   
1  212401 - Other Equipment511999 - Direct Projec...   
2     218999 - Other Services511699 - Other Services   
3     218999 - Other Services511199 - Other Manpower   
4  226301 - Maintenance: ICT Hardware Integrated ...   

                                         predictions  confidence  \
0  212901 - Other Assets511999 - Direct Project: ...    0.969243   
1  212401 - Other Equipment511999 - Direct Projec...    0.981227   
2     218999

**Data cleansing**

In [2]:
# Ensure the necessary columns exist before touching the data.
# NOTE: 'Commitment item' is dropped at the end of this cell, so re-running this
# cell without first re-running the load cell raises the ValueError below.
required_columns = ['Combined Description', 'Commitment item']
if any(column not in df_training.columns for column in required_columns):
    raise ValueError("The training Excel file must contain 'Combined Description' and 'Commitment item' columns.")

# Data cleansing: strip whitespace first, then remove unusable rows.
# Stripping before filtering means whitespace-only descriptions (which become '')
# and non-string cells (which .str.strip() turns into NaN) are removed as well,
# instead of surviving the null filter as empty/NaN text.
stripped_description = df_training['Combined Description'].str.strip()
usable_rows = (
    stripped_description.notna()
    & stripped_description.ne('')
    & df_training['Commitment item'].notna()
)
# .copy() makes the filtered frame independent before new columns are assigned.
df_training = df_training.loc[usable_rows].copy()
df_training['Combined Description'] = stripped_description.loc[usable_rows]

# Split 'Commitment item' on the first ' - ' into 'material_number' and 'description'.
# NOTE(review): in the previewed rows the value looks like two "code - name" pairs
# joined together (e.g. "212901 - Other Assets511999 - Direct Project: Others"),
# so 'description' still carries the second pair -- confirm this is intended.
df_training[['material_number', 'description']] = df_training['Commitment item'].str.split(' - ', n=1, expand=True)

# The original combined column is no longer needed once it has been split.
df_training = df_training.drop(columns=['Commitment item'])

# Display cleaned DataFrame
print("\nCleaned training DataFrame:")
print(df_training[['material_number', 'description', 'Combined Description']].head())


Cleaned training DataFrame:
  material_number                                        description  \
0          212901        Other Assets511999 - Direct Project: Others   
1          212401     Other Equipment511999 - Direct Project: Others   
2          218999              Other Services511699 - Other Services   
3          218999              Other Services511199 - Other Manpower   
4          226301  Maintenance: ICT Hardware Integrated with Soft...   

                                Combined Description  
0  PROVISION OF CATERING SERVICES    [Catering 01...  
1  PROVISION OF LOGISTICS SERVICES FOR HOME TEAM ...  
2  WOG Video and Animation Period Contract and Fr...  
3  PROVISION OF LOGISTICS SERVICES FOR HOME TEAM ...  
4  SUPPLY OF CALL CENTRE SYSTEM WITH MAINTENANCE ...  


**Vector embedding for materials.json**

In [4]:
from models import search
import numpy as np

# JSON file handed to the semantic search engine
json_file = 'models/materials.json'

# Build the search engine from materials.json; its `embeddings` attribute is
# read immediately below, so it is expected to be populated after construction
search_engine = search.SemanticSearch(data_file=json_file)

# Materialise the embeddings as a NumPy array so shape and slices can be inspected
embedding_array = np.array(search_engine.embeddings)

print("\nPreview of Embeddings:")
print("Embedding Shape:", embedding_array.shape)
print("First 5 Embeddings:\n", embedding_array[:5])


Preview of Embeddings:
Embedding Shape: (66, 384)
First 5 Embeddings:
 [[-0.07546962  0.05160305  0.0790607  ... -0.08441566  0.0499549
  -0.04027322]
 [-0.07687087 -0.03197528 -0.0162298  ...  0.00801737  0.03931644
  -0.0210543 ]
 [-0.07657489 -0.11438267 -0.02763658 ...  0.02153831  0.02106407
  -0.01627696]
 [-0.06721126 -0.0110262   0.05768544 ... -0.05953345  0.03353224
   0.0037238 ]
 [-0.04238549 -0.02467283  0.0833751  ... -0.0147712   0.04955109
  -0.00545708]]


**Split the DataFrame into train and test sets, then append new vector embeddings from the training set**

In [5]:
from sklearn.model_selection import train_test_split
from models import search
import numpy as np

# Split the training data (50/50); the fixed random_state keeps the split reproducible
train_df, test_df = train_test_split(df_training, test_size=0.5, random_state=42)

# Generate embeddings for the 'Combined Description' in train_df
combined_descriptions = train_df['Combined Description'].tolist()
new_embeddings = search_engine.model.encode(combined_descriptions)

# Snapshot the engine's original embeddings the first time this cell runs.
# Stacking onto the snapshot (rather than onto search_engine.embeddings itself)
# keeps the cell idempotent: re-running it no longer appends the training
# embeddings a second time.
if not hasattr(search_engine, '_base_embeddings'):
    search_engine._base_embeddings = np.array(search_engine.embeddings)

# Append the new embeddings to the original material embeddings
# NOTE(review): only the embedding matrix is extended here; any item list that
# SemanticSearch pairs with these rows is not updated in this cell -- confirm
# that lookups for the appended rows still resolve correctly.
search_engine.embeddings = np.vstack([search_engine._base_embeddings, new_embeddings])

# Display data and embeddings
print("\nTraining Data (Top 5 rows):")
print(train_df.head())

print("\nPreview of Updated Embeddings:")
embedding_array = np.array(search_engine.embeddings)
print("Embedding Shape:", embedding_array.shape)
print("First 5 Embeddings:\n", embedding_array[:5])


Training Data (Top 5 rows):
      Unnamed: 0                               Combined Description  \
8423       19540  Provision of Generic Application Support (GAS)...   
2508      106883  PROVISION OF A BULK TENDER ON CYBERSECURITY AN...   
8534       27033  Provision of Direct Mailing Services   DM2-084...   
1510       43963  PROVISION OF LOGISTICS SERVICES FOR HOME TEAM ...   
4314      111398  Five-Year Leasing of Digital Copiers [Ricoh]  ...   

                                            predictions  confidence  \
8423  217301 - Software as a Service511603 - Softwar...    0.968961   
2508  217501 - ICT Security/Audit Services511605 - I...    0.964394   
8534  218401 - Postage and Courier Services511699 - ...    0.962395   
1510  212401 - Other Equipment511999 - Direct Projec...    0.967198   
4314  212301 - Office Equipment511999 - Direct Proje...    0.969749   

      confi(rounded)  Match Verified GL Jeanette Unnamed: 9 material_number  \
8423            0.97   True         Na