In [11]:
import pandas as pd 
import numpy as np
import os
from src.utils.functions import get_embeddings, find_best_skill_match
from fastapi import FastAPI, Request
from sentence_transformers import SentenceTransformer, util
import torch

In [12]:
# Show the current working directory; the './data/...' paths used in the
# following cells are relative, so they resolve against this directory.
os.getcwd()

'/Users/eugenechua/Downloads/ssg_crosswalking_api'

In [13]:
# Read the raw SSG skills extract (path is relative to the working directory)
raw_skills_csv = './data/2k_skills_10112025.csv'
ssg_skills_df = pd.read_csv(raw_skills_csv)
# Preview the first five rows
ssg_skills_df.head()

Unnamed: 0,skill_id,skill_title,skill_type,skill_description,is_sfs_emerging,is_casl,entry_date
0,04ca1b233344a66c112bbcc2d35c022d5b0bd17557888b...,Aircraft Fuel and Engine Systems Maintenance,tsc,Maintain aircraft fuel and engine systems usin...,False,False,16/12/2024
1,64763b9e4194a6b23bff32feaa49a531d9c8e32f7e0e38...,Cooling System Design,tsc,"Design cooling systems for ships, rigs and/or ...",False,False,16/12/2024
2,35643c9c9767853e61a35a574ca6fed5aff2d81bb496d2...,Control Room Operations Management,tsc,Perform control room operations in order to mo...,False,False,16/12/2024
3,0c0905aa03a160325bc8721681d5deefe9b3f1798680e5...,Service Partnerships,tsc,Collaborate with partners to deliver and enhan...,False,False,16/12/2024
4,68ea7964f03a86c0c30631d93d8fce4cf1fc04f6caf746...,Post-Landing Operations,tsc,Taxi aircraft from runways to parking stands a...,False,False,16/12/2024


In [14]:
# Remove the unwanted metadata columns; `columns=` already selects the column
# axis, so no separate `axis` argument is needed.
unwanted_columns = ['is_sfs_emerging', 'is_casl', 'entry_date']
ssg_skills_df = ssg_skills_df.drop(columns=unwanted_columns)
ssg_skills_df.head()

Unnamed: 0,skill_id,skill_title,skill_type,skill_description
0,04ca1b233344a66c112bbcc2d35c022d5b0bd17557888b...,Aircraft Fuel and Engine Systems Maintenance,tsc,Maintain aircraft fuel and engine systems usin...
1,64763b9e4194a6b23bff32feaa49a531d9c8e32f7e0e38...,Cooling System Design,tsc,"Design cooling systems for ships, rigs and/or ..."
2,35643c9c9767853e61a35a574ca6fed5aff2d81bb496d2...,Control Room Operations Management,tsc,Perform control room operations in order to mo...
3,0c0905aa03a160325bc8721681d5deefe9b3f1798680e5...,Service Partnerships,tsc,Collaborate with partners to deliver and enhan...
4,68ea7964f03a86c0c30631d93d8fce4cf1fc04f6caf746...,Post-Landing Operations,tsc,Taxi aircraft from runways to parking stands a...


In [15]:
# Text normalisation in one chained step:
#   1. lowercase the skill title and the skill description for consistent processing
#   2. join them into a single "<title>-<description>" field
# `assign` evaluates its keyword arguments in order, so the combined column is
# built from the already-lowercased title and description.
ssg_skills_df = ssg_skills_df.assign(
    skill_title=lambda frame: frame['skill_title'].str.lower(),
    skill_description=lambda frame: frame['skill_description'].str.lower(),
    skill_description_combined=lambda frame: (
        frame['skill_title'] + "-" + frame['skill_description']
    ),
)
ssg_skills_df.head()

Unnamed: 0,skill_id,skill_title,skill_type,skill_description,skill_description_combined
0,04ca1b233344a66c112bbcc2d35c022d5b0bd17557888b...,aircraft fuel and engine systems maintenance,tsc,maintain aircraft fuel and engine systems usin...,aircraft fuel and engine systems maintenance-m...
1,64763b9e4194a6b23bff32feaa49a531d9c8e32f7e0e38...,cooling system design,tsc,"design cooling systems for ships, rigs and/or ...",cooling system design-design cooling systems f...
2,35643c9c9767853e61a35a574ca6fed5aff2d81bb496d2...,control room operations management,tsc,perform control room operations in order to mo...,control room operations management-perform con...
3,0c0905aa03a160325bc8721681d5deefe9b3f1798680e5...,service partnerships,tsc,collaborate with partners to deliver and enhan...,service partnerships-collaborate with partners...
4,68ea7964f03a86c0c30631d93d8fce4cf1fc04f6caf746...,post-landing operations,tsc,taxi aircraft from runways to parking stands a...,post-landing operations-taxi aircraft from run...


In [17]:
# Export the processed ssg_skills_df.
# index=False keeps the DataFrame's row index out of the file. Without it, every
# export / re-read cycle adds another "Unnamed: N" column (the input CSV read
# above already carries one such "Unnamed: 0" column).
ssg_skills_df.to_csv('./data/ssg_skills.csv', index=False)

In [6]:
# Load the mpnet sentence-embedding model by its checkpoint name
mpnet_checkpoint = "all-mpnet-base-v2"
model = SentenceTransformer(mpnet_checkpoint)

In [7]:
# Pull the combined title/description column out as a plain list of strings
combined_text = ssg_skills_df['skill_description_combined']
texts = combined_text.astype(str).to_list()
# Embed every skill text with the project helper, asking for a tensor result
ssg_embeddings = get_embeddings(texts, model, to_tensor=True)
# Report the shape of the resulting embeddings
print(ssg_embeddings.shape)

Batches: 100%|██████████| 49/49 [00:22<00:00,  2.22it/s]

torch.Size([3123, 768])





In [8]:
# Persist the computed skill embeddings to disk so they can be reloaded later
embeddings_file = "./data/ssg_skill_embeddings.pt"
torch.save(ssg_embeddings, embeddings_file)

In [7]:
# Sanity check: reload the saved embeddings file and print its type and shape
skill_embeddings = torch.load('./data/ssg_skill_embeddings.pt')
for summary in (type(skill_embeddings), skill_embeddings.shape):
    print(summary)

<class 'torch.Tensor'>
torch.Size([3123, 768])


In [None]:
# Sample skill title and description used to try out the matching below
title = "statistical data analysis"
description = "Ability to collect, clean, model and interpret datasets using statistical methods and tools to generate insights, identify patterns, and support data-driven decision-making"
# Echo both inputs, one per line
print(title, description, sep="\n")

statistical data analysis
Ability to collect, clean, model and interpret datasets using statistical methods and tools to generate insights, identify patterns, and support data-driven decision-making.


In [9]:
# Build the query text in the same "<title>-<description>" layout as the
# combined column of ssg_skills_df
query = f"{title}-{description}"
# Encode the query into an embedding tensor
query_emb = model.encode([query], convert_to_tensor=True)
# Cosine similarity between the query and every stored skill embedding
scores = util.cos_sim(query_emb, skill_embeddings)[0]
# Row position of the highest-scoring skill
top_idx = torch.argmax(scores).item()

# top_idx is a row position in the embedding matrix, so it only identifies the
# right skill when the DataFrame has exactly one row per embedding.
assert len(ssg_skills_df) == skill_embeddings.shape[0], \
    "ssg_skills_df and skill_embeddings have different numbers of rows"

# Look the row up by position (.iloc) rather than by index label.
best_match_row = ssg_skills_df.iloc[top_idx]
best_match_combined = best_match_row['skill_description_combined']
# Read the title and description from their own columns instead of splitting
# the combined text on "-": a hyphenated title such as "post-landing operations"
# would otherwise be cut at its own hyphen.
mapped_skill_title = best_match_row['skill_title']
mapped_skill_description = best_match_row['skill_description']
print(mapped_skill_title)
print(mapped_skill_description)

print(top_idx)

business data analysis
implementing data analytics within the organisation to generate business insights and intelligence through the use of statistical and computational techniques and tools, algorithms, predictive data modelling and data visualisation
376


In [18]:
# Look up the best match with the project helper; the returned value is
# displayed as the cell output.
best_skill_match = find_best_skill_match(
    title,
    description,
    model,
    skill_embeddings,
    ssg_skills_df,
)
best_skill_match

{'input_skill_title': 'statistical data analysis',
 'input_skill_description': 'Ability to collect, clean, model and interpret datasets using statistical methods and tools to generate insights, identify patterns, and support data-driven decision-making.',
 'output_skill_id': '34aabd11c5361dd678ad053e3b5643bc8301fd0858fe9ffa3a085d19df45d498',
 'output_skill_title': 'business data analysis',
 'output_skill_description': 'implementing data analytics within the organisation to generate business insights and intelligence through the use of statistical and computational techniques and tools, algorithms, predictive data modelling and data visualisation',
 'score': 0.7070313692092896}