In [13]:
import pandas as pd
import re

In [14]:
sample_file_path = '../../Data/Sample2.csv'
sample_data = pd.read_csv(sample_file_path)


In [15]:
onet_data = pd.read_csv("../../Data/Occupation Data.txt", sep="\t")

# Displaying the first few rows of the O*NET data to understand the structure
onet_data.head()

Unnamed: 0,O*NET-SOC Code,Title,Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."


In [16]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and punctuations
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [17]:
# Apply preprocessing to the O*NET titles
onet_data['Title'] = onet_data['Title'].apply(preprocess_text)

# Apply preprocessing to the sample titles
sample_data['Title'] = sample_data['Title'].apply(preprocess_text)

# Displaying the first few rows of the preprocessed O*NET and sample data
onet_data['Title'].head(), sample_data['Title'].head()

(0                       chief executives
 1          chief sustainability officers
 2        general and operations managers
 3                            legislators
 4    advertising and promotions managers
 Name: Title, dtype: object,
 0                             rn
 1    associate software engineer
 2                           rnor
 3    certified medical assistant
 4       business process analyst
 Name: Title, dtype: object)

In [19]:
from transformers import BertTokenizer, BertModel
import torch

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to embed a list of titles using BERT
def embed_titles(titles):
    embeddings = []
    for title in titles:
        # Tokenize the title
        inputs = tokenizer(title, return_tensors="pt", padding=True, truncation=True)
        # Get BERT's output
        outputs = model(**inputs)
        # Get the average of the last hidden state to represent the title
        embedding = torch.mean(outputs.last_hidden_state, dim=1)
        embeddings.append(embedding.squeeze().detach())
    return torch.stack(embeddings)

onet_titles = onet_data['Title'].unique()

# Embed O*NET titles
onet_embeddings = embed_titles(onet_titles)

# Embed sample titles
sample_embeddings = embed_titles(sample_data['Title'])

# Check the shape of the embeddings
onet_embeddings.shape, sample_embeddings.shape


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(torch.Size([1016, 768]), torch.Size([175, 768]))

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity between sample and O*NET embeddings
similarities = cosine_similarity(sample_embeddings, onet_embeddings)

# Get the index of the most similar O*NET title for each sample title
most_similar_indices = similarities.argmax(axis=1)

# Map the indices to the actual O*NET titles
matched_onet_titles = [onet_titles[idx] for idx in most_similar_indices]

# Create a DataFrame with the original titles and matched O*NET titles
result_df = pd.DataFrame({
    'Original_Title': sample_data['Title'],
    'Matched_ONET_Title': matched_onet_titles
})

# Optionally, save the result to a CSV file
result_df.to_csv('matched_titles.csv', index=False)

result_df.head()


Unnamed: 0,Original_Title,Matched_ONET_Title
0,rn,legislators
1,associate software engineer,computer hardware engineers
2,rnor,legislators
3,certified medical assistant,clinical nurse specialists
4,business process analyst,business intelligence analysts


In [23]:
result_df.to_csv("result.csv")

In [24]:
result_df.shape

(175, 2)

In [25]:
result_df

Unnamed: 0,Original_Title,Matched_ONET_Title
0,rn,legislators
1,associate software engineer,computer hardware engineers
2,rnor,legislators
3,certified medical assistant,clinical nurse specialists
4,business process analyst,business intelligence analysts
...,...,...
170,job fair virtual teachers,instructional coordinators
171,parttime store associate,customer service representatives
172,store associate,model makers wood
173,mental health specialist correctional services,mental health and substance abuse social workers
