In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import pickle
import ast
import torch
import torch.nn.functional as F
import numpy as np
from transformers import AutoTokenizer, AutoModel

In [2]:
# Mean Pooling - Takes attention mask into account for correct averaging
def job_title_mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Function to get embeddings for a batch of job titles
def get_job_title_embeddings(job_titles, tokenizer, model, batch_size=32, device='cpu'):
    embeddings = []
    
    for i in range(0, len(job_titles), batch_size):
        batch_titles = job_titles[i:i + batch_size]
        
        # Tokenize the job titles
        encoded_input = tokenizer(batch_titles, padding=True, truncation=True, return_tensors='pt').to(device)
        
        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)
        
        # Perform mean pooling
        sentence_embeddings = job_title_mean_pooling(model_output, encoded_input['attention_mask'])
        
        # Normalize the embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        
        # Append to list
        embeddings.append(sentence_embeddings.cpu().numpy())
    
    # Return embeddings as a numpy array
    return np.vstack(embeddings)

In [3]:
mlb = MultiLabelBinarizer()
mlb.fit([['contract'], ['fulltime'], ['internship'], ['parttime'], ['temporary']])

print("Loading graph from pickle file...")
with open('./recommendation_cache/final_complete_graph.pkl', 'rb') as f:
    graph = pickle.load(f)
print("Setup complete!")

Loading graph from pickle file...
Setup complete!


In [4]:
# Convert graph nodes to DataFrame
nodes_data = []
for node, attrs in graph.nodes(data=True):
    node_dict = {'node_id': node}
    node_dict.update(attrs)
    nodes_data.append(node_dict)

df = pd.DataFrame(nodes_data)

df.head()

Unnamed: 0,node_id,job_title_embedding,job_description_embedding,company,job_type_encoding,is_remote,lat_long
0,job_0,"[-0.0323692635, 0.037560612, -0.0851607397, -0...","[0.0364928544, 0.0229546651, 0.0382401571, 0.0...",PHOENIX OPCO PTE. LTD.,"[0, 1, 0, 0, 0]",False,"(1.2744927000000001, 103.84404662674353)"
1,job_1,"[-0.0593948737, 0.0716273487, -0.0681853294, 0...","[-0.00784693379, 0.0408150293, -0.0307610631, ...",Kopitiam Investment Pte Ltd,"[0, 1, 0, 0, 0]",False,"(1.2899175, 103.8519072)"
2,job_2,"[-0.0508020073, -0.00822946988, -0.0356351659,...","[-0.00660971645, 0.0656373128, -0.0250155218, ...",Oomph Pte. Ltd.,"[0, 1, 0, 0, 0]",False,"(1.2899175, 103.8519072)"
3,job_3,"[-0.0720216036, -0.000101672507, 0.0526241288,...","[0.00460857758, 0.0135099608, 0.048739396, -0....",ECOCLEAN MAINTENANCE PTE. LTD.,"[0, 1, 0, 0, 0]",False,"(1.2899175, 103.8519072)"
4,job_4,"[-0.0362352729, 0.0467952862, -0.0649954677, -...","[0.0213460047, 0.0483995192, -0.00763951195, 0...",Anradus Pte Ltd,"[0, 1, 0, 0, 0]",False,"(1.28308855, 103.85046008586423)"


In [5]:
df['job_title_embedding'] = df['job_title_embedding'].apply(np.array)

In [6]:
df_old = pd.read_csv('6.1.cleaned_and_merged_back.csv')
df_old['lat_long'] = df_old['lat_long'].apply(ast.literal_eval)

  df_old = pd.read_csv('6.1.cleaned_and_merged_back.csv')


In [7]:
# Load tokenizer and model from Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')

# If GPU is available, move model to GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [8]:
# Process job titles in batches
job_titles = df_old['title'].tolist()
embeddings = get_job_title_embeddings(job_titles, tokenizer, model, batch_size=256, device=device)

# Assign the embeddings to the dataframe as a list for each row
df_old['job_title_embedding'] = [embedding for embedding in embeddings]

In [9]:
df_old['job_title_embedding'][0]

array([-3.23692635e-02,  3.75606120e-02, -8.51607397e-02, -4.53846455e-02,
       -5.46593554e-02, -2.79194303e-02,  2.78429296e-02,  2.15941686e-02,
        3.25031728e-02,  8.87234812e-04,  3.66467610e-02, -4.50273678e-02,
       -1.09668206e-02, -3.68140303e-02, -4.51355092e-02,  5.84830716e-02,
        1.14421636e-01,  3.79366949e-02, -1.09098526e-02, -4.50774394e-02,
       -6.36187941e-02, -2.83169001e-02, -7.49196559e-02,  4.60793413e-02,
       -8.73038266e-03,  4.90450747e-02, -2.30729543e-02,  8.33488330e-02,
        1.08598638e-02, -8.04005265e-02, -9.02405754e-02,  2.68752687e-02,
        2.91939843e-02,  2.03662012e-02, -4.73717880e-03, -2.22695544e-02,
       -2.55234633e-02,  4.52349409e-02,  2.86074150e-02, -1.15934608e-03,
       -2.06206571e-02, -9.99114104e-03, -1.18169105e-02, -2.74297539e-02,
       -8.19667950e-02,  1.21426117e-02, -5.15504405e-02, -1.48908487e-02,
        3.12867016e-02,  4.25315909e-02,  5.44571318e-02,  7.82229751e-03,
        1.11881550e-02,  

In [10]:
df['job_title_embedding'][0]

array([-3.23692635e-02,  3.75606120e-02, -8.51607397e-02, -4.53846455e-02,
       -5.46593554e-02, -2.79194303e-02,  2.78429296e-02,  2.15941686e-02,
        3.25031728e-02,  8.87234812e-04,  3.66467610e-02, -4.50273678e-02,
       -1.09668206e-02, -3.68140303e-02, -4.51355092e-02,  5.84830716e-02,
        1.14421636e-01,  3.79366949e-02, -1.09098526e-02, -4.50774394e-02,
       -6.36187941e-02, -2.83169001e-02, -7.49196559e-02,  4.60793413e-02,
       -8.73038266e-03,  4.90450747e-02, -2.30729543e-02,  8.33488330e-02,
        1.08598638e-02, -8.04005265e-02, -9.02405754e-02,  2.68752687e-02,
        2.91939843e-02,  2.03662012e-02, -4.73717880e-03, -2.22695544e-02,
       -2.55234633e-02,  4.52349409e-02,  2.86074150e-02, -1.15934608e-03,
       -2.06206571e-02, -9.99114104e-03, -1.18169105e-02, -2.74297539e-02,
       -8.19667950e-02,  1.21426117e-02, -5.15504405e-02, -1.48908487e-02,
        3.12867016e-02,  4.25315909e-02,  5.44571318e-02,  7.82229751e-03,
        1.11881550e-02,  

In [11]:
type(df['job_title_embedding'][0]), type(df_old['job_title_embedding'][0])

(numpy.ndarray, numpy.ndarray)

In [12]:
# Convert lists to numpy arrays in df to match the format of df_old
df['job_title_embedding'] = df['job_title_embedding'].apply(np.array)

# Tolerance-based comparison using np.allclose to account for small differences
mismatches = ~np.array([np.allclose(a, b, atol=1e-6) for a, b in zip(df['job_title_embedding'], df_old['job_title_embedding'])])

# Get indices of mismatches
mismatch_indices = np.where(mismatches)[0]

# Print indices of mismatches
if len(mismatch_indices) > 0:
    print("Mismatches found at indices:", mismatch_indices)
else:
    print("All embeddings match exactly.")


All embeddings match exactly.


In [13]:
# Check if lat_long values match between df and df_old
mismatches = df['lat_long'] != df_old['lat_long']

# Get indices of mismatches
mismatch_indices = mismatches[mismatches].index

# Print indices of mismatches
if not mismatch_indices.empty:
    print("Mismatches found at indices:", mismatch_indices.tolist())
else:
    print("All lat_long values match exactly.")


All lat_long values match exactly.


In [14]:
df.shape, df_old.shape

((25142, 7), (25142, 34))

In [15]:
# Identify columns that are in both df and df_old
common_columns = df.columns.intersection(df_old.columns)

# Select only the columns in df_old that are not in df
df_old_unique = df_old[df_old.columns.difference(common_columns)]

# Merge on the index, bringing only unique columns from df_old
merged_df = df.merge(df_old_unique, left_index=True, right_index=True, how='left')

# Display the merged DataFrame
print("Merged DataFrame without duplicated columns:")
print(merged_df)


Merged DataFrame without duplicated columns:
         node_id                                job_title_embedding  \
0          job_0  [-0.0323692635, 0.037560612, -0.0851607397, -0...   
1          job_1  [-0.0593948737, 0.0716273487, -0.0681853294, 0...   
2          job_2  [-0.0508020073, -0.00822946988, -0.0356351659,...   
3          job_3  [-0.0720216036, -0.000101672507, 0.0526241288,...   
4          job_4  [-0.0362352729, 0.0467952862, -0.0649954677, -...   
...          ...                                                ...   
25137  job_25137  [-0.0373321287, 0.0151110766, -0.0551175214, 0...   
25138  job_25138  [-0.0560089499, 0.0340756066, -0.0320017785, -...   
25139  job_25139  [0.0100341607, -0.0136072841, -0.0523362756, 0...   
25140  job_25140  [-0.023009764, 0.0560570732, -0.0101536363, 0....   
25141  job_25141  [-0.0467093848, 0.0244118106, 0.000753627974, ...   

                               job_description_embedding  \
0      [0.0364928544, 0.0229546651, 0.0382

In [16]:
merged_df.shape

(25142, 37)

In [17]:
merged_df.columns

Index(['node_id', 'job_title_embedding', 'job_description_embedding',
       'company', 'job_type_encoding', 'is_remote', 'lat_long', 'address',
       'banner_photo_url', 'ceo_name', 'ceo_photo_url', 'cleaned_address',
       'company_addresses', 'company_description', 'company_industry',
       'company_num_employees', 'company_revenue', 'company_url',
       'company_url_direct', 'currency', 'date_posted', 'description_x',
       'description_y', 'emails', 'id', 'interval', 'job_type', 'job_url',
       'job_url_direct', 'location', 'logo_photo_url', 'max_amount',
       'min_amount', 'model_response', 'salary_source', 'site', 'title'],
      dtype='object')

In [27]:
# Group by 'id' and list the associated 'node_id' values
id_node_mappings = merged_df.groupby('node_id')['id'].apply(list)

# Display the mapping
print("Node IDs mapped to each ID:")
print(id_node_mappings)

# Define the file path
output_file = '9.2.id_node_mappings.txt'

# Open the file in write mode and save the mappings
with open(output_file, 'w') as f:
    for id_val, node_ids in id_node_mappings.items():
        # Write the id and associated node_ids list to the file
        f.write(f"ID: {id_val} -> Node IDs: {node_ids}\n")

print(f"Mappings saved to {output_file}")


Node IDs mapped to each ID:
node_id
job_0       [0005a77c5af02f32]
job_1       [000989af12dd337f]
job_10      [001b2c9b4fd18d72]
job_100     [00fa6252a365193c]
job_1000    [0a02fc38c0c685a9]
                   ...        
job_9995    [6596b4e899cf10fa]
job_9996    [659a44b63abd13a1]
job_9997    [659d566f44a48a70]
job_9998    [bf39a3334467678d]
job_9999    [659d6c7d3bf96c3a]
Name: id, Length: 25142, dtype: object
Mappings saved to 9.2.id_node_mappings.txt


In [29]:
# merged_df.to_csv('9.1.final_with_graph_data.csv', index=False)

# Save merged_df to a pickle file
merged_df.to_pickle("final_complete_graph_dataframe.pkl")
