In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np
import textwrap
import ast

In [2]:
# Load the cleaned and merged job postings, then list the available columns.
df = pd.read_csv('6.1.cleaned_and_merged_back.csv')
df.columns

  df = pd.read_csv('6.1.cleaned_and_merged_back.csv')


Index(['title', 'company', 'job_type', 'is_remote', 'description_x', 'address',
       'cleaned_address', 'lat_long', 'model_response', 'id', 'site',
       'job_url', 'job_url_direct', 'location', 'date_posted', 'emails',
       'description_y', 'company_url', 'company_url_direct',
       'company_addresses', 'company_num_employees', 'company_revenue',
       'company_description', 'logo_photo_url', 'banner_photo_url',
       'company_industry', 'ceo_name', 'ceo_photo_url', 'salary_source',
       'interval', 'min_amount', 'max_amount', 'currency'],
      dtype='object')

In [3]:
# Strip every whitespace character so that a bare comma is the only separator
# between job types (e.g. "fulltime, contract" -> "fulltime,contract").
df['job_type_cleaned'] = df['job_type'].str.replace(r'\s+', '', regex=True)

In [4]:
# Frequency of each raw job_type string (before whitespace is stripped).
df['job_type'].value_counts()

job_type
fulltime                                               22894
fulltime, contract                                      1086
parttime, fulltime                                       516
fulltime, internship                                     275
fulltime, temporary                                      123
parttime, fulltime, contract                              72
parttime, fulltime, internship                            50
fulltime, temporary, contract                             31
parttime, fulltime, temporary, contract, internship       25
parttime, fulltime, contract, internship                  20
parttime, fulltime, temporary, contract                   17
parttime, fulltime, temporary                             12
fulltime, contract, internship                            10
parttime, fulltime, temporary, internship                  8
fulltime, temporary, internship                            2
fulltime, temporary, contract, internship                  1
Name: count, dt

In [5]:
# Frequency of each cleaned job_type string, for comparison with the raw counts.
df['job_type_cleaned'].value_counts()

job_type_cleaned
fulltime                                           22894
fulltime,contract                                   1086
parttime,fulltime                                    516
fulltime,internship                                  275
fulltime,temporary                                   123
parttime,fulltime,contract                            72
parttime,fulltime,internship                          50
fulltime,temporary,contract                           31
parttime,fulltime,temporary,contract,internship       25
parttime,fulltime,contract,internship                 20
parttime,fulltime,temporary,contract                  17
parttime,fulltime,temporary                           12
fulltime,contract,internship                          10
parttime,fulltime,temporary,internship                 8
fulltime,temporary,internship                          2
fulltime,temporary,contract,internship                 1
Name: count, dtype: int64

In [6]:
# Multi-hot encode the 'job_type' column: split the cleaned string on commas and
# store one 0/1 list per row (positions follow the order of mlb.classes_).
mlb = MultiLabelBinarizer()
df['job_type_encoded'] = mlb.fit_transform(df['job_type_cleaned'].str.split(',')).tolist()

In [7]:
# Show the class labels (the column order of the multi-hot vectors) and how often
# each encoded vector occurs.
print("Classes:", mlb.classes_)  # This shows the sorted order of the job types

df['job_type_encoded'].value_counts()

Classes: ['contract' 'fulltime' 'internship' 'parttime' 'temporary']


job_type_encoded
[0, 1, 0, 0, 0]    22894
[1, 1, 0, 0, 0]     1086
[0, 1, 0, 1, 0]      516
[0, 1, 1, 0, 0]      275
[0, 1, 0, 0, 1]      123
[1, 1, 0, 1, 0]       72
[0, 1, 1, 1, 0]       50
[1, 1, 0, 0, 1]       31
[1, 1, 1, 1, 1]       25
[1, 1, 1, 1, 0]       20
[1, 1, 0, 1, 1]       17
[0, 1, 0, 1, 1]       12
[1, 1, 1, 0, 0]       10
[0, 1, 1, 1, 1]        8
[0, 1, 1, 0, 1]        2
[1, 1, 1, 0, 1]        1
Name: count, dtype: int64

In [8]:
# Mean pooling: average the token embeddings, counting only unmasked positions
def job_title_mean_pooling(model_output, attention_mask):
    """Collapse per-token embeddings into one vector per sequence.

    Args:
        model_output: Indexable model output whose element 0 holds the token
            embeddings.
        attention_mask: Mask tensor; positions with 0 are excluded from the mean.

    Returns:
        torch.Tensor: Masked mean of the token embeddings along dimension 1.
    """
    token_embeddings = model_output[0]

    # Broadcast the mask over the embedding dimension so it can weight each token.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    weighted_sum = torch.sum(token_embeddings * mask, 1)
    # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_count

# Function to get embeddings for a batch of job titles
def get_job_title_embeddings(job_titles, tokenizer, model, batch_size=32, device='cpu'):
    """Embed job titles in batches and return one L2-normalised vector per title.

    Args:
        job_titles: Sequence of title strings (any iterable is accepted; it is
            materialised into a list before batching).
        tokenizer: Callable tokenizer returning tensors that include an
            'attention_mask' entry.
        model: Model called as ``model(**encoded_input)``.
        batch_size (int): Number of titles tokenized and embedded per forward pass.
        device (str): Device the tokenized batch is moved to.

    Returns:
        np.ndarray: Array with one row per title. For an empty input an array
        with zero rows is returned (its width is ``model.config.hidden_size``
        when that attribute exists, otherwise 0).
    """
    # Materialise once so slicing always yields plain lists for the tokenizer.
    job_titles = list(job_titles)

    # np.vstack([]) raises ValueError, so handle the empty case explicitly.
    if not job_titles:
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []

    for i in range(0, len(job_titles), batch_size):
        batch_titles = job_titles[i:i + batch_size]

        # Tokenize the job titles
        encoded_input = tokenizer(batch_titles, padding=True, truncation=True, return_tensors='pt').to(device)

        # Compute token embeddings (inference only, so no gradients are tracked)
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Perform mean pooling
        sentence_embeddings = job_title_mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize the embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

        # Move back to the CPU before converting to numpy
        embeddings.append(sentence_embeddings.cpu().numpy())

    # Return embeddings as a numpy array
    return np.vstack(embeddings)

In [9]:
# Mean pooling for description chunks: attention-mask-weighted average of tokens
def job_description_mean_pooling(model_output, attention_mask):
    """Return the masked mean of the token embeddings for each sequence.

    Args:
        model_output: Indexable model output; element 0 holds the token embeddings.
        attention_mask: Mask tensor; zero entries do not contribute to the mean.

    Returns:
        torch.Tensor: One pooled vector per input sequence.
    """
    per_token = model_output[0]
    # Give the mask the same shape as the embeddings so it can be multiplied in.
    expanded_mask = attention_mask.unsqueeze(-1).expand(per_token.size()).float()
    numerator = torch.sum(per_token * expanded_mask, 1)
    # The clamp keeps the divisor strictly positive when every position is masked.
    denominator = torch.clamp(expanded_mask.sum(1), min=1e-9)
    return numerator / denominator

# Function to process long job descriptions by splitting into overlapping chunks of text
def get_job_description_embedding(job_description, tokenizer, model, max_chunk_length=512, overlap=50, device='cpu'):
    """Embed one job description by averaging the embeddings of overlapping chunks.

    The text is split on whitespace into chunks of ``max_chunk_length - 2`` words,
    consecutive chunks sharing ``overlap`` words. Each chunk is embedded
    (mean pooling + L2 normalisation) and the chunk embeddings are averaged.

    Args:
        job_description: Description text. Non-string values (None, NaN from
            pandas) are treated as "no description".
        tokenizer: Callable tokenizer returning tensors with 'attention_mask'.
        model: Model called as ``model(**encoded_input)``.
        max_chunk_length (int): Token limit passed to the tokenizer per chunk.
        overlap (int): Number of words shared by consecutive chunks.
        device (str): Device the tokenized chunk is moved to.

    Returns:
        np.ndarray: 1-D embedding. A zero vector is returned when there is no
        text to embed; its length is ``model.config.hidden_size`` when available,
        otherwise 384.

    Raises:
        ValueError: If ``overlap`` leaves no room to advance between chunks.
    """
    embedding_dim = getattr(getattr(model, 'config', None), 'hidden_size', 384)

    # Missing descriptions come out of pandas as None/NaN, which have no .split().
    if not isinstance(job_description, str):
        return np.zeros((embedding_dim,))

    tokens_per_chunk = max_chunk_length - 2  # Leave space for special tokens
    step = tokens_per_chunk - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_chunk_length - 2 ({tokens_per_chunk})"
        )

    # NOTE(review): chunks are sized in whitespace-separated words, while the
    # tokenizer below truncates to max_chunk_length tokens. If a chunk tokenizes
    # to more tokens than words, its tail is dropped by truncation - confirm
    # whether that loss is acceptable.
    words = job_description.split()

    # Create overlapping text chunks. Stop as soon as a chunk reaches the end of
    # the text: a further chunk would contain only words already covered by the
    # overlap and would be over-weighted in the average.
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + tokens_per_chunk]))
        if start + tokens_per_chunk >= len(words):
            break

    embeddings = []
    for chunk in chunks:
        # Tokenize the chunk (raw text is passed directly)
        encoded_input = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=max_chunk_length).to(device)

        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Perform mean pooling
        sentence_embedding = job_description_mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize the embedding
        sentence_embedding = F.normalize(sentence_embedding, p=2, dim=1)

        # Append the chunk embedding
        embeddings.append(sentence_embedding.cpu().numpy())

    # Average embeddings from all chunks to get a single embedding
    if embeddings:
        return np.mean(embeddings, axis=0).squeeze()
    else:
        return np.zeros((embedding_dim,))  # Zero vector if no chunks were processed

In [10]:
# Load tokenizer and model from Hugging Face Hub (one checkpoint name for both)
MODEL_NAME = 'sentence-transformers/all-MiniLM-L12-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = model.to(device)

In [None]:
# Keep only the columns needed for graph construction. Take an explicit copy:
# later cells add embedding columns to df_new, and assigning into a bare column
# slice of df triggers pandas' SettingWithCopyWarning.
df_new = df[['title', 'company', 'job_type_encoded', 'is_remote', 'lat_long', 'model_response']].copy()
df_new.columns

In [None]:
# Preview the first rows of the reduced frame.
df_new.head()

In [None]:
# Embed every job title, 256 titles per batch
job_titles = df_new['title'].tolist()
embeddings = get_job_title_embeddings(
    job_titles,
    tokenizer,
    model,
    batch_size=256,
    device=device,
)

# Store one embedding vector (one row of the array) per dataframe row
df_new['job_title_embedding'] = list(embeddings)

# Embed each job description; long texts are split into overlapping chunks
df_new['job_description_embedding'] = df_new['model_response'].apply(
    lambda description: get_job_description_embedding(
        description,
        tokenizer=tokenizer,
        model=model,
        max_chunk_length=512,  # Model's max token length
        overlap=50,            # Amount of overlap between chunks
        device=device,
    )
)

df_new['job_description_embedding'][0]

In [None]:
# Inspect the description embedding stored under index label 0.
df_new['job_description_embedding'][0]

In [None]:
# Drop the raw text columns (their embeddings are kept), parse 'lat_long' with
# ast.literal_eval, and export the result to CSV.
df_ready = df_new.drop(columns=['title', 'model_response'])
df_ready['lat_long'] = df_ready['lat_long'].apply(ast.literal_eval)

df_ready.to_csv('final_graph_model_training.csv', index=False)

# Start here if final_graph_model_training.csv exists

In [1]:
# Entry point when the training CSV already exists: load it from disk instead of
# recomputing the embeddings in the cells above.
# uncomment the correct line below depending on if you want to start from scratch or load the csv
# NOTE(review): the export cell writes 'final_graph_model_training.csv', while
# this cell reads '7.1.final_graph_model_training.csv' - confirm the file is
# renamed between the two steps, otherwise this read will not find it.
import pandas as pd

# df_ready.to_csv('final_graph_model_training.csv', index=False)
df_ready = pd.read_csv('7.1.final_graph_model_training.csv')

In [2]:
# Cell 2: Helper Functions

import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from geopy.distance import geodesic
from tqdm import tqdm
import multiprocessing as mp
from itertools import combinations
import time
import torch
import pickle
from annoy import AnnoyIndex
from scipy.spatial import cKDTree
import faiss
from sklearn.cluster import KMeans
import cupy as cp
import ast

def calculate_total_potential_edges(df_ready):
    """
    Count the edges a complete (fully connected) graph over the rows would have.

    Args:
        df_ready (pd.DataFrame): Input dataframe; only its length is used

    Returns:
        int: Number of unordered row pairs, n * (n - 1) / 2
    """
    row_count = len(df_ready)
    pair_count = row_count * (row_count - 1)
    return pair_count // 2

def save_graph_checkpoint(graph, filename):
    """
    Pickle the graph to disk and report where it was written.

    Args:
        graph (nx.Graph): Graph to serialise
        filename (str): Destination path of the checkpoint file
    """
    # Binary mode is required for pickle; the context manager closes the file
    # even if serialisation fails.
    with open(filename, 'wb') as checkpoint_file:
        pickle.dump(graph, checkpoint_file)

    print(f"Saved checkpoint: {filename}")

def load_graph_checkpoint(filename):
    """
    Load a graph checkpoint from a pickle file.

    Node attributes that are still stored as text are converted back to Python
    objects: embedding strings via parse_embedding and 'lat_long' strings via
    ast.literal_eval. Values that are already parsed (lists, tuples, ...) are
    left untouched, so checkpoints written after parsing can be loaded again.

    Args:
        filename (str): Path to the checkpoint file

    Returns:
        nx.Graph: Loaded graph
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load checkpoints produced by this notebook / from a trusted location.
    with open(filename, 'rb') as f:
        graph = pickle.load(f)

    for node in graph.nodes():
        attributes = graph.nodes[node]

        # Only text needs parsing; calling parse_embedding on a list would fail.
        for key in ('job_title_embedding', 'job_description_embedding'):
            if isinstance(attributes.get(key), str):
                attributes[key] = parse_embedding(attributes[key])

        # Same guard for coordinates: literal_eval only accepts strings/AST nodes.
        if isinstance(attributes.get('lat_long'), str):
            attributes['lat_long'] = ast.literal_eval(attributes['lat_long'])

    return graph
    
def parse_embedding(embedding_str):
    """
    Convert a stored embedding into a list of floats.

    Accepts the text form written to CSV (numbers inside square brackets,
    separated by whitespace, newlines and/or commas) as well as values that are
    already a list, tuple or numpy array, which makes the function safe to call
    more than once on the same data.

    Args:
        embedding_str: Embedding as text, or an already-parsed sequence

    Returns:
        list[float]: Parsed embedding, or np.nan when the value cannot be parsed
    """
    # Already parsed: normalise to a plain list of floats instead of failing on
    # the missing str methods.
    if isinstance(embedding_str, (list, tuple, np.ndarray)):
        try:
            return [float(val) for val in embedding_str]
        except (TypeError, ValueError):
            return np.nan

    # Anything else that is not text (e.g. NaN for a missing cell) is unparseable.
    if not isinstance(embedding_str, str):
        return np.nan

    # Treat newlines and commas as separators; split() below collapses any run
    # of whitespace, so repeated spaces need no special handling.
    cleaned_str = embedding_str.replace('\n', ' ').replace(',', ' ').strip()

    # Split the string into individual components and convert them to floats
    try:
        return [float(val) for val in cleaned_str.strip('[]').split()]
    except ValueError:
        # In case of any parsing errors, return np.nan
        return np.nan
    
def initialize_graph(df_ready):
    """
    Initialize graph and add one node per dataframe row with its attributes.

    Nodes are named "job_<index label>". The embedding columns are parsed into
    lists of floats for the node attributes, but the input dataframe itself is
    NOT modified: later steps parse the same columns again, and overwriting
    them here would make those steps (and any re-run) fail on already-parsed
    values.

    Args:
        df_ready (pd.DataFrame): Input dataframe with job data

    Returns:
        nx.Graph: Graph with nodes added (also saved to 'graph_with_nodes.pkl')
    """
    print("Initializing graph and adding nodes...")
    graph = nx.Graph()

    # Parse embeddings from string to list into local series (no in-place change)
    title_embeddings = df_ready['job_title_embedding'].apply(parse_embedding)
    description_embeddings = df_ready['job_description_embedding'].apply(parse_embedding)

    # zip keeps the parsed values aligned with the rows by position, which is
    # safe even when the index contains duplicate labels.
    rows = tqdm(df_ready.iterrows(), total=len(df_ready), desc="Adding nodes")
    for (idx, row), title_embedding, description_embedding in zip(
        rows, title_embeddings, description_embeddings
    ):
        graph.add_node(
            f"job_{idx}",
            job_title_embedding=title_embedding,
            job_description_embedding=description_embedding,
            company=row['company'],
            job_type_encoding=row['job_type_encoded'],
            is_remote=row['is_remote'],
            lat_long=row['lat_long']
        )

    save_graph_checkpoint(graph, 'graph_with_nodes.pkl')
    return graph

def create_company_edges(df_ready, graph):
    """
    Connect every pair of jobs that share the same 'company' value.

    Args:
        df_ready (pd.DataFrame): Input dataframe with job data
        graph (nx.Graph): Graph to add edges to

    Returns:
        nx.Graph: The same graph with "same_company" edges added (also saved to
        'graph_with_company_edges.pkl')
    """
    print("Creating edges between jobs from the same company...")
    company_groups = df_ready.groupby('company').groups

    # A company with m postings contributes m * (m - 1) / 2 pairs.
    total_company_edges = 0
    for members in company_groups.values():
        total_company_edges += len(members) * (len(members) - 1) // 2

    edge_count = 0
    pbar = tqdm(total=total_company_edges, desc="Company edges")

    for members in company_groups.values():
        # A single posting has nobody to pair with.
        if len(members) <= 1:
            continue
        for first, second in combinations(members, 2):
            graph.add_edge(f"job_{first}", f"job_{second}", type="same_company")
            edge_count += 1
            pbar.update(1)

    pbar.close()
    print(f"Added {edge_count} company edges")
    save_graph_checkpoint(graph, 'graph_with_company_edges.pkl')
    return graph

def create_job_type_edges(df_ready, graph, threshold=0.5, n_trees=10, k=10):
    """
    Create edges between jobs with similar job types using Jaccard similarity.

    Candidate pairs come from an Annoy index (Hamming metric) over the multi-hot
    job-type vectors; each candidate pair is then scored with Jaccard similarity
    and kept when the score exceeds ``threshold``. Nodes are addressed by row
    position ("job_<i>").

    Args:
        df_ready (pd.DataFrame): Input dataframe with job data
        graph (nx.Graph): Graph to add edges to
        threshold (float): Similarity a pair must exceed to get an edge
        n_trees (int): Number of trees for Annoy index
        k (int): Number of nearest neighbors requested per job

    Returns:
        nx.Graph: Graph with job type similarity edges added (also saved to
        'graph_with_job_type_edges.pkl')
    """
    print(f"Creating edges between jobs with similar job types (threshold={threshold})...")

    # The column holds text such as "[0, 1, 0, 0, 0]" after a CSV round trip.
    # ast.literal_eval parses literals only, unlike eval which would execute
    # arbitrary expressions found in the file. Already-decoded lists pass through.
    job_types = np.array([
        ast.literal_eval(encoded) if isinstance(encoded, str) else encoded
        for encoded in df_ready['job_type_encoded']
    ])
    n_jobs = len(job_types)
    edge_count = 0

    index = AnnoyIndex(job_types.shape[1], 'hamming')

    for i in tqdm(range(n_jobs), desc="Building index"):
        index.add_item(i, job_types[i])

    index.build(n_trees)

    for i in tqdm(range(n_jobs), desc="Finding similar jobs"):
        similar_indices = index.get_nns_by_item(i, k)

        for j in similar_indices:
            # Handle each unordered pair once, from its lower-numbered member.
            if j > i:
                intersection = np.sum(np.logical_and(job_types[i], job_types[j]))
                union = np.sum(np.logical_or(job_types[i], job_types[j]))
                similarity = intersection / union if union > 0 else 0

                if similarity > threshold:
                    graph.add_edge(f"job_{i}", f"job_{j}",
                                type="job_type_similarity",
                                weight=float(similarity))
                    edge_count += 1

    print(f"Added {edge_count} job type similarity edges")
    save_graph_checkpoint(graph, 'graph_with_job_type_edges.pkl')
    return graph

def create_location_edges(df_ready, graph, max_distance=3, min_weight=0.4, chunk_size=500, sigma=1.5):
    """
    Create edges between jobs within geographical proximity using Gaussian decay.
    Optimized for Singapore's scale.

    Rows whose 'lat_long' value cannot be read as a finite (lat, lon) pair are
    skipped. Candidate neighbours come from a KD-tree query in degree space
    (1 degree is treated as 111.32 km); exact distances are then computed with
    the haversine formula and turned into weights exp(-d^2 / (2 * sigma^2)).
    Nodes are addressed by row position ("job_<i>").

    Args:
        df_ready (pd.DataFrame): Input dataframe with job data
        graph (nx.Graph): Graph to add edges to
        max_distance (float): Maximum distance in km (3km default for Singapore's context)
        min_weight (float): Minimum weight threshold for creating edges
        chunk_size (int): Size of chunks for processing
        sigma (float): Standard deviation for Gaussian decay (1.5km default)

    Returns:
        nx.Graph: Graph with location proximity edges added (also saved to
        'graph_with_location_edges.pkl')
    """
    print(f"Creating edges between jobs within {max_distance}km of each other...")
    print(f"Using minimum weight threshold of {min_weight}")
    edge_count = 0

    # Progress is reported each time the edge count crosses a multiple of this.
    report_every = 5000
    next_report = report_every

    # Extract coordinates
    lat_longs = []
    valid_indices = []

    raw_values = df_ready['lat_long'].tolist()
    for idx, raw in enumerate(tqdm(raw_values, desc="Extracting coordinates")):
        try:
            # Text from the CSV is parsed as a literal (never executed);
            # already-parsed pairs are used as they are.
            lat_long = ast.literal_eval(raw) if isinstance(raw, str) else raw
            lat, lon = float(lat_long[0]), float(lat_long[1])
        except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
            # Missing or malformed coordinates: leave this job without location edges.
            continue
        if not (np.isfinite(lat) and np.isfinite(lon)):
            continue
        lat_longs.append((lat, lon))
        valid_indices.append(idx)

    print(f"Processing {len(valid_indices)} locations")

    # A KD-tree cannot be built from zero points; nothing to connect in that case.
    if not valid_indices:
        print(f"Added {edge_count} location proximity edges")
        save_graph_checkpoint(graph, 'graph_with_location_edges.pkl')
        return graph

    lat_longs = cp.array(lat_longs)
    n_points = len(lat_longs)

    # Build KD-tree for efficient spatial querying
    tree = cKDTree(cp.asnumpy(lat_longs))

    # Process in chunks
    for i in tqdm(list(range(0, n_points, chunk_size)), desc="Processing location proximity"):
        chunk_end = min(i + chunk_size, n_points)
        chunk_points = lat_longs[i:chunk_end]

        # Find nearby points within max_distance (radius converted from km to degrees)
        chunk_points_cpu = cp.asnumpy(chunk_points)
        nearby_points = tree.query_ball_point(chunk_points_cpu, max_distance/111.32)

        for j, neighbors in enumerate(nearby_points):
            if not neighbors:
                continue

            point1 = chunk_points[j]
            points2 = lat_longs[neighbors]

            # Calculate distances using haversine formula
            lat1, lon1 = point1[0], point1[1]
            lat2, lon2 = points2[:, 0], points2[:, 1]

            dlat = cp.radians(lat2 - lat1)
            dlon = cp.radians(lon2 - lon1)
            lat1, lat2 = cp.radians(lat1), cp.radians(lat2)

            a = cp.sin(dlat/2)**2 + cp.cos(lat1) * cp.cos(lat2) * cp.sin(dlon/2)**2
            distances = 2 * 6371 * cp.arcsin(cp.sqrt(a))  # Earth radius in km

            # Calculate weights with Gaussian decay
            weights = cp.exp(-(distances**2)/(2*sigma**2))

            # Apply distance and weight thresholds (failing pairs get weight 0)
            weights = cp.where((distances <= max_distance) & (weights >= min_weight), weights, 0)

            weights_cpu = cp.asnumpy(weights)
            # i + j is the position among valid points; map back to the row position.
            actual_idx1 = valid_indices[i+j]

            # Add edges only for significant weights and avoid duplicates
            for k, neighbor_idx in enumerate(neighbors):
                if (neighbor_idx > i+j and  # Only process upper triangle
                    weights_cpu[k] > min_weight):
                    actual_idx2 = valid_indices[neighbor_idx]

                    # Add edge with weight
                    graph.add_edge(f"job_{actual_idx1}", f"job_{actual_idx2}",
                                 type="location_proximity",
                                 weight=float(weights_cpu[k]))
                    edge_count += 1

            # Report once per crossed multiple of report_every. A plain
            # "edge_count % 5000 == 0" test would also fire for every point
            # while the count is still 0 or has not moved.
            if edge_count >= next_report:
                print(f"Created {edge_count} edges so far...")
                next_report = (edge_count // report_every + 1) * report_every

    print(f"Added {edge_count} location proximity edges")
    save_graph_checkpoint(graph, 'graph_with_location_edges.pkl')
    return graph

def create_embedding_edges(df_ready, graph, embedding_type, threshold=0.7, k=10, n_clusters=500):
    """
    Create edges between jobs with similar embeddings using a FAISS IVF index.

    Embeddings are L2-normalised so that inner product equals cosine similarity.
    For every job the k nearest neighbours are retrieved; a pair gets an edge when
    the neighbour has a higher row position than the query (each unordered pair is
    handled once) and the similarity is at least ``threshold``. Nodes are addressed
    by row position ("job_<i>").

    Args:
        df_ready (pd.DataFrame): Input dataframe with job data
        graph (nx.Graph): Graph to add edges to
        embedding_type (str): Either 'job_title_embedding' or 'job_description_embedding'
        threshold (float): Minimum similarity threshold for creating edges
        k (int): Number of nearest neighbors to search for
        n_clusters (int): Upper bound on the number of IVF cells

    Returns:
        nx.Graph: Graph with embedding similarity edges added (also saved to a
        per-type checkpoint file)
    """
    print(f"Creating edges between jobs with similar {embedding_type} (threshold={threshold})...")
    edge_count = 0

    # torch may see a GPU while the installed faiss build is CPU-only; only take
    # the GPU path when faiss actually exposes its GPU API.
    use_gpu = torch.cuda.is_available() and hasattr(faiss, 'StandardGpuResources')

    # Get embeddings and normalize once. The column holds text after a CSV round
    # trip but may already contain parsed lists, so only parse strings.
    raw_vectors = [
        parse_embedding(value) if isinstance(value, str) else value
        for value in df_ready[embedding_type]
    ]
    embeddings = np.array(raw_vectors).astype('float32')
    faiss.normalize_L2(embeddings)
    n_jobs = len(df_ready)

    # Create optimized IVF index
    d = embeddings.shape[1]  # Embedding dimension
    # Number of Voronoi cells; keep at least one so tiny inputs still get an index.
    nlist = max(1, min(n_clusters, int(np.sqrt(n_jobs))))
    quantizer = faiss.IndexFlatIP(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)

    if use_gpu:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)

    # Train and add vectors
    index.train(embeddings)
    index.add(embeddings)

    # Process in batches
    edge_type = "title_similarity" if embedding_type == "job_title_embedding" else "description_similarity"
    batch_size = 1000

    for start_idx in tqdm(range(0, n_jobs, batch_size), desc="Finding similar jobs"):
        end_idx = min(start_idx + batch_size, n_jobs)
        batch_embeddings = embeddings[start_idx:end_idx]

        # Batch search
        similarities, indices = index.search(batch_embeddings, k)

        # Process results
        for i, (sims, nbrs) in enumerate(zip(similarities, indices)):
            global_idx = start_idx + i
            for sim, nbr in zip(sims, nbrs):
                # Skip self/lower-indexed neighbours, weak matches and the -1
                # placeholder FAISS returns when fewer than k results exist.
                if nbr <= global_idx or sim < threshold or nbr == -1:
                    continue
                graph.add_edge(f"job_{global_idx}", f"job_{nbr}",
                             type=edge_type, weight=float(sim))
                edge_count += 1

    print(f"Added {edge_count} {edge_type} edges")
    checkpoint_name = 'graph_with_title_edges.pkl' if edge_type == "title_similarity" else 'graph_with_description_edges.pkl'
    save_graph_checkpoint(graph, checkpoint_name)
    return graph

def build_complete_graph(df_ready, resume_from=None):
    """
    Run the full graph-building pipeline, optionally resuming from a checkpoint.

    Without ``resume_from`` the nodes are created from the dataframe and every
    edge-building step runs. With ``resume_from`` the matching checkpoint is
    loaded and only the steps after it run.

    Args:
        df_ready (pd.DataFrame): Input dataframe with job data
        resume_from (str): Checkpoint to resume from ('nodes', 'company', 'job_type', 'location', 'title', 'description')

    Returns:
        nx.Graph: Complete graph with all edges
    """
    start_time = time.time()

    checkpoint_files = {
        'nodes': 'graph_with_nodes.pkl',
        'company': 'graph_with_company_edges.pkl',
        'job_type': 'graph_with_job_type_edges.pkl',
        'location': 'graph_with_location_edges.pkl',
        'title': 'graph_with_title_edges.pkl',
        'description': 'graph_with_description_edges.pkl'
    }

    if resume_from is not None:
        graph = load_graph_checkpoint(checkpoint_files[resume_from])
    else:
        graph = initialize_graph(df_ready)

    # Edge-building stages in execution order, each taking and returning the graph.
    builders = {
        'company': lambda g: create_company_edges(df_ready, g),
        'job_type': lambda g: create_job_type_edges(df_ready, g),
        'location': lambda g: create_location_edges(
            df_ready,
            g,
            max_distance=2,      # Only connect very close jobs
            min_weight=0.5,      # Only strong connections
            sigma=1,             # Sharp distance decay
            chunk_size=500
        ),
        'title': lambda g: create_embedding_edges(df_ready, g, 'job_title_embedding', threshold=0.7, k=10),
        'description': lambda g: create_embedding_edges(df_ready, g, 'job_description_embedding', threshold=0.7, k=10),
    }
    steps = ['company', 'job_type', 'location', 'title', 'description']

    # Resuming from a stage means that stage is already done; 'nodes' and None
    # are not edge stages, so everything runs.
    if resume_from in steps:
        start_idx = steps.index(resume_from) + 1
    else:
        start_idx = 0

    for step in steps[start_idx:]:
        step_start = time.time()
        graph = builders[step](graph)
        step_end = time.time()
        print(f"{step} step took {(step_end - step_start)/60:.2f} minutes")

    elapsed = time.time() - start_time
    print(f"\nTotal graph construction time: {elapsed/60:.2f} minutes")
    print("\nGraph construction complete!")
    print(f"Nodes: {graph.number_of_nodes()}, Edges: {graph.number_of_edges()}")

    save_graph_checkpoint(graph, 'final_complete_graph.pkl')
    print("Final graph saved to 'final_complete_graph.pkl'")

    return graph

In [3]:
# 'description' is the last build step, so this call loads
# 'graph_with_description_edges.pkl', runs no further edge-building steps, and
# saves the result as 'final_complete_graph.pkl'.
graph = build_complete_graph(df_ready, resume_from='description')



Total graph construction time: 7.08 minutes

Graph construction complete!
Nodes: 25142, Edges: 79444658
Saved checkpoint: final_complete_graph.pkl
Final graph saved to 'final_complete_graph.pkl'


In [4]:
# Sanity check: graph size plus the first node (with its full attribute dict,
# including both embeddings) and the first edge.
print(f"Nodes: {len(graph.nodes)}")
print(f"Edges: {len(graph.edges)}")
print("\nSample node data:", next(iter(graph.nodes(data=True))))
print("\nSample edge:", next(iter(graph.edges())))

Nodes: 25142
Edges: 79444658

Sample node data: ('job_0', {'job_title_embedding': [-0.0323692635, 0.037560612, -0.0851607397, -0.0453846455, -0.0546593554, -0.0279194303, 0.0278429296, 0.0215941686, 0.0325031728, 0.000887234812, 0.036646761, -0.0450273678, -0.0109668206, -0.0368140303, -0.0451355092, 0.0584830716, 0.114421636, 0.0379366949, -0.0109098526, -0.0450774394, -0.0636187941, -0.0283169001, -0.0749196559, 0.0460793413, -0.00873038266, 0.0490450747, -0.0230729543, 0.083348833, 0.0108598638, -0.0804005265, -0.0902405754, 0.0268752687, 0.0291939843, 0.0203662012, -0.0047371788, -0.0222695544, -0.0255234633, 0.0452349409, 0.028607415, -0.00115934608, -0.0206206571, -0.00999114104, -0.0118169105, -0.0274297539, -0.081966795, 0.0121426117, -0.0515504405, -0.0148908487, 0.0312867016, 0.0425315909, 0.0544571318, 0.00782229751, 0.011188155, 0.0547044016, -0.0220145993, 0.0660661459, -0.000419043761, 0.0151083982, -0.0381165333, -0.0169295501, -0.0283929724, 0.0029114394, -0.0551629476,

In [5]:
# Drop the in-memory graph and read the final checkpoint back from disk to
# confirm that it round-trips with the same node and edge counts.
del graph

print("Loading graph from pickle file...")
# NOTE: pickle.load can execute code embedded in the file; only load checkpoints
# that this notebook wrote.
with open('final_complete_graph.pkl', 'rb') as f:
    new_graph = pickle.load(f)

print(f"Nodes: {len(new_graph.nodes)}")
print(f"Edges: {len(new_graph.edges)}")
print("\nSample node data:", next(iter(new_graph.nodes(data=True))))
print("\nSample edge:", next(iter(new_graph.edges())))

Loading graph from pickle file...
Nodes: 25142
Edges: 79444658

Sample node data: ('job_0', {'job_title_embedding': [-0.0323692635, 0.037560612, -0.0851607397, -0.0453846455, -0.0546593554, -0.0279194303, 0.0278429296, 0.0215941686, 0.0325031728, 0.000887234812, 0.036646761, -0.0450273678, -0.0109668206, -0.0368140303, -0.0451355092, 0.0584830716, 0.114421636, 0.0379366949, -0.0109098526, -0.0450774394, -0.0636187941, -0.0283169001, -0.0749196559, 0.0460793413, -0.00873038266, 0.0490450747, -0.0230729543, 0.083348833, 0.0108598638, -0.0804005265, -0.0902405754, 0.0268752687, 0.0291939843, 0.0203662012, -0.0047371788, -0.0222695544, -0.0255234633, 0.0452349409, 0.028607415, -0.00115934608, -0.0206206571, -0.00999114104, -0.0118169105, -0.0274297539, -0.081966795, 0.0121426117, -0.0515504405, -0.0148908487, 0.0312867016, 0.0425315909, 0.0544571318, 0.00782229751, 0.011188155, 0.0547044016, -0.0220145993, 0.0660661459, -0.000419043761, 0.0151083982, -0.0381165333, -0.0169295501, -0.028392