In [2]:
import pandas as pd
import numpy as np
import openai
import json
import os
from tqdm import tqdm
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from openai.types import CreateEmbeddingResponse, Embedding
from collections import Counter
from dotenv import load_dotenv
from openai import OpenAI
from functools import lru_cache

load_dotenv()

# Directory containing the JSON files
directory = 'data/extracted/gemini'

# List to hold all job entries
all_jobs = []

# Loop through the files and read each JSON file
for filename in os.listdir(directory):
    if filename.endswith('.json'):
        with open(os.path.join(directory, filename), 'r') as file:
            jobs = json.load(file)
            all_jobs.extend(jobs)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(all_jobs)
jobs_df = df.copy()

In [2]:
import pandas as pd
import numpy as np
import openai
from tqdm import tqdm
import os
from openai import OpenAI
import tiktoken

# Initialize the OpenAI client with the API key directly
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Initialize the tokenizer for the specified model
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# Function to ensure the input does not exceed the max token limit
def truncate_texts(texts, max_tokens=8191):
    tokens = []
    for text in texts:
        if not isinstance(text, str):
            continue  # Skip non-string inputs
        tokens += encoding.encode(text) + [encoding.encode(" ")[0]]  # Add a space token between texts
        if len(tokens) > max_tokens:
            tokens = tokens[:max_tokens]
            break
    return encoding.decode(tokens)

# Function to generate embeddings for a list of texts using OpenAI API (batch processing)
def embed_texts(texts):
    truncated_text = truncate_texts(texts)
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=[truncated_text]
    )
    return [item.embedding for item in response.data]

# Function to preprocess and embed each field
def preprocess_and_embed(skills_list):
    if len(skills_list) == 0:
        return np.zeros((1536,))  # Adjust the dimension according to the model's output
    embeddings = embed_texts(skills_list)
    return np.mean(embeddings, axis=0)  # Average the embeddings for the list

# Generate and save embeddings to CSV
def generate_and_save_embeddings(df, column_name, file_name):
    embeddings = []
    skills_lists = df[column_name].tolist()
    for idx, skills_list in tqdm(enumerate(skills_lists), total=len(skills_lists)):
        try:
            if isinstance(skills_list, list):
                embedding = preprocess_and_embed(skills_list)
            else:
                embedding = embed_texts([skills_list])[0]
        except Exception as e:
            print(f"Error processing index {idx}: {e}")
            embedding = np.zeros((1536,))  # Use a zero vector in case of error
        embeddings.append(embedding)
    embeddings_df = pd.DataFrame(embeddings)
    embeddings_df.to_csv(file_name, index=False)
    return embeddings_df

# Assuming jobs_df is already defined and populated with job data
# hard_skills_embeddings_df = generate_and_save_embeddings(jobs_df, 'hard_skills', 'data/embeddings/hard_skills_embeddings.csv')
tech_stack_embeddings_df = generate_and_save_embeddings(jobs_df, 'tech_stack', 'data/embeddings/tech_stack_embeddings.csv')
soft_skills_embeddings_df = generate_and_save_embeddings(jobs_df, 'soft_skills', 'data/embeddings/soft_skills_embeddings.csv')
industries_embeddings_df = generate_and_save_embeddings(jobs_df, 'industries', 'data/embeddings/industries_embeddings.csv')


  5%|▍         | 970/20732 [03:19<58:14,  5.66it/s]  

Error processing index 968: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


  5%|▌         | 1095/20732 [03:45<58:39,  5.58it/s]  

Error processing index 1093: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


  6%|▌         | 1230/20732 [04:14<1:03:14,  5.14it/s]

Error processing index 1228: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


  7%|▋         | 1511/20732 [05:14<49:44,  6.44it/s]  

Error processing index 1509: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 19%|█▉        | 3942/20732 [13:39<51:03,  5.48it/s]  

Error processing index 3940: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 20%|█▉        | 4086/20732 [14:10<57:31,  4.82it/s]  

Error processing index 4084: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 20%|██        | 4166/20732 [14:27<49:57,  5.53it/s]  

Error processing index 4164: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 20%|██        | 4210/20732 [14:38<46:31,  5.92it/s]  

Error processing index 4209: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 20%|██        | 4228/20732 [14:42<46:38,  5.90it/s]  

Error processing index 4226: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 23%|██▎       | 4691/20732 [16:18<43:17,  6.17it/s]  

Error processing index 4689: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 23%|██▎       | 4699/20732 [16:20<45:06,  5.92it/s]  

Error processing index 4697: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 25%|██▌       | 5195/20732 [18:03<53:00,  4.88it/s]  

Error processing index 5193: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 28%|██▊       | 5821/20732 [20:15<43:13,  5.75it/s]  

Error processing index 5819: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 28%|██▊       | 5884/20732 [20:27<39:00,  6.34it/s]  

Error processing index 5882: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 32%|███▏      | 6550/20732 [22:43<44:34,  5.30it/s]  

Error processing index 6548: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 45%|████▌     | 9344/20732 [32:24<31:51,  5.96it/s]  

Error processing index 9342: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 49%|████▉     | 10178/20732 [35:24<32:09,  5.47it/s]  

Error processing index 10177: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 52%|█████▏    | 10801/20732 [37:42<39:14,  4.22it/s]  

Error processing index 10800: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 58%|█████▊    | 12005/20732 [42:21<25:48,  5.64it/s]  

Error processing index 12003: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 58%|█████▊    | 12086/20732 [42:39<27:34,  5.22it/s]

Error processing index 12084: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 59%|█████▊    | 12152/20732 [42:53<32:42,  4.37it/s]  

Error processing index 12150: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 59%|█████▉    | 12316/20732 [43:29<24:46,  5.66it/s]  

Error processing index 12315: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 59%|█████▉    | 12319/20732 [43:30<23:55,  5.86it/s]

Error processing index 12318: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 59%|█████▉    | 12334/20732 [43:33<22:29,  6.22it/s]

Error processing index 12332: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 60%|█████▉    | 12343/20732 [43:34<25:30,  5.48it/s]

Error processing index 12341: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 62%|██████▏   | 12913/20732 [45:34<23:03,  5.65it/s]  

Error processing index 12911: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 74%|███████▍  | 15401/20732 [54:39<17:56,  4.95it/s]  

Error processing index 15399: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 74%|███████▍  | 15412/20732 [54:42<20:24,  4.35it/s]

Error processing index 15410: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 75%|███████▍  | 15522/20732 [55:06<15:11,  5.71it/s]

Error processing index 15520: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 76%|███████▌  | 15736/20732 [55:51<13:40,  6.09it/s]

Error processing index 15734: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 80%|████████  | 16603/20732 [58:59<11:52,  5.80it/s]  

Error processing index 16601: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 82%|████████▏ | 16991/20732 [1:00:20<11:07,  5.61it/s]  

Error processing index 16989: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 86%|████████▌ | 17754/20732 [1:03:08<11:30,  4.31it/s]

Error processing index 17753: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 88%|████████▊ | 18341/20732 [1:05:17<06:53,  5.78it/s]

Error processing index 18339: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


 99%|█████████▊| 20448/20732 [1:12:44<00:49,  5.70it/s]

Error processing index 20446: Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


100%|██████████| 20732/20732 [1:13:39<00:00,  4.69it/s]
  0%|          | 27/20732 [00:04<59:48,  5.77it/s]  


KeyboardInterrupt: 

In [21]:
import tiktoken

# Initialize the tokenizer
tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")

# Function to estimate token count for a list of strings
def estimate_token_count(texts):
    total_tokens = 0
    for text in texts:
        tokens = tokenizer.encode(text)
        total_tokens += len(tokens)
    return total_tokens

# Preprocess each field
def preprocess_field(skills_list):
    return ' '.join(skills_list) if isinstance(skills_list, list) else ''

jobs_df['hard_skills_processed'] = jobs_df['hard_skills'].apply(preprocess_field)
jobs_df['tech_stack_processed'] = jobs_df['tech_stack'].apply(preprocess_field)
jobs_df['soft_skills_processed'] = jobs_df['soft_skills'].apply(preprocess_field)
jobs_df['industries_processed'] = jobs_df['industries'].apply(preprocess_field)

# Combine all preprocessed fields into one list for token estimation
all_texts = jobs_df['hard_skills_processed'].tolist() + jobs_df['tech_stack_processed'].tolist() + jobs_df['soft_skills_processed'].tolist() + jobs_df['industries_processed'].tolist()

# Estimate the total token count
total_tokens = estimate_token_count(all_texts)
print(f"Total token count: {total_tokens}")

# Predict the cost based on the token count
# Example pricing: $0.0004 per 1,000 tokens
cost = 0.1
total_cost = (total_tokens / 1_000_000) * cost
print(f"Estimated cost: ${total_cost:.4f}")


Total token count: 1456397
Estimated cost: $0.1456


In [8]:

# Load embeddings from CSV
def load_embeddings(file_name):
    return pd.read_csv(file_name).values

hard_skills_embeddings = load_embeddings('data/embeddings/hard_skills_embeddings.csv')
tech_stack_embeddings = load_embeddings('data/embeddings/tech_stack_embeddings.csv')
soft_skills_embeddings = load_embeddings('data/embeddings/soft_skills_embeddings.csv')
industries_embeddings = load_embeddings('data/embeddings/industries_embeddings.csv')

# Function to cluster embeddings
def cluster_embeddings(embeddings, num_clusters=10):
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    clusters = kmeans.fit_predict(embeddings)
    return clusters, kmeans

# Cluster each category
num_clusters = 10
hard_skills_clusters, hard_skills_kmeans = cluster_embeddings(hard_skills_embeddings, 3)
tech_stack_clusters, tech_stack_kmeans = cluster_embeddings(tech_stack_embeddings, 2)
soft_skills_clusters, soft_skills_kmeans = cluster_embeddings(soft_skills_embeddings, 3)
industries_clusters, industries_kmeans = cluster_embeddings(industries_embeddings, 2)

# Assign clusters to the DataFrame
jobs_df['hard_skills_cluster'] = hard_skills_clusters
jobs_df['tech_stack_cluster'] = tech_stack_clusters
jobs_df['soft_skills_cluster'] = soft_skills_clusters
jobs_df['industries_cluster'] = industries_clusters












In [9]:

# Function to visualize clusters
def visualize_clusters(df, cluster_column, title):
    cluster_counts = df[cluster_column].value_counts().reset_index()
    cluster_counts.columns = ['Cluster', 'Count']

    fig = px.bar(
        cluster_counts,
        x='Cluster',
        y='Count',
        color='Count',
        title=title,
        color_continuous_scale="Emrld",
    )
    fig.update_yaxes(title="Count")
    fig.update_layout(
        showlegend=False,
        template="plotly_white",
        xaxis=go.layout.XAxis(tickangle=45)
    )
    fig.show()

# Visualize each category
visualize_clusters(jobs_df, 'hard_skills_cluster', 'Hard Skills Clusters')
visualize_clusters(jobs_df, 'tech_stack_cluster', 'Tech Stack Clusters')
visualize_clusters(jobs_df, 'soft_skills_cluster', 'Soft Skills Clusters')
visualize_clusters(jobs_df, 'industries_cluster', 'Industries Clusters')


In [10]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter

# Function to get top items in each cluster
def get_top_items_in_clusters(df, cluster_column, items_column, top_n=20):
    cluster_top_items = {}
    for cluster in df[cluster_column].unique():
        cluster_items = df[df[cluster_column] == cluster][items_column]
        # Convert lists to strings if necessary
        all_items = []
        for item_list in cluster_items:
            if isinstance(item_list, list):
                all_items.extend(item_list)
            elif isinstance(item_list, str):
                all_items.append(item_list)
        most_common_items = Counter(all_items).most_common(top_n)
        cluster_top_items[cluster] = most_common_items
    return cluster_top_items

# Analyze and visualize top items
hard_skills_top_items = get_top_items_in_clusters(jobs_df, 'hard_skills_cluster', 'hard_skills')
tech_stack_top_items = get_top_items_in_clusters(jobs_df, 'tech_stack_cluster', 'tech_stack')
soft_skills_top_items = get_top_items_in_clusters(jobs_df, 'soft_skills_cluster', 'soft_skills')
industries_top_items = get_top_items_in_clusters(jobs_df, 'industries_cluster', 'industries')

def visualize_top_items(cluster_top_items, title):
    for cluster, items in cluster_top_items.items():
        items_df = pd.DataFrame(items, columns=['Item', 'Frequency'])
        fig = px.bar(
            items_df,
            x='Item',
            y='Frequency',
            color='Frequency',
            title=f'{title} in Cluster {cluster}',
            color_continuous_scale="Emrld",
        )
        fig.update_yaxes(title="Frequency")
        fig.update_layout(
            showlegend=False,
            template="plotly_white",
            xaxis=go.layout.XAxis(tickangle=45)
        )
        fig.show()

# Example usage
visualize_top_items(hard_skills_top_items, 'Top Hard Skills')
visualize_top_items(tech_stack_top_items, 'Top Tech Stack')
visualize_top_items(soft_skills_top_items, 'Top Soft Skills')
visualize_top_items(industries_top_items, 'Top Industries')


In [11]:
hard_skills_top_items

{0: [('Python', 6163),
  ('SQL', 3890),
  ('Machine Learning', 3155),
  ('AWS', 1813),
  ('R', 1609),
  ('Data Analysis', 1588),
  ('Java', 1389),
  ('Data Visualization', 1365),
  ('PyTorch', 1329),
  ('Spark', 1329),
  ('Tableau', 1174),
  ('TensorFlow', 1143),
  ('Data Modeling', 1123),
  ('Deep Learning', 1112),
  ('ETL', 1002),
  ('Azure', 898),
  ('Snowflake', 777),
  ('Hadoop', 752),
  ('Scala', 722),
  ('Kubernetes', 717)],
 2: [('Python', 2097),
  ('Java', 1907),
  ('JavaScript', 1572),
  ('AWS', 1395),
  ('React', 1220),
  ('SQL', 1117),
  ('Git', 1012),
  ('Kubernetes', 1009),
  ('Docker', 926),
  ('CSS', 782),
  ('HTML', 762),
  ('C#', 695),
  ('CI/CD', 691),
  ('Linux', 667),
  ('C++', 657),
  ('Angular', 630),
  ('Azure', 618),
  ('Jenkins', 583),
  ('TypeScript', 491),
  ('Node.js', 485)],
 1: [('Data Analysis', 2501),
  ('SQL', 1837),
  ('Excel', 1248),
  ('Project Management', 1164),
  ('Reporting', 906),
  ('Data Visualization', 764),
  ('Requirements Gathering', 725)

In [12]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
from sklearn.cluster import KMeans

# Function to reduce dimensions using PCA or t-SNE
def reduce_dimensions(embeddings, method='pca', n_components=2):
    if method == 'pca':
        reducer = PCA(n_components=n_components)
    elif method == 'tsne':
        reducer = TSNE(n_components=n_components, random_state=42)
    else:
        raise ValueError("Method must be 'pca' or 'tsne'")
    reduced_embeddings = reducer.fit_transform(embeddings)
    return reduced_embeddings

# Function to visualize clusters
def visualize_clusters(embeddings, clusters, method='pca'):
    reduced_embeddings = reduce_dimensions(embeddings, method)
    
    reduced_df = pd.DataFrame(reduced_embeddings, columns=['Component 1', 'Component 2'])
    reduced_df['Cluster'] = clusters

    fig = px.scatter(
        reduced_df,
        x='Component 1',
        y='Component 2',
        color='Cluster',
        title=f'Clusters Visualized with {method.upper()}',
        labels={'Component 1': 'Component 1', 'Component 2': 'Component 2'}
    )
    fig.show()

# Visualize clusters for hard skills using PCA
visualize_clusters(hard_skills_embeddings, hard_skills_clusters, method='pca')
visualize_clusters(tech_stack_embeddings, tech_stack_clusters, method='pca')
visualize_clusters(soft_skills_embeddings, soft_skills_clusters, method='pca')
visualize_clusters(industries_embeddings, industries_clusters, method='pca')

# Visualize clusters for hard skills using t-SNE (if needed)
# visualize_clusters(hard_skills_embeddings, hard_skills_clusters, method='tsne')


In [14]:
from sklearn.metrics import silhouette_score

# Calculate silhouette score
silhouette_hard_skills = silhouette_score(hard_skills_embeddings, hard_skills_clusters)
silhouette_tech_stack = silhouette_score(tech_stack_embeddings, tech_stack_clusters)
silhouette_soft_skills = silhouette_score(soft_skills_embeddings, soft_skills_clusters)
silhouette_industries = silhouette_score(industries_embeddings, industries_clusters)

print(f"Silhouette Score for Hard Skills: {silhouette_hard_skills}")
print(f"Silhouette Score for Tech Stack: {silhouette_tech_stack}")
print(f"Silhouette Score for Soft Skills: {silhouette_soft_skills}")
print(f"Silhouette Score for Industries: {silhouette_industries}")


Silhouette Score for Hard Skills: 0.07248988244810543
Silhouette Score for Tech Stack: 0.06532237005399552
Silhouette Score for Soft Skills: 0.062144007369467506
Silhouette Score for Industries: 0.05756971890679639


In [46]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px

# Function to reduce dimensions using PCA or t-SNE
def reduce_dimensions(embeddings, method='pca', n_components=2):
    if method == 'pca':
        reducer = PCA(n_components=n_components)
    elif method == 'tsne':
        reducer = TSNE(n_components=n_components, random_state=42)
    else:
        raise ValueError("Method must be 'pca' or 'tsne'")
    reduced_embeddings = reducer.fit_transform(embeddings)
    return reduced_embeddings

# Function to visualize clusters
def visualize_clusters(embeddings, clusters, method='pca'):
    reduced_embeddings = reduce_dimensions(embeddings, method)
    
    reduced_df = pd.DataFrame(reduced_embeddings, columns=['Component 1', 'Component 2'])
    reduced_df['Cluster'] = clusters

    fig = px.scatter(
        reduced_df,
        x='Component 1',
        y='Component 2',
        color='Cluster',
        title=f'Clusters Visualized with {method.upper()}',
        labels={'Component 1': 'Component 1', 'Component 2': 'Component 2'}
    )
    fig.show()

# Visualize clusters for hard skills using PCA
# visualize_clusters(hard_skills_embeddings, hard_skills_clusters, method='pca')

# Visualize clusters for hard skills using t-SNE (if needed)
# visualize_clusters(hard_skills_embeddings, hard_skills_clusters, method='tsne')
