In [7]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score

## Sample the Embeddings
Because the data comes from over 45,000 jobs, there are over 690,000 rows. The embeddings for the `hard_skills` column alone take up 21.1 GB, which is too much to process here, so we sample 10,000 rows from BigQuery. The sampled dataset is in `data/hard_skills_embeddings_sample.csv`, which is a manageable 300 MB. This file is generated by running `python big_query_embedding.py`.

In [8]:
column = 'hard_skills'
# Read the sampled embeddings for the selected column from the local CSV
skills_df = pd.read_csv(f'data/{column}_embeddings_sample.csv')
# Each 'embedding' cell is parsed as comma-separated numbers and replaced by a float NumPy vector
skills_df['embedding'] = skills_df['embedding'].apply(
    lambda text: np.array([float(value) for value in text.split(',')])
)

In [9]:
# Cluster the embeddings with K-Means and score the resulting partition
def perform_clustering(embeddings, n_clusters):
    """Run K-Means on the embeddings and compute the silhouette score.

    Args:
        embeddings: Feature matrix handed to KMeans.fit_predict and
            silhouette_score.
        n_clusters: Number of clusters to fit.

    Returns:
        Tuple of (cluster label per row, silhouette score of that labelling).
    """
    # Fixed seed so repeated calls with the same inputs use the same initialisation
    model = KMeans(n_clusters=n_clusters, random_state=42)
    labels = model.fit_predict(embeddings)
    return labels, silhouette_score(embeddings, labels)

# Sweep candidate cluster counts and keep the one with the highest silhouette score
def find_optimal_clusters(embeddings, max_clusters=20):
    """Try every cluster count from 2 to max_clusters and pick the best one.

    Args:
        embeddings: Feature matrix forwarded to perform_clustering.
        max_clusters: Largest cluster count to try (inclusive).

    Returns:
        Tuple of (cluster count with the highest silhouette score, list of
        scores for 2..max_clusters in order). On ties the smallest count wins.
    """
    candidate_counts = range(2, max_clusters + 1)
    silhouette_scores = []
    for n_clusters in candidate_counts:
        # Only the score is needed during the sweep; the labels are discarded
        _, silhouette_avg = perform_clustering(embeddings, n_clusters)
        print(f"For n_clusters = {n_clusters}, the average silhouette score is : {silhouette_avg}")
        silhouette_scores.append(silhouette_avg)

    # Position i in the score list corresponds to candidate_counts[i], i.e. i + 2 clusters
    best_position = silhouette_scores.index(max(silhouette_scores))
    return candidate_counts[best_position], silhouette_scores

# Extract embeddings
# Collect the per-row 'embedding' values into one NumPy array for scikit-learn
# (assumes every vector has the same length, giving a 2-D array - TODO confirm).
embeddings = np.array(skills_df['embedding'].tolist())

# Find optimal number of clusters
# Uses the default sweep of 2..20 clusters and prints one score per candidate.
optimal_clusters, silhouette_scores = find_optimal_clusters(embeddings)
print(f"The optimal number of clusters is: {optimal_clusters}")

# Perform final clustering with optimal number of clusters
# The sweep only returns scores, so K-Means is fitted again here to obtain the labels.
final_clusters, final_silhouette_avg = perform_clustering(embeddings, optimal_clusters)
# Store the label of each row next to its skill for the later plots
skills_df['cluster'] = final_clusters

# Visualize silhouette scores
# The x axis starts at 2 because the sweep starts at two clusters.
fig = px.line(x=range(2, len(silhouette_scores) + 2), y=silhouette_scores, 
              labels={'x': 'Number of clusters', 'y': 'Silhouette Score'},
              title=f'Silhouette Score vs Number of Clusters for {column}')
fig.show()

# Function to reduce dimensions using PCA or t-SNE
def reduce_dimensions(embeddings, method='pca', n_components=2, random_state=42):
    """Project the embeddings down to n_components dimensions.

    Args:
        embeddings: Feature matrix passed to the reducer's fit_transform.
        method: 'pca' or 'tsne'.
        n_components: Number of output dimensions.
        random_state: Seed passed to the reducer. PCA was previously left
            unseeded while t-SNE used 42; seeding both keeps the projection
            reproducible across runs when scikit-learn picks a randomized
            SVD solver for PCA.

    Returns:
        The array returned by the reducer's fit_transform.

    Raises:
        ValueError: If method is neither 'pca' nor 'tsne'.
    """
    # Validate first so an unsupported method fails before any reducer is built
    if method not in ('pca', 'tsne'):
        raise ValueError("Method must be 'pca' or 'tsne'")

    if method == 'pca':
        reducer = PCA(n_components=n_components, random_state=random_state)
    else:
        reducer = TSNE(n_components=n_components, random_state=random_state)
    return reducer.fit_transform(embeddings)

# Function to visualize clusters
def visualize_clusters(df, method='pca', silhouette_avg=None):
    """Project the embeddings to 2-D and show a scatter plot coloured by cluster.

    Args:
        df: DataFrame with an 'embedding' column (one vector per row), a
            'cluster' column and a 'skill' column.
        method: 'pca' or 'tsne'; forwarded to reduce_dimensions.
        silhouette_avg: Score shown in the plot title. When omitted, the
            notebook-level `final_silhouette_avg` is used, as before.
    """
    if silhouette_avg is None:
        # Backward-compatible fallback to the value computed by the clustering cell
        silhouette_avg = final_silhouette_avg

    embeddings = np.array(df['embedding'].tolist())
    reduced_embeddings = reduce_dimensions(embeddings, method)

    reduced_df = pd.DataFrame(reduced_embeddings, columns=['Component 1', 'Component 2'])
    # Assign by position, not by index label: reduced_df has a fresh 0..n-1
    # index, so assigning the Series directly would align on df's index and
    # produce NaN or mismatched rows whenever df was filtered, sampled or re-indexed.
    reduced_df['Cluster'] = df['cluster'].to_numpy()
    reduced_df['Skill'] = df['skill'].to_numpy()

    fig = px.scatter(
        reduced_df,
        x='Component 1',
        y='Component 2',
        color='Cluster',
        hover_data=['Skill'],
        title=f'Skills Clusters Visualized with {method.upper()} (Silhouette Score: {silhouette_avg:.3f})',
        labels={'Component 1': 'Component 1', 'Component 2': 'Component 2'}
    )
    fig.show()

# Count the most frequent items inside every cluster
def get_top_items_in_clusters(df, cluster_column, item_column, top_n=20):
    """Return the top_n most common items for each cluster label.

    Args:
        df: DataFrame holding the cluster labels and the items.
        cluster_column: Name of the column with the cluster labels.
        item_column: Name of the column with the items to count.
        top_n: Maximum number of (item, count) pairs kept per cluster.

    Returns:
        Dict mapping each cluster label, in order of first appearance, to a
        list of (item, count) pairs sorted by descending count.
    """
    labels = df[cluster_column]
    return {
        label: Counter(df.loc[labels == label, item_column]).most_common(top_n)
        for label in labels.unique()
    }

# Analyze and visualize top items
# Maps each cluster label to its most common skills as (skill, count) pairs,
# capped at the default top_n of 20 per cluster.
hard_skills_top_items = get_top_items_in_clusters(skills_df, 'cluster', 'skill')

def visualize_top_items(cluster_top_items, title):
    """Show one bar chart of item frequencies for every cluster.

    Args:
        cluster_top_items: Mapping of cluster label to a list of
            (item, frequency) pairs.
        title: Text placed before ' in Cluster <label>' in each chart title.
    """
    for cluster, items in cluster_top_items.items():
        # Two-column table so plotly can address the pairs by column name
        frequency_table = pd.DataFrame(items, columns=['Item', 'Frequency'])
        chart_title = f'{title} in Cluster {cluster}'

        bar_chart = px.bar(
            frequency_table,
            x='Item',
            y='Frequency',
            color='Frequency',
            title=chart_title,
            color_continuous_scale="Emrld",
        )
        bar_chart.update_yaxes(title="Frequency")
        # Tilt the item labels by 45 degrees and use the white template
        bar_chart.update_layout(
            showlegend=False,
            template="plotly_white",
            xaxis=go.layout.XAxis(tickangle=45)
        )
        bar_chart.show()


For n_clusters = 2, the average silhouette score is : 0.04045207589042154
For n_clusters = 3, the average silhouette score is : 0.026440954150738088
For n_clusters = 4, the average silhouette score is : 0.03650163152973717
For n_clusters = 5, the average silhouette score is : 0.039814777391271566
For n_clusters = 6, the average silhouette score is : 0.04680066069084447
For n_clusters = 7, the average silhouette score is : 0.05096879446092985
For n_clusters = 8, the average silhouette score is : 0.04881980300300866
For n_clusters = 9, the average silhouette score is : 0.05612935541495256
For n_clusters = 10, the average silhouette score is : 0.05861034835549041
For n_clusters = 11, the average silhouette score is : 0.06265783201210372
For n_clusters = 12, the average silhouette score is : 0.06615965124414942
For n_clusters = 13, the average silhouette score is : 0.07391350548501531
For n_clusters = 14, the average silhouette score is : 0.07926916566708589
For n_clusters = 15, the averag

In [10]:
# Visualize clusters for hard skills
# 2-D PCA projection of the embeddings, coloured by the 'cluster' column assigned earlier.
visualize_clusters(skills_df, method='pca')


In [11]:

# Visualize top items in each cluster
# Draws one bar chart per cluster, titled 'Top Hard Skills in Cluster <label>'.
visualize_top_items(hard_skills_top_items, 'Top Hard Skills')