In [None]:
import pandas as pd
import plotly.express as px
from collections import Counter
import plotly.graph_objects as go

## Load Data

In [None]:
# Load the clustered dataset into `df`. The path is relative, so it resolves against
# the notebook's current working directory.
df = pd.read_csv("../_LinkedIn Data Exploration/REQ DATA/df_A_clustered_PS.csv")

In [None]:
# Standardize the cluster labels: strip surrounding whitespace and lowercase them,
# so labels that differ only by spacing or case collapse into one cluster.
df['primary_cluster'] = df['primary_cluster'].str.strip().str.lower()
df['secondary_cluster'] = df['secondary_cluster'].str.strip().str.lower()

# Preview the frame. This must be the last expression of the cell to be rendered;
# a bare `df.head(2)` placed before other statements is evaluated and discarded.
df.head(2)

In [None]:
# Number of distinct labels in each cluster column, one per line (primary first).
print(
    df['primary_cluster'].nunique(),
    df['secondary_cluster'].nunique(),
    sep='\n',
)

In [None]:
# Print the distinct primary cluster labels.
print(df['primary_cluster'].unique())

In [None]:
# Show the column index of `df` as the cell output.
df.columns

## Job Count Distribution by Cluster Type (Primary & Secondary)

In [None]:
# Distinct `job_id` count per primary cluster label, in the shared
# (cluster, job_count, cluster_type) layout.
count_data_primary = (
    df.groupby('primary_cluster')['job_id']
    .nunique()
    .reset_index()
    .rename(columns={'primary_cluster': 'cluster', 'job_id': 'job_count'})
    .assign(cluster_type='Primary Cluster')
)

# Same aggregation for the secondary cluster label.
count_data_secondary = (
    df.groupby('secondary_cluster')['job_id']
    .nunique()
    .reset_index()
    .rename(columns={'secondary_cluster': 'cluster', 'job_id': 'job_count'})
    .assign(cluster_type='Secondary Cluster')
)

# Stack both summaries; `cluster_type` tells the rows apart.
count_data = pd.concat([count_data_primary, count_data_secondary])

# Bubble chart: one bubble per (cluster, cluster type); the job count drives both
# the y position and the bubble area, and the cluster type drives the colour.
fig = px.scatter(
    data_frame=count_data,
    x='cluster',
    y='job_count',
    size='job_count',
    color='cluster_type',
    hover_name='cluster',
    size_max=50,  # upper bound on bubble size
    title="Job Count by Primary and Secondary Cluster",
    labels={"cluster": "Cluster", "job_count": "Job Count", "cluster_type": "Cluster Type"},
)

# Enlarge the canvas.
fig.update_layout(width=1000, height=800)

fig.show()

## Top 10 Skills Required by Primary Cluster

In [None]:
# Unique primary clusters. Missing labels are dropped first: a NaN entry would add a
# "Cluster nan" option whose equality filter (`== NaN`) can never match any row.
primary_clusters = df['primary_cluster'].dropna().unique()

def get_skill_counts(cluster, data=None):
    """Count how often each skill appears among the rows of one primary cluster.

    Each non-null `skills_desc` value is split on commas. Every token is stripped
    of surrounding whitespace (so " sql" and "sql" are counted together) and empty
    tokens are discarded.

    Parameters
    ----------
    cluster : label compared against the `primary_cluster` column.
    data : DataFrame, optional
        Frame with `primary_cluster` and `skills_desc` columns. Defaults to the
        notebook-level `df`.

    Returns
    -------
    DataFrame with columns `Skill` and `Count`, sorted by `Count` descending.
    Skills with equal counts keep their first-seen order.
    """
    source = df if data is None else data
    skills = (
        source.loc[source['primary_cluster'] == cluster, 'skills_desc']
        .dropna()
        .str.split(',')
        .explode()
        .str.strip()
    )
    # Drop tokens that were empty or whitespace-only (e.g. from "a,,b" or a trailing comma).
    skills = skills[skills != '']
    skill_counts = Counter(skills)
    # mergesort is stable, so ties are ordered deterministically (first seen first).
    skill_df = pd.DataFrame(skill_counts.items(), columns=['Skill', 'Count']).sort_values(
        by='Count', ascending=False, kind='mergesort'
    )
    return skill_df

# Number of skills shown per cluster. It also feeds the chart titles, so the bar
# count and the "Top N" wording cannot drift apart.
TOP_N_SKILLS = 10

# Create the figure
fig = go.Figure()

# Add one horizontal bar trace per primary cluster. Visibility is decided by position
# rather than by comparing labels, so it stays correct even for a NaN label
# (NaN never compares equal to itself).
for position, cluster in enumerate(primary_clusters):
    skill_df = get_skill_counts(cluster)
    top_skills = skill_df.head(TOP_N_SKILLS)
    fig.add_trace(go.Bar(
        x=top_skills['Count'],
        y=top_skills['Skill'],
        orientation='h',
        name=f'Cluster {cluster}',
        visible=(position == 0)  # Only the first cluster is visible initially
    ))

# Create dropdown buttons: each button shows exactly one trace (the one at the same
# position as the button) and updates the title to match.
dropdown_buttons = [
    {
        'label': f'Cluster {cluster}',
        'method': 'update',
        'args': [
            {'visible': [other == position for other in range(len(primary_clusters))]},
            {'title': f'Top {TOP_N_SKILLS} Skills Required in Primary Cluster {cluster}'}
        ]
    }
    for position, cluster in enumerate(primary_clusters)
]

# Update layout with dropdown
fig.update_layout(
    updatemenus=[{
        'buttons': dropdown_buttons,
        'direction': 'down',
        'showactive': True,
        'x': 0.8,  # Horizontal position
        'y': 1.15,  # Vertical position
        'xanchor': 'left',
        'yanchor': 'top'
    }],
    title=f'Top {TOP_N_SKILLS} Skills Required in Primary Cluster {primary_clusters[0]}',
    width=1000,
    height=800,
    yaxis=dict(autorange="reversed")  # Reverse the y-axis so the highest count is at the top
)

# Show the plot
fig.show()

## Skills DataFrame: Mapping Job Skills to Skill Names

In [None]:
# Load the job_skills CSV (relative path); a later cell joins it on `skill_abr`.
skills_id = pd.read_csv("../Job_Postings_Data_2023_24/jobs/job_skills.csv")

In [None]:
# Load the skills mapping CSV (relative path); it is the right-hand side of the
# `skill_abr` merge in a later cell.
skills = pd.read_csv("../Job_Postings_Data_2023_24/mappings/skills.csv")

In [None]:
# Left join on `skill_abr`: every row of `skills_id` is kept, and columns from
# `skills` are attached where the abbreviation matches.
skills_df = skills_id.merge(skills, how='left', on='skill_abr')

In [None]:
# Preview the first two rows of the merged skills frame.
skills_df.head(2)

In [None]:
# Collapse to one row per job_id, joining its skill names into a single
# comma-separated string. Nulls are dropped inside the aggregation because the
# left merge that builds `skills_df` can leave NaN in `skill_name`, and
# `', '.join` raises TypeError on non-string items. A job whose skills are all
# unmapped keeps its row with an empty string.
skills_grouped_df = (
    skills_df.groupby('job_id')['skill_name']
    .agg(lambda names: ', '.join(names.dropna()))
    .reset_index()
)

In [None]:
# Preview the first two rows of the per-job skills frame.
skills_grouped_df.head(2)