In [3]:
#  Setup
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), '..'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
from src.data.collector import JobDataCollector
from src.features.skill_extractor import SkillExtractor

In [6]:
# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Career Compass - Initial Analysis")

Career Compass - Initial Analysis


In [6]:
# Collecting Data
collector = JobDataCollector()
jobs_df = collector.collect_all_jobs()

print(f"Total jobs collected: {len(jobs_df)}")
print(f"Columns: {list(jobs_df.columns)}")
jobs_df.head()

Starting job data collection...
Fetching GitHub Jobs for: data scientist
Fetching GitHub Jobs for: machine learning engineer
Fetching simulated LinkedIn data for: data scientist
Saved 40 jobs to data\raw\jobs_20260208_051810.json
Total jobs collected: 40
Columns: ['id', 'title', 'company', 'location', 'description', 'skills', 'posted_date', 'source', 'salary_range', 'experience_level']


Unnamed: 0,id,title,company,location,description,skills,posted_date,source,salary_range,experience_level
0,gh_0,Senior Data Scientist,TechCorp,Remote,"Looking for a data scientist with Python, SQL,...","[Python, SQL, Machine Learning, TensorFlow]",2026-02-08,github_jobs,,
1,gh_1,Senior Data Scientist,DataWorks,Remote,"Looking for a data scientist with Python, SQL,...","[Python, SQL, Machine Learning, PyTorch]",2026-02-08,github_jobs,,
2,gh_2,Senior Data Scientist,AIStartup,Remote,"Looking for a data scientist with Python, SQL,...","[Python, SQL, Machine Learning, AWS]",2026-02-08,github_jobs,,
3,gh_3,Senior Data Scientist,TechCorp,Remote,"Looking for a data scientist with Python, SQL,...","[Python, SQL, Machine Learning, TensorFlow]",2026-02-08,github_jobs,,
4,gh_4,Senior Data Scientist,DataWorks,Remote,"Looking for a data scientist with Python, SQL,...","[Python, SQL, Machine Learning, PyTorch]",2026-02-08,github_jobs,,


In [7]:
# Extract Skills
extractor = SkillExtractor()
jobs_df_with_skills, skill_freq = extractor.extract_skills_from_dataframe(jobs_df)

# Display skills per job
jobs_df_with_skills[['title', 'company', 'all_skills']].head(10)

Extracting skills from 40 job postings...
Found 19 unique skills
Top 10 skills: [('Python', 40), ('SQL', 30), ('Machine Learning', 25), ('machine learning', 25), ('AWS', 21), ('sql', 20), ('python', 20), ('Docker', 10), ('TensorFlow', 8), ('tensorflow', 8)]


Unnamed: 0,title,company,all_skills
0,Senior Data Scientist,TechCorp,"[sql, TensorFlow, tensorflow, python, Machine ..."
1,Senior Data Scientist,DataWorks,"[sql, PyTorch, python, Machine Learning, machi..."
2,Senior Data Scientist,AIStartup,"[sql, python, Machine Learning, machine learni..."
3,Senior Data Scientist,TechCorp,"[sql, TensorFlow, tensorflow, python, Machine ..."
4,Senior Data Scientist,DataWorks,"[sql, PyTorch, python, Machine Learning, machi..."
5,Senior Data Scientist,AIStartup,"[sql, python, Machine Learning, machine learni..."
6,Senior Data Scientist,TechCorp,"[sql, TensorFlow, tensorflow, python, Machine ..."
7,Senior Data Scientist,DataWorks,"[sql, PyTorch, python, Machine Learning, machi..."
8,Senior Data Scientist,AIStartup,"[sql, python, Machine Learning, machine learni..."
9,Senior Data Scientist,TechCorp,"[sql, TensorFlow, tensorflow, python, Machine ..."


In [8]:
#  Basic Visualizations
# 1. Top Skills Visualization
top_skills = dict(skill_freq.most_common(15))

fig = go.Figure(data=[
    go.Bar(
        x=list(top_skills.keys()),
        y=list(top_skills.values()),
        marker_color='rgb(55, 83, 109)'
    )
])

fig.update_layout(
    title='Top 15 Most Requested Skills',
    xaxis_title='Skill',
    yaxis_title='Frequency',
    template='plotly_white'
)

fig.show()

In [9]:
#  Company Analysis
# Group by company
company_skills = {}
for _, row in jobs_df_with_skills.iterrows():
    company = row['company']
    if company not in company_skills:
        company_skills[company] = []
    company_skills[company].extend(row['all_skills'])

# Find unique tech stacks
print("Company Tech Stacks:")
for company, skills in list(company_skills.items())[:5]:
    unique_skills = list(set(skills))
    print(f"{company}: {', '.join(unique_skills[:5])}...")

Company Tech Stacks:
TechCorp: sql, TensorFlow, tensorflow, python, Machine Learning...
DataWorks: sql, PyTorch, python, Machine Learning, machine learning...
AIStartup: sql, python, Machine Learning, machine learning, SQL...
Company_0: Airflow, Spark, Machine Learning, SQL, Python...
Company_1: Redis, FastAPI, MLOps, PostgreSQL, Docker...


In [10]:
# Experience Level Analysis
if 'experience_level' in jobs_df_with_skills.columns:
    exp_level_skills = {}
    for level in jobs_df_with_skills['experience_level'].unique():
        level_jobs = jobs_df_with_skills[jobs_df_with_skills['experience_level'] == level]
        all_skills = [skill for skills in level_jobs['all_skills'] for skill in skills]
        exp_level_skills[level] = Counter(all_skills)
    
    # Create comparison
    fig = make_subplots(
        rows=1, 
        cols=len(exp_level_skills),
        subplot_titles=list(exp_level_skills.keys())
    )
    
    for i, (level, skills_counter) in enumerate(exp_level_skills.items(), 1):
        top_5 = skills_counter.most_common(5)
        fig.add_trace(
            go.Bar(
                x=[s[0] for s in top_5],
                y=[s[1] for s in top_5],
                name=level
            ),
            row=1, col=i
        )
    
    fig.update_layout(
        title='Top Skills by Experience Level',
        showlegend=False,
        height=400
    )
    fig.show()