In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Pandas display options: never truncate the column list when showing a
# frame, and render floats with two decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format

# Global seaborn theme for every figure below: "whitegrid" style, "notebook" context
sns.set_theme(context="notebook", style="whitegrid")

In [None]:
# Read the jobs CSV (relative path) into the working DataFrame
file_path = "cleaned_top_jobs.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Report (rows, columns), then show the first five rows as the cell output
print("Shape of dataset:", df.shape)
df.head(n=5)

In [None]:
# Per-column dtypes and non-null counts.
# df.info() prints its report directly instead of returning a value.
df.info()

In [None]:
# Descriptive statistics for the numerical columns, transposed so that
# each described column becomes a row
df.describe().transpose()

In [None]:
# Descriptive statistics for the object-dtype (categorical) columns,
# transposed so that each described column becomes a row
df.describe(include=["object"]).transpose()

## Missing Value Analysis

In [None]:
# Tally nulls once per column, then express the tally as a percentage of
# all rows; columns with the largest share of missing values come first.
null_counts = df.isna().sum()
missing_df = (
    pd.DataFrame({
        "missing_count": null_counts,
        "missing_percent": (null_counts / len(df)) * 100,
    })
    .sort_values(by="missing_percent", ascending=False)
)

missing_df

## Univariate Analysis

In [None]:
# Ten most frequent posts. value_counts() lists them largest-first; sorting
# the slice ascending puts the largest bar at the top of the horizontal chart.
top_posts = df['post'].value_counts().iloc[:10].sort_values()

fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.barh(top_posts.index, top_posts.values)

# Write each bar's count just past its right-hand end, vertically centred
for rect in bars:
    count = rect.get_width()
    ax.text(count + 0.5, rect.get_y() + rect.get_height() / 2, int(count), va='center')

ax.set_title("Top 10 Posts")
ax.set_xlabel("Count")
ax.set_ylabel("Post")
fig.tight_layout()
plt.show()


In [None]:
# Ten most frequent companies. value_counts() lists them largest-first;
# sorting ascending puts the largest bar at the top of the horizontal chart.
# Stored under its own name so the post counts held in `top_posts` are not
# overwritten when this cell runs.
top_companies = df['company'].value_counts().head(10).sort_values(ascending=True)

plt.figure(figsize=(10, 5))
bars = plt.barh(top_companies.index, top_companies.values)

# Add the count just to the right of each bar, vertically centred
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.5, bar.get_y() + bar.get_height()/2, int(width), va='center')

plt.title("Top 10 Companies")
plt.xlabel("Count")
plt.ylabel("Company")
plt.tight_layout()
plt.show()

In [None]:
# Vertical bar chart of how many rows carry each experience value,
# in value_counts() order (most frequent first)
fig, ax = plt.subplots(figsize=(10, 6))
df['experience'].value_counts().plot.bar(rot=0, ax=ax)

# ax.patches holds one rectangle per bar; print its count centred above it
for rect in ax.patches:
    count = rect.get_height()
    label_x = rect.get_x() + rect.get_width() / 2
    ax.text(label_x, count, int(count), ha='center', va='bottom')

ax.set(
    title="Experience Requirement Distribution",
    xlabel="Experience",
    ylabel="Count",
)
plt.show()

In [None]:
# Display names for the numeric job levels
level_labels = {
    1: "Entry",
    2: "Medium",
    3: "Senior",
    4: "Top"
}

# Attach the display name for each row's level
# (levels that are not keys of the mapping become NaN)
df['level_label'] = df['level'].map(level_labels)

# Count rows per label, then order the bars by level number. Sorting the
# label index would order them alphabetically, which matches the level order
# only by coincidence for the current names. Labels with no rows are left out.
level_counts = df['level_label'].value_counts()
level_order = [
    level_labels[key]
    for key in sorted(level_labels)
    if level_labels[key] in level_counts.index
]
level_counts = level_counts.reindex(level_order)

plt.figure(figsize=(10, 6))
bars = level_counts.plot(kind='bar', rot=0)  # returns the Axes holding the bars

# Add the count centred on top of each bar
for bar in bars.patches:
    height = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        height,
        int(height),
        ha='center', va='bottom'
    )

plt.title("Levels Distribution")
plt.xlabel("Levels")
plt.ylabel("Count")
plt.show()

## Scatterplot: Average Salary by Level

In [None]:
# Mean of the salary_avg column for each level, ordered by level
salary_avg = df.groupby('level')['salary_avg'].mean().sort_index()

plt.figure(figsize=(8, 5))
plt.scatter(salary_avg.index, salary_avg.values, color='blue', s=100)  # s = marker size

# Label each point with its value, rounded and comma-grouped
for level, value in salary_avg.items():
    plt.text(level, value, f"{value:,.0f}", ha='center', va='bottom')

plt.title("Average Salary vs Level")
plt.xlabel("Level")
plt.ylabel("Average Salary")
# One tick per integer from 1 up to the highest level present. The rotation
# is passed in the same call that sets the ticks so it applies to them.
plt.xticks(range(1, int(salary_avg.index.max()) + 1), rotation=45)
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

## Correlation Heatmap

In [None]:
# Leave the year/month/day columns out of the correlation matrix;
# errors="ignore" makes the drop a no-op for any of them that is absent.
df_num = df.drop(columns=["year", "month", "day"], errors="ignore")

# Pairwise correlations across the remaining int64/float64 columns
numerical_cols = df_num.select_dtypes(include=["int64", "float64"]).columns
corr_matrix = df_num.loc[:, numerical_cols].corr()

# Annotated heatmap: every cell shows its coefficient to two decimals
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()
