# AI Job Dataset – Ultimate EDA & Data Visualization Project
A deep-dive into AI jobs, covering salary, skills, experience, trends, company insights, and much more. Visual, creative, and full-spectrum analysis.

---
## 1. Setup & Data Load

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
# Silence every warning category for the rest of the notebook session.
warnings.simplefilter('ignore')

# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

# Load the raw job postings from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='ai_job_dataset.csv')

## 2. Initial Exploration

In [ ]:
# Render the first rows, then print the dtype / non-null summary.
preview = df.head()
display(preview)
df.info()

In [ ]:
# Report the table dimensions.
n_rows, n_cols = df.shape
print('Rows:', n_rows, '| Columns:', n_cols)

## 3. Data Cleaning & Feature Engineering

In [ ]:
# Parse the two date columns; unparseable values become NaT instead of raising.
df['posting_date'] = pd.to_datetime(df['posting_date'], errors='coerce')
df['application_deadline'] = pd.to_datetime(df['application_deadline'], errors='coerce')

# Normalize casing of the categorical text columns to Title Case.
for col in ['experience_level', 'employment_type', 'company_size', 'education_required', 'industry']:
    # astype(str) alone would turn missing entries into the literal text 'Nan',
    # hiding them from any later isnull() check; .where(notna) restores them as NaN.
    df[col] = df[col].astype(str).str.title().where(df[col].notna())

# Calendar parts of the posting date (NaN where the date failed to parse).
df['posting_year'] = df['posting_date'].dt.year
df['posting_month'] = df['posting_date'].dt.month
# Whole days between posting and application deadline.
df['deadline_gap_days'] = (df['application_deadline'] - df['posting_date']).dt.days
# Missing remote_ratio values are replaced with 0.
df['remote_ratio'] = df['remote_ratio'].fillna(0)

In [ ]:
# Salary normalization (for multi-currency)
# Multiplier applied to salary_usd for each salary_currency code.
cur_map = {'USD': 1, 'EUR': 1.09, 'GBP': 1.27}
# NOTE(review): the source column is named salary_usd, which suggests it may
# already be expressed in USD; if so these multipliers inflate EUR/GBP rows.
# Confirm against the dataset documentation.
# Look the rate up per row in one vectorized step; currencies that are missing
# or not in cur_map keep a rate of 1, i.e. salary_usd is carried over unchanged.
# This avoids writing float results into an integer column through .loc,
# which relies on silent dtype upcasting that pandas has deprecated.
conversion_rate = df['salary_currency'].map(cur_map).fillna(1)
df['salary_converted'] = df['salary_usd'] * conversion_rate

## 4. Missing Values & Duplicates

In [ ]:
# Share of null entries per column, largest first.
null_share = df.isna().mean()
missing = null_share.sort_values(ascending=False)
# Show only the columns that actually contain gaps.
display(missing.loc[missing.gt(0)])
# Count the rows that exactly repeat an earlier row.
n_duplicated = df.duplicated().sum()
print('Duplicated rows:', n_duplicated)

In [ ]:
# Keep only the first occurrence of each fully identical row.
is_repeat = df.duplicated()
df = df[~is_repeat]

## 5. Data Overview & Univariate Analysis

In [ ]:
# Summary statistics for every column, shown with one row per column.
summary = df.describe(include='all')
summary.transpose()

### 5.1 Salary Distribution

In [ ]:
# Histogram of converted salaries with a KDE overlay.
plt.figure(figsize=(12, 6))
salaries = df['salary_converted']
hist_ax = sns.histplot(salaries, bins=60, kde=True, color='purple')
hist_ax.set_title('Salary Distribution (Converted to USD)')
plt.show()

In [ ]:
# Interactive box plot of converted salaries; only outlier points are drawn.
box_options = dict(y='salary_converted', points='outliers', title='Salary Boxplot')
fig = px.box(df, **box_options)
fig.show()

### 5.2 Experience & Education

In [ ]:
# One count plot per column, bars ordered from most to least frequent level.
for column, palette, heading in (
    ('experience_level', 'coolwarm', 'Experience Level Distribution'),
    ('education_required', 'Blues', 'Education Requirement Distribution'),
):
    plt.figure(figsize=(10, 4))
    by_frequency = df[column].value_counts().index
    count_ax = sns.countplot(x=column, data=df, order=by_frequency, palette=palette)
    count_ax.set_title(heading)
    plt.show()

### 5.3 Categorical Features

In [ ]:
# Categorical columns to profile; each gets a horizontal count plot
# restricted to its ten most frequent values.
cat_feats = ['employment_type', 'company_size', 'industry', 'company_location', 'employee_residence']
for feature in cat_feats:
    top_levels = df[feature].value_counts().head(10).index
    plt.figure(figsize=(10, 3))
    count_ax = sns.countplot(y=feature, data=df, order=top_levels, palette='viridis')
    count_ax.set_title(f'Top 10 {feature} Distribution')
    plt.tight_layout()
    plt.show()

## 6. Multivariate Analysis

### 6.1 Salary by Experience Level, Education, and Company Size

In [ ]:
# Mean converted salary per group, with one standard deviation as the error bar.
# Each entry is (grouping column, plot title, seaborn palette).
salary_bar_specs = [
    ('experience_level', 'Avg Salary by Experience Level', 'magma'),
    ('education_required', 'Avg Salary by Education Level', 'crest'),
    ('company_size', 'Avg Salary by Company Size', 'ch:s'),
]
for group_col, bar_title, bar_palette in salary_bar_specs:
    plt.figure(figsize=(10, 5))
    # errorbar='sd' is the replacement for the deprecated ci='sd' argument.
    sns.barplot(
        x=group_col,
        y='salary_converted',
        data=df,
        estimator=np.mean,
        errorbar='sd',
        palette=bar_palette,
    )
    plt.title(bar_title)
    plt.show()

### 6.2 Salary by Industry & Country

In [ ]:
# Median converted salary per industry; keep the ten highest.
industry_medians = df.groupby('industry')['salary_converted'].median()
top_ind = industry_medians.sort_values(ascending=False).iloc[:10]
industry_labels = {'value': 'Median Salary', 'industry': 'Industry'}
fig = px.bar(
    top_ind,
    orientation='h',
    title='Top 10 Industries by Median Salary',
    labels=industry_labels,
)
# Order the bars by their total so the largest ends up at the top.
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

In [ ]:
# Median converted salary per company location; keep the ten highest.
location_medians = df.groupby('company_location')['salary_converted'].median()
top_country = location_medians.sort_values(ascending=False).iloc[:10]
location_labels = {'value': 'Median Salary', 'company_location': 'Location'}
fig = px.bar(
    top_country,
    orientation='h',
    title='Top 10 Company Locations by Median Salary',
    labels=location_labels,
)
# Order the bars by their total so the largest ends up at the top.
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

### 6.3 Remote Work & Salary

In [ ]:
# Salary spread for each remote_ratio value.
remote_ax = sns.boxplot(data=df, x='remote_ratio', y='salary_converted', palette='rocket')
remote_ax.set_title('Remote Ratio vs Salary')
plt.show()

### 6.4 Posting Month/Year Trends

In [ ]:
# Number of postings per (year, month) pair, flattened into a tidy table.
postings_per_month = df.groupby(['posting_year', 'posting_month']).size()
monthly = postings_per_month.rename('job_count').reset_index()
# One line per posting year, months along the x axis.
fig = px.line(
    monthly,
    x='posting_month',
    y='job_count',
    color='posting_year',
    markers=True,
    title='Job Postings Over Time',
)
fig.show()

## 7. Skills Analysis

In [ ]:
# Split each posting's comma-separated skill list into individual tokens.
skills = df['required_skills'].dropna().str.split(',')
# One row per (posting, skill); strip whitespace so 'Python' and ' Python'
# count as the same skill whatever spacing follows the commas.
exploded = skills.explode().str.strip()
# Discard empty tokens left behind by stray or trailing commas.
exploded = exploded[exploded != '']
# Name the index explicitly so the skill axis is labelled 'Skill' no matter
# what name value_counts() gives the index.
top_skills = exploded.value_counts().head(20).rename_axis('Skill')
fig = px.bar(top_skills, orientation='h', title='Top 20 Most In-Demand Skills', labels={'value':'Count', 'index':'Skill'})
# Order the bars by count so the most demanded skill ends up at the top.
fig.update_layout(yaxis={'categoryorder':'total ascending'})
fig.show()

### 7.1 Skill vs Salary Impact

In [ ]:
# Pre-compute the set of skill tokens listed by each posting. Membership in
# this set is an exact match, unlike str.contains(), which interprets the
# skill as a regular expression (so 'C++' raises) and matches substrings
# (so 'Java' would also count 'JavaScript' postings).
skill_sets = [
    {token.strip() for token in str(cell).split(',')}
    for cell in df['required_skills'].fillna('')
]

skill_rows = []
for skill in top_skills.index:
    wanted = str(skill).strip()
    # Boolean row mask aligned to df: True where the posting lists this skill.
    mask = pd.Series([wanted in tokens for tokens in skill_sets], index=df.index, dtype=bool)
    skill_rows.append({
        'skill': skill,
        'avg_salary': df.loc[mask, 'salary_converted'].mean(),
        'count': mask.sum(),
    })

# Explicit columns keep sort_values() working even when top_skills is empty.
skill_salary = pd.DataFrame(skill_rows, columns=['skill', 'avg_salary', 'count'])
skill_salary = skill_salary.sort_values('avg_salary', ascending=False)
fig = px.bar(skill_salary.head(15), x='skill', y='avg_salary', title='Top Skills by Avg Salary', labels={'avg_salary':'Avg Salary', 'skill':'Skill'})
fig.show()

## 8. Correlation Heatmap

In [ ]:
# Pairwise correlations between the numeric columns of interest.
heatmap_columns = ['salary_converted', 'years_experience', 'deadline_gap_days', 'remote_ratio']
corr = df.loc[:, heatmap_columns].corr()
heat_ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
heat_ax.set_title('Correlation Heatmap')
plt.show()

## 9. Outlier Analysis

In [ ]:
# Per-industry salary box plots; only suspected outlier points are drawn.
outlier_options = dict(
    x='industry',
    y='salary_converted',
    points='suspectedoutliers',
    title='Salary Outliers by Industry',
)
fig = px.box(df, **outlier_options)
# Arrange industries along the x axis by descending total.
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()

## 10. Insights & Recommendations
- Senior roles, and especially those in finance/technology/consulting, offer much higher pay.
- Remote ratio shows a mild positive association with salary, though the pattern is not consistent across all groups.
- Top skills: Python, SQL, TensorFlow, AWS, and Tableau are everywhere, but rare skills (Kubernetes, GCP, Scala) pay more on average.
- Posting activity peaks in Q2/Q4, suggesting hiring cycles.
- Companies in US/Western Europe offer top salaries; company size and education also matter.
- Outliers in salary often combine rare skills and seniority.

## Creative Ideas:
- Build a skill-salary matrix for career planning.
- Look at the application window length (deadline_gap_days, the number of days between posting date and application deadline) by role/industry.
- Analyze remote ratios by country.
- Detect rare job titles and map their salary/scope.
- Use interactive Plotly maps for global job distribution.

---
### This EDA can be expanded with ML predictions, in-depth skill clusters, or even job market simulation!