# Importing required libraries

In [3]:
# Import necessary libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults for every figure in this notebook.
# `sns.set_theme` is the preferred spelling; `sns.set` is only a legacy alias for it.
# The theme call updates matplotlib's rcParams, so explicit overrides are applied afterwards.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)  # (width, height) in inches

# Loading the dataset

In [4]:
# Read the CSV (path is relative to the notebook's working directory) into `df`
df = pd.read_csv(filepath_or_buffer="../data/ai_job_dataset.csv")

# Basic information about the dataset

In [5]:
# Report the frame's dimensions as a (rows, columns) tuple
print("🔹 Dataset shape:", df.shape)

# List every column label, in the order it appears in the frame
print("\n🔹 Column names:\n", list(df.columns))

🔹 Dataset shape: (15000, 19)

🔹 Column names:
 ['job_id', 'job_title', 'salary_usd', 'salary_currency', 'experience_level', 'employment_type', 'company_location', 'company_size', 'employee_residence', 'remote_ratio', 'required_skills', 'education_required', 'years_experience', 'industry', 'posting_date', 'application_deadline', 'job_description_length', 'benefits_score', 'company_name']


# Previewing the first rows

In [6]:
# Preview the top five records (the cell's last expression is rendered as a table)
df.head(n=5)

Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,"Tableau, PyTorch, Kubernetes, Linux, NLP",Bachelor,9,Automotive,2024-10-18,2024-11-07,1076,5.9,Smart Analytics
1,AI00002,AI Software Engineer,61895,USD,EN,CT,Canada,M,Ireland,100,"Deep Learning, AWS, Mathematics, Python, Docker",Master,1,Media,2024-11-20,2025-01-11,1268,5.2,TechCorp Inc
2,AI00003,AI Specialist,152626,USD,MI,FL,Switzerland,L,South Korea,0,"Kubernetes, Deep Learning, Java, Hadoop, NLP",Associate,2,Education,2025-03-18,2025-04-07,1974,9.4,Autonomous Tech
3,AI00004,NLP Engineer,80215,USD,SE,FL,India,M,India,50,"Scala, SQL, Linux, Python",PhD,7,Consulting,2024-12-23,2025-02-24,1345,8.6,Future Systems
4,AI00005,AI Consultant,54624,EUR,EN,PT,France,S,Singapore,100,"MLOps, Java, Tableau, Python",Master,0,Media,2025-04-15,2025-06-23,1989,6.6,Advanced Robotics


# Descriptive statistics

In [7]:
# Summarise every column, numeric and non-numeric alike; statistics that do
# not apply to a column's dtype are reported as NaN
df.describe(include="all", exclude=None)

Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
count,15000,15000,15000.0,15000,15000,15000,15000,15000,15000,15000.0,15000,15000,15000.0,15000,15000,15000,15000.0,15000.0,15000
unique,15000,20,,3,4,4,20,3,20,,13663,4,,15,486,543,,,16
top,AI00001,Machine Learning Researcher,,USD,MI,FT,Germany,S,Sweden,,"Python, TensorFlow, PyTorch",Bachelor,,Retail,2024-07-05,2025-01-05,,,TechCorp Inc
freq,1,808,,11957,3781,3812,814,5007,790,,17,3789,,1063,51,47,,,980
mean,,,115348.965133,,,,,,,49.483333,,,6.2532,,,,1503.314733,7.504273,
std,,,60260.940438,,,,,,,40.812712,,,5.545768,,,,576.127083,1.45087,
min,,,32519.0,,,,,,,0.0,,,0.0,,,,500.0,5.0,
25%,,,70179.75,,,,,,,0.0,,,2.0,,,,1003.75,6.2,
50%,,,99705.0,,,,,,,50.0,,,5.0,,,,1512.0,7.5,
75%,,,146408.5,,,,,,,100.0,,,10.0,,,,2000.0,8.8,


# Data types and missing values

In [8]:
# Per-column dtype and non-null count; info() writes its report straight to stdout
print("\n🔹 Data types and non-null values:\n")
df.info()

# Tally the missing entries in each column
print("\n🔹 Number of missing values per column:\n")
print(df.isna().sum())


🔹 Data types and non-null values:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   job_id                  15000 non-null  object 
 1   job_title               15000 non-null  object 
 2   salary_usd              15000 non-null  int64  
 3   salary_currency         15000 non-null  object 
 4   experience_level        15000 non-null  object 
 5   employment_type         15000 non-null  object 
 6   company_location        15000 non-null  object 
 7   company_size            15000 non-null  object 
 8   employee_residence      15000 non-null  object 
 9   remote_ratio            15000 non-null  int64  
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  int64  
 13  industry                15000 non-null  object 
 14  po