In [1]:
import pandas as pd 
import numpy as np

In [2]:
## Data Loading

In [3]:
# Read the job dataset from a CSV file; the path is relative, so it resolves
# against the notebook's current working directory.
df = pd.read_csv(r"ai_job_dataset.csv")

In [4]:
# (rows, columns) of the freshly loaded frame
df.shape

(15000, 19)

In [5]:
## EDA & Data Processing 

In [6]:
# Display the frame; pandas truncates the rendering to the first and last rows
df

Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,"Tableau, PyTorch, Kubernetes, Linux, NLP",Bachelor,9,Automotive,2024-10-18,2024-11-07,1076,5.9,Smart Analytics
1,AI00002,AI Software Engineer,61895,USD,EN,CT,Canada,M,Ireland,100,"Deep Learning, AWS, Mathematics, Python, Docker",Master,1,Media,2024-11-20,2025-01-11,1268,5.2,TechCorp Inc
2,AI00003,AI Specialist,152626,USD,MI,FL,Switzerland,L,South Korea,0,"Kubernetes, Deep Learning, Java, Hadoop, NLP",Associate,2,Education,2025-03-18,2025-04-07,1974,9.4,Autonomous Tech
3,AI00004,NLP Engineer,80215,USD,SE,FL,India,M,India,50,"Scala, SQL, Linux, Python",PhD,7,Consulting,2024-12-23,2025-02-24,1345,8.6,Future Systems
4,AI00005,AI Consultant,54624,EUR,EN,PT,France,S,Singapore,100,"MLOps, Java, Tableau, Python",Master,0,Media,2025-04-15,2025-06-23,1989,6.6,Advanced Robotics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,AI14996,Robotics Engineer,38604,USD,EN,FL,Finland,S,Finland,50,"Java, Kubernetes, Azure",Bachelor,1,Energy,2025-02-06,2025-03-25,1635,7.9,Advanced Robotics
14996,AI14997,Machine Learning Researcher,57811,GBP,EN,CT,United Kingdom,M,United Kingdom,0,"Mathematics, Docker, SQL, Deep Learning",Master,0,Government,2024-10-16,2024-10-30,1624,8.2,Smart Analytics
14997,AI14998,NLP Engineer,189490,USD,EX,CT,South Korea,L,South Korea,50,"Scala, Spark, NLP",Associate,17,Manufacturing,2024-03-19,2024-05-02,1336,7.4,AI Innovations
14998,AI14999,Head of AI,79461,EUR,EN,FT,Netherlands,M,Netherlands,0,"Java, Computer Vision, Python, TensorFlow",PhD,1,Real Estate,2024-03-22,2024-04-23,1935,5.6,Smart Analytics


In [7]:
# Per-column dtype and non-null count, to spot missing values and columns
# that still need type conversion (the date columns are parsed further down)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   job_id                  15000 non-null  object 
 1   job_title               15000 non-null  object 
 2   salary_usd              15000 non-null  int64  
 3   salary_currency         15000 non-null  object 
 4   experience_level        15000 non-null  object 
 5   employment_type         15000 non-null  object 
 6   company_location        15000 non-null  object 
 7   company_size            15000 non-null  object 
 8   employee_residence      15000 non-null  object 
 9   remote_ratio            15000 non-null  int64  
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  int64  
 13  industry                15000 non-null  object 
 14  posting_date            15000 non-null

In [8]:
# Number of distinct job ids; if it equals the row count, job_id is unique per row
df['job_id'].nunique()

15000

In [9]:
# Peek at the raw skills column: each value is one comma-separated string
df['required_skills'].head(10)

0           Tableau, PyTorch, Kubernetes, Linux, NLP
1    Deep Learning, AWS, Mathematics, Python, Docker
2       Kubernetes, Deep Learning, Java, Hadoop, NLP
3                          Scala, SQL, Linux, Python
4                       MLOps, Java, Tableau, Python
5                  Data Visualization, R, SQL, Linux
6                                   R, Docker, MLOps
7          Python, SQL, Computer Vision, Java, Azure
8                   Hadoop, Git, Mathematics, Python
9                    MLOps, GCP, Scala, Azure, Linux
Name: required_skills, dtype: object

In [10]:
## Data Preprocessing 

In [11]:
# Convert the two date columns to datetime so the .dt accessor is available.
# pd.to_datetime leaves an already-converted column unchanged, so this is
# safe to re-run.
df['posting_date'] = pd.to_datetime(df['posting_date'])
df['application_deadline'] = pd.to_datetime(df['application_deadline'])

# Split posting_date into day / month / year parts
df['posting_day'] = df['posting_date'].dt.day
df['posting_month'] = df['posting_date'].dt.month
df['posting_year'] = df['posting_date'].dt.year

# Split application_deadline into day / month / year parts
df['deadline_day'] = df['application_deadline'].dt.day
df['deadline_month'] = df['application_deadline'].dt.month
df['deadline_year'] = df['application_deadline'].dt.year

# Whole days between the posting date and the application deadline
df['days_to_deadline'] = (df['application_deadline'] - df['posting_date']).dt.days


def _split_skills(value):
    """Turn one `required_skills` cell into a list of trimmed skill names.

    Parameters
    ----------
    value : str, list, tuple, or missing value
        A comma-separated skills string, or a value that was already split
        by a previous run of this cell.

    Returns
    -------
    list
        The individual skills. Already-split values are returned as a list
        unchanged, and a missing value yields an empty list (which
        `DataFrame.explode` turns into a single NaN row).
    """
    # Already split: re-running the cell must not re-split the list's repr,
    # which would produce tokens such as "['Tableau'".
    if isinstance(value, (list, tuple)):
        return list(value)
    # Missing value: avoid str(NaN) creating a bogus 'nan' skill.
    if pd.isna(value):
        return []
    return [skill.strip() for skill in str(value).split(',')]


# Separate the skills string into a list per job
df['required_skills'] = df['required_skills'].apply(_split_skills)

# Explode to one row per (job, skill); the other columns are repeated
df_exploded = df.explode('required_skills')

# Export the long-format table; index=False drops the repeated row index
df_exploded.to_excel("ai_jobs_skills_exploded_with_dates.xlsx", index=False)

In [12]:
# First rows of the exploded frame; a repeated index label means the rows
# came from the same original job posting
df_exploded.head()

Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,...,job_description_length,benefits_score,company_name,posting_day,posting_month,posting_year,deadline_day,deadline_month,deadline_year,days_to_deadline
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,...,1076,5.9,Smart Analytics,18,10,2024,7,11,2024,20
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,...,1076,5.9,Smart Analytics,18,10,2024,7,11,2024,20
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,...,1076,5.9,Smart Analytics,18,10,2024,7,11,2024,20
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,...,1076,5.9,Smart Analytics,18,10,2024,7,11,2024,20
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,...,1076,5.9,Smart Analytics,18,10,2024,7,11,2024,20


In [13]:
# Schema and non-null counts after the explode; the row count is larger than
# the original frame because each skill now occupies its own row
df_exploded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59893 entries, 0 to 14999
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   job_id                  59893 non-null  object        
 1   job_title               59893 non-null  object        
 2   salary_usd              59893 non-null  int64         
 3   salary_currency         59893 non-null  object        
 4   experience_level        59893 non-null  object        
 5   employment_type         59893 non-null  object        
 6   company_location        59893 non-null  object        
 7   company_size            59893 non-null  object        
 8   employee_residence      59893 non-null  object        
 9   remote_ratio            59893 non-null  int64         
 10  required_skills         59893 non-null  object        
 11  education_required      59893 non-null  object        
 12  years_experience        59893 non-null  int64      