## 1. Handling Missing Values, Data Types, and Duplicates
In this section, we clean the dataset by:
- Dropping columns with more than 50% missing values, and dropping rows that contain any remaining missing value
- Converting date columns to datetime format
- Dropping duplicate records

In [4]:
# Read the job postings CSV into a DataFrame
import pandas as pd

csv_path = "../data/ai_job_dataset.csv"
df = pd.read_csv(csv_path)

# Report the starting dimensions, then per-column non-null counts and dtypes
print("Initial dataset shape:", df.shape)
df.info()

# Remove every column whose share of missing entries exceeds the threshold
threshold = 0.5  # maximum tolerated fraction of NaNs per column (50%)
null_ratios = df.isna().mean()  # per-column fraction of missing values
cols_to_drop = [col for col, ratio in null_ratios.items() if ratio > threshold]
print("Dropping columns with too many missing values:", cols_to_drop)
df = df.drop(columns=cols_to_drop)

# Convert date columns to datetime BEFORE dropping incomplete rows.
# errors="coerce" turns unparseable strings into NaT, which pandas treats as
# missing. Converting first lets the dropna below remove those rows too, so the
# "after dropna" count reflects the frame that later steps actually use.
for date_col in ("posting_date", "application_deadline"):
    if date_col in df.columns:
        parsed = pd.to_datetime(df[date_col], errors="coerce")
        # Values that were present but failed to parse (newly introduced NaT)
        n_unparseable = int((parsed.isna() & df[date_col].notna()).sum())
        if n_unparseable:
            print(f"{date_col}: {n_unparseable} values could not be parsed as dates")
        df[date_col] = parsed

# Drop rows with any remaining missing values (including NaT from coercion)
print("Missing values before dropna:", df.isna().sum().sum())
df = df.dropna()
print("Missing values after dropna:", df.isna().sum().sum())
print("Dataset shape after removing missing values:", df.shape)

# Discard rows that are exact duplicates of an earlier row and report the count
before = len(df)
df = df.drop_duplicates()
after = len(df)
removed = before - after
print(f"Removed {removed} duplicate rows.")

Initial dataset shape: (15000, 19)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   job_id                  15000 non-null  object 
 1   job_title               15000 non-null  object 
 2   salary_usd              15000 non-null  int64  
 3   salary_currency         15000 non-null  object 
 4   experience_level        15000 non-null  object 
 5   employment_type         15000 non-null  object 
 6   company_location        15000 non-null  object 
 7   company_size            15000 non-null  object 
 8   employee_residence      15000 non-null  object 
 9   remote_ratio            15000 non-null  int64  
 10  required_skills         15000 non-null  object 
 11  education_required      15000 non-null  object 
 12  years_experience        15000 non-null  int64  
 13  industry                15000 non-null  object 
 14  pos