In [2]:
# --------------------------------------------------
# STEP: Reload and Inspect Data
# --------------------------------------------------

import pandas as pd
df = pd.read_csv('hr_synthetic_5000.csv', parse_dates=['HireDate', 'LastPromotionDate'])

# Quick overview
print("Dataset shape:", df.shape)
print("\nData types:")
print(df.dtypes.head())

print("\nMissing values per column:")
print(df.isna().sum())

 

Dataset shape: (5000, 28)

Data types:
EmployeeID     int64
Age            int64
Gender        object
Department    object
JobRole       object
dtype: object

Missing values per column:
EmployeeID                    0
Age                           0
Gender                        0
Department                    0
JobRole                       0
Education                     0
EducationField               50
MaritalStatus                 0
HireDate                      0
YearsAtCompany                0
TotalWorkingYears             0
NumCompaniesWorked            0
MonthlyIncome                 0
PercentSalaryHike             0
PerformanceRating             0
JobSatisfaction               0
EnvironmentSatisfaction       0
WorkLifeBalance               0
TrainingTimesLastYear         0
OverTime                      0
BusinessTravel                0
Promotions                    0
LastPromotionDate          3048
DistanceFromHome              0
LeaveBalance                  0
StockOptionLev

In [3]:
# --------------------------------------------------
# STEP: Handle Missing Values
# --------------------------------------------------

# Count missing values
missing_before = df.isna().sum()
print("Missing values before cleaning:\n", missing_before[missing_before > 0])

# Fill missing EducationField with mode
df['EducationField'].fillna(df['EducationField'].mode()[0], inplace=True)

# Fill missing LastPromotionDate with median HireDate + 3 years (approximation)
median_hire_date = df['HireDate'].median()
df['LastPromotionDate'] = df['LastPromotionDate'].fillna(median_hire_date + pd.DateOffset(years=3))

# Recheck
9print("\nMissing values after cleaning:\n", df.isna().sum().sum()) 

Missing values before cleaning:
 EducationField         50
LastPromotionDate    3048
dtype: int64

Missing values after cleaning:
 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['EducationField'].fillna(df['EducationField'].mode()[0], inplace=True)


In [5]:
df.isna().sum()

EmployeeID                 0
Age                        0
Gender                     0
Department                 0
JobRole                    0
Education                  0
EducationField             0
MaritalStatus              0
HireDate                   0
YearsAtCompany             0
TotalWorkingYears          0
NumCompaniesWorked         0
MonthlyIncome              0
PercentSalaryHike          0
PerformanceRating          0
JobSatisfaction            0
EnvironmentSatisfaction    0
WorkLifeBalance            0
TrainingTimesLastYear      0
OverTime                   0
BusinessTravel             0
Promotions                 0
LastPromotionDate          0
DistanceFromHome           0
LeaveBalance               0
StockOptionLevel           0
Attrition                  0
ManagerID                  0
dtype: int64

In [None]:
5 5 5 5 

In [None]:
5