# 🧹 Data Preprocessing for Salary Analysis
This notebook handles the data cleaning and preparation steps for the **Company Size vs Salary Analysis** project.

In [None]:
# Import necessary libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Install a blanket "ignore" filter: every warning category (including pandas
# and seaborn deprecation / copy warnings) is silenced for the rest of the kernel session.
warnings.simplefilter('ignore')

In [None]:
# Load dataset
# Path is relative to the working directory the kernel was started in.
raw_csv = 'data/raw/job_salary_dataset.csv'
df = pd.read_csv(raw_csv)

# Preview the first rows (last expression, so it is displayed as the cell output).
df.head(5)

In [None]:
# Check for missing data
# info() prints column dtypes and non-null counts to stdout.
df.info()

# Per-column count of null entries; displayed as the cell output.
df.isna().sum()

In [None]:
# Handle missing values
# Rows lacking either 'salary' or 'company_size' are dropped outright; no
# values are imputed. (A median fill of 'salary' used to follow this line, but
# it could never change anything: the dropna below already removes every row
# whose salary is null, so there was nothing left to fill.)
# .copy() makes the result an independent frame so the column assignments in
# later cells write to this object rather than to a possible slice of the
# loaded data.
df = df.dropna(subset=['salary', 'company_size']).copy()

In [None]:
# Standardize company size categories
# Raw headcount-range labels are rewritten to named tiers. replace() leaves any
# value that is not a key of the mapping unchanged.
# NOTE(review): no key covers a range between '201-1000' and '5000+'
# (e.g. something like '1001-5000') — confirm against the raw data whether
# such a label exists and which tier it should map to.
size_labels = {
    '1-50': 'Small',
    '51-200': 'Medium',
    '201-1000': 'Large',
    '5000+': 'Very Large',
}
df['company_size'] = df['company_size'].replace(size_labels)

In [None]:
# Remove outliers in salary
# Only the upper tail is trimmed: the cutoff is the 0.99 quantile of the
# current 'salary' column, and the comparison is strict, so rows exactly equal
# to the cutoff are removed as well.
# The quantile is recomputed from whatever `df` currently holds, so running
# this cell again trims the already-trimmed data a second time.
q99 = df['salary'].quantile(0.99)
keep_mask = df['salary'].lt(q99)
df = df.loc[keep_mask]

remaining_rows = len(df)
print(f"Rows remaining after outlier removal: {remaining_rows}")

In [None]:
# Quick visualization to verify cleaned data
# One box per distinct 'company_size' value, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.boxplot(data=df, x='company_size', y='salary', ax=ax)
ax.set_title('Salary Distribution by Company Size (Cleaned)')
plt.show()

In [None]:
# Save the cleaned dataset
# `Path` is pathlib.Path, brought into scope by the notebook's import cell;
# without that import this cell fails with NameError on a fresh kernel.
output_path = Path('data/processed/cleaned_salary_data.csv')

# Create the target directory (and any missing parents) first; exist_ok=True
# makes this safe to re-run when the directory is already there.
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the DataFrame index is not written as a column in the CSV.
df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to: {output_path}")