# 📊 Exploratory Data Analysis - Job Application Dataset

This notebook explores the job application dataset to understand patterns and relationships before building predictive models.

In [None]:
# Import libraries: pandas/numpy for data handling, matplotlib/seaborn for plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence warnings for the rest of the session to keep cell output clean.
# NOTE(review): no category is given, so this hides every warning (including
# deprecation notices raised by the plotting libraries) - consider narrowing it.
warnings.filterwarnings('ignore')

# Set style: whitegrid matplotlib theme, 'husl' as the default seaborn palette,
# and render figures inline in the notebook output.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
%matplotlib inline

## 1. Load & Inspect Data

In [None]:
# Load the dataset, generating (and saving) it only when no CSV exists yet.
# Regenerating on every run would overwrite the file and could change the rows
# under analysis between runs.
import sys
from pathlib import Path

# Make the project root importable; guard so re-running the cell does not
# keep appending the same entry.
if '..' not in sys.path:
    sys.path.append('..')
from src.generate_data import generate_job_application_data

data_path = Path('../data/job_applications.csv')

if data_path.exists():
    # Reuse the saved file. Note that read_csv re-infers column dtypes from text.
    df = pd.read_csv(data_path)
else:
    df = generate_job_application_data(n_samples=2000)
    # to_csv does not create missing directories, so make sure ../data exists.
    data_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(data_path, index=False)

print(f"Dataset Shape: {df.shape}")
df.head()

In [None]:
# Column dtypes, followed by the number of missing values in each column
column_types = df.dtypes
missing_per_column = df.isnull().sum()

print("Data Types:", column_types, sep="\n")
print("\nMissing Values:", missing_per_column, sep="\n")

In [None]:
# Summary statistics from describe(), rounded to two decimals for display
summary_stats = df.describe()
summary_stats.round(2)

## 2. Target Variable Analysis

In [None]:
# Class balance of the target: counts per class (left) and class shares (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Count plot
sns.countplot(x='hired', data=df, ax=axes[0], palette=['#ff6b6b', '#4ecdc4'])
axes[0].set_title('Hired vs Not Hired Distribution')
axes[0].set_xlabel('Hired')
axes[0].set_xticklabels(['Not Hired', 'Hired'])

# Pie chart
# value_counts() orders by frequency, not by class value. Sort by the class
# value so the lower value (not hired) always comes first; otherwise the fixed
# labels, colors and explode below are swapped whenever hires are the majority.
hired_counts = df['hired'].value_counts().sort_index()
axes[1].pie(hired_counts, labels=['Not Hired', 'Hired'], autopct='%1.1f%%',
            colors=['#ff6b6b', '#4ecdc4'], explode=(0, 0.05))
axes[1].set_title('Hiring Rate')

plt.tight_layout()
plt.show()

print(f"\nClass Distribution:\n{df['hired'].value_counts(normalize=True)}")

## 3. Numerical Features Analysis

In [None]:
# Distribution of each numeric feature split by hiring outcome,
# one histogram (with KDE overlay) per panel of a 2x3 grid.
numerical_cols = [
    'years_experience',
    'skills_match_score',
    'salary_expectation',
    'interview_score',
    'technical_test_score',
    'cultural_fit_score',
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    sns.histplot(
        data=df,
        x=col,
        hue='hired',
        kde=True,
        ax=ax,
        palette=['#ff6b6b', '#4ecdc4'],
    )
    ax.set_title(f'{col} by Hiring Status')

plt.tight_layout()
plt.show()

In [None]:
# Box plots: one panel per numeric feature, comparing the two hiring outcomes.
# Relies on `numerical_cols` defined in the histogram cell.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    sns.boxplot(data=df, x='hired', y=col, ax=ax, palette=['#ff6b6b', '#4ecdc4'])
    ax.set_title(f'{col}')
    ax.set_xticklabels(['Not Hired', 'Hired'])

plt.tight_layout()
plt.show()

## 4. Categorical Features Analysis

In [None]:
# Hiring rate per category for each categorical feature, one bar chart per panel
categorical_cols = ['education_level', 'job_category', 'company_size']

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

for ax, col in zip(axes, categorical_cols):
    # Mean of the target within each category, highest rate first
    rate_by_category = df.groupby(col)['hired'].mean()
    rate_by_category = rate_by_category.sort_values(ascending=False)

    sns.barplot(
        x=rate_by_category.index,
        y=rate_by_category.values,
        ax=ax,
        palette='viridis',
    )
    ax.set_title(f'Hiring Rate by {col}')
    ax.set_ylabel('Hiring Rate')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

# True on and above the diagonal, so only the lower triangle is drawn
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdYlBu_r',
    center=0,
    square=True,
)
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Correlation of every numeric feature with the target, largest value first.
# Relies on `numeric_df` from the correlation-matrix cell.
target_corr = (
    numeric_df.corr()['hired']
    .drop('hired')
    .sort_values(ascending=False)
)

# Teal for strictly positive correlations, red for everything else
colors = ['#ff6b6b' if not value > 0 else '#4ecdc4' for value in target_corr]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(target_corr.index, target_corr.values, color=colors)
ax.set_xlabel('Correlation with Hired')
ax.set_title('Feature Correlation with Target')
plt.tight_layout()
plt.show()

## 6. Key Insights

### Findings:
1. **Class Imbalance**: Dataset shows some imbalance between hired and not hired
2. **Strong Predictors**: Skills match score, interview score, and technical test score show positive correlation with hiring
3. **Experience**: Higher experience tends to increase hiring probability
4. **Education**: Master's and PhD holders have slightly higher hiring rates
5. **Referrals**: Having a referral positively impacts hiring chances

In [None]:
# Headline statistics for the dataset, collected in a dict and printed
# (nothing is written to disk here).
with_referral = df['has_referral'] == 1
without_referral = df['has_referral'] == 0

summary = {
    'total_applications': len(df),
    'hired_count': df['hired'].sum(),
    'hire_rate': df['hired'].mean(),
    'avg_experience': df['years_experience'].mean(),
    'avg_skills_match': df['skills_match_score'].mean(),
    'referral_hire_rate': df.loc[with_referral, 'hired'].mean(),
    'no_referral_hire_rate': df.loc[without_referral, 'hired'].mean(),
}

print("Dataset Summary:")
for key, value in summary.items():
    # Floats get three decimals; counts are printed as-is
    if isinstance(value, float):
        print(f"  {key}: {value:.3f}")
    else:
        print(f"  {key}: {value}")