# Campus Placement Prediction - Exploratory Data Analysis (EDA)

This notebook performs exploratory data analysis on the Campus Placement dataset.

## Objectives:
1. Load and inspect the dataset
2. Understand the data structure and types
3. Analyze missing values and data quality
4. Perform univariate analysis
5. Perform bivariate analysis
6. Visualize relationships between features and target
7. Identify key insights for modeling

## 1. Setup and Imports

In [None]:
# Import required libraries
import sys
import os

# Add src directory to path: this resolves to <parent of the current working
# directory>/src and must run before `from data import ...` below.
# NOTE(review): assumes the notebook is launched from a folder that sits next
# to src/ and that the `data` module lives there -- confirm.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from data import load_data, display_data_info, RANDOM_SEED

# Set random seed for reproducibility (this seeds NumPy's global generator only)
np.random.seed(RANDOM_SEED)

# Configure visualization settings
# The matplotlib style is applied first and the seaborn palette second; keep
# this order, since both calls adjust plotting defaults.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython magic (not plain Python): only valid inside a notebook/IPython session
%matplotlib inline

# Display settings: no limit on printed columns, at most 100 printed rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Confirm the cell ran and report which seed is in effect
print("Setup complete!")
print(f"Random seed: {RANDOM_SEED}")

## 2. Load Dataset

In [None]:
# Read the dataset into `df`. When the file is missing, the error and a
# download hint are printed and `df` is set to None so that later cells
# (which all check `df is not None`) are skipped instead of failing.
try:
    loaded_frame = load_data()
except FileNotFoundError as missing_file:
    print(missing_file)
    print("\nPlease download the dataset following the instructions in data/README.md")
    df = None
else:
    df = loaded_frame

## 3. Initial Data Inspection

In [None]:
# Display comprehensive data information
# Skipped when the load step left `df` as None (dataset file not found).
# NOTE(review): `display_data_info` comes from the project's `data` module;
# its exact output is not visible here.
if df is not None:
    display_data_info(df)

## 4. Target Variable Analysis

In [None]:
# Summarise the target column `status`: print absolute and percentage
# frequencies, then draw a bar chart and a pie chart side by side.
if df is not None:
    print("Target Variable (status) Distribution:")
    status_counts = df['status'].value_counts()
    print(status_counts)
    print("\nPercentage Distribution:")
    print(df['status'].value_counts(normalize=True) * 100)

    # One row, two panels: counts on the left, proportions on the right
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    count_ax, share_ax = axes

    # Bar chart of class counts
    status_counts.plot(kind='bar', ax=count_ax)
    count_ax.set_title('Placement Status Distribution')
    count_ax.set_xlabel('Status')
    count_ax.set_ylabel('Count')

    # Pie chart of the same counts; the default y-label is blanked out
    status_counts.plot(kind='pie', ax=share_ax, autopct='%1.1f%%')
    share_ax.set_title('Placement Status Proportion')
    share_ax.set_ylabel('')

    plt.tight_layout()
    plt.show()

## 5. Numerical Features Analysis

In [None]:
# List the numeric columns (leaving out the `sl_no` column) and print their
# summary statistics. `numerical_cols` is reused by the correlation cell.
if df is not None:
    numeric_frame = df.select_dtypes(include=[np.number])
    numerical_cols = [name for name in numeric_frame.columns if name != 'sl_no']

    print(f"Numerical Features: {numerical_cols}")
    print(df[numerical_cols].describe())

## 6. Categorical Features Analysis

In [None]:
# List the object-dtype columns (leaving out the target `status`) and print
# the value frequencies of each one.
if df is not None:
    object_frame = df.select_dtypes(include=['object'])
    categorical_cols = [name for name in object_frame.columns if name != 'status']

    print(f"Categorical Features: {categorical_cols}")
    for col in categorical_cols:
        print(f"\n{col}:")
        level_counts = df[col].value_counts()
        print(level_counts)

## 7. Correlation Analysis

In [None]:
# Correlation heatmap of the numeric features (without `salary`) together
# with a 0/1 encoding of the target. Relies on `numerical_cols` from the
# numerical-features cell.
if df is not None:
    # Work on a copy so the encoded helper column never lands in `df`
    df_corr = df.copy()
    is_placed = df_corr['status'].eq('Placed')
    df_corr['status_encoded'] = is_placed.astype(int)

    # Every numeric feature except `salary`, followed by the encoded target
    corr_cols = [name for name in numerical_cols if name != 'salary']
    corr_cols.append('status_encoded')
    corr_matrix = df_corr[corr_cols].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

## 8. Key Insights

In [None]:
# Print a short recap: dataset shape, share of rows with status 'Placed',
# and the total number of missing cells.
if df is not None:
    print("Key Insights:")

    dataset_shape = df.shape
    print(f"- Dataset size: {dataset_shape}")

    # Mean of a boolean mask is the fraction of True rows; scale to percent
    placement_pct = (df['status'] == 'Placed').mean() * 100
    print(f"- Placement rate: {placement_pct:.1f}%")

    # Sum of per-column null counts gives the overall number of missing cells
    total_missing = df.isnull().sum().sum()
    print(f"- Missing values: {total_missing}")