# Campus Placement - EDA and Models

Notebook này thực hiện:
1. Exploratory Data Analysis (EDA)
2. Training các mô hình classification và regression
3. Đánh giá và so sánh kết quả

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn "whitegrid" theme, 10x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. Load Data

In [None]:
# Read the placement CSV (path is relative to the current working directory)
df = pd.read_csv('data/Placement_Data_Full_Class.csv')
n_rows, n_cols = df.shape
print(f"Dataset shape: {(n_rows, n_cols)}")
# Last expression of the cell: preview the first rows
df.head()

In [None]:
# Basic info: print the column names, dtypes and non-null counts of df
df.info()

In [None]:
# Statistical summary: descriptive statistics of df (last expression, so the
# notebook displays the resulting table)
df.describe()

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Count the missing entries in every column
missing_per_column = df.isna().sum()
print("Missing values:")
print(missing_per_column)

In [None]:
# Class balance of the target: bar chart, raw counts and overall placement rate
status_counts = df['status'].value_counts()

plt.figure(figsize=(8, 5))
ax = status_counts.plot(kind='bar', color=['skyblue', 'salmon'])
ax.set_title('Distribution of Placement Status')
ax.set_xlabel('Status')
ax.set_ylabel('Count')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

print(status_counts)
# Share of rows whose status is exactly 'Placed', as a percentage
n_placed = (df['status'] == 'Placed').sum()
placement_rate = n_placed / len(df) * 100
print(f"\nPlacement Rate: {placement_rate:.2f}%")

In [None]:
# Salary histogram and mean/median, restricted to rows with status 'Placed'
placed_salary = df.loc[df['status'] == 'Placed', 'salary']

plt.figure(figsize=(10, 5))
placed_salary.hist(bins=20, edgecolor='black', alpha=0.7)
plt.title('Salary Distribution (Placed Students)')
plt.xlabel('Salary')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

mean_salary = placed_salary.mean()
median_salary = placed_salary.median()
print(f"Mean Salary: {mean_salary:.2f}")
print(f"Median Salary: {median_salary:.2f}")

In [None]:
# Gender: overall counts (left panel) and per-gender status proportions (right panel)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_counts, ax_rates = axes

gender_counts = df['gender'].value_counts()
gender_counts.plot(kind='bar', ax=ax_counts, color=['lightblue', 'pink'])
ax_counts.set_title('Gender Distribution')
ax_counts.set_xlabel('Gender')
ax_counts.set_ylabel('Count')
ax_counts.tick_params(axis='x', rotation=0)

# normalize='index' turns each gender row into proportions that sum to 1
status_by_gender = pd.crosstab(df['gender'], df['status'], normalize='index')
status_by_gender.plot(kind='bar', ax=ax_rates, stacked=False)
ax_rates.set_title('Placement Status by Gender')
ax_rates.set_xlabel('Gender')
ax_rates.set_ylabel('Proportion')
ax_rates.tick_params(axis='x', rotation=0)
ax_rates.legend(title='Status')

plt.tight_layout()
plt.show()

In [None]:
# Proportion of each placement status within every work-experience group
status_by_workex = pd.crosstab(df['workex'], df['status'], normalize='index')
ax = status_by_workex.plot(kind='bar', figsize=(8, 5))
ax.set_title('Placement Status by Work Experience')
ax.set_xlabel('Work Experience')
ax.set_ylabel('Proportion')
plt.xticks(rotation=0)
plt.legend(title='Status')
plt.tight_layout()
plt.show()

In [None]:
# Histograms of the five score columns laid out on a 2x3 grid
academic_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# axes.flat walks the grid row by row, i.e. the same order as (idx // 3, idx % 3)
for ax, score_col in zip(axes.flat, academic_cols):
    ax.hist(df[score_col].dropna(), bins=20, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {score_col}')
    ax.set_xlabel(score_col)
    ax.set_ylabel('Frequency')

# Five columns on six axes: blank out the unused bottom-right panel
axes[1, 2].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

## 3. Data Preprocessing

In [None]:
# Preprocess data for classification
def preprocess_classification(df):
    """Build the feature matrix and binary target for placement prediction.

    Works on a copy of ``df``; the caller's frame is not modified. Text
    columns are whitespace-trimmed, ``status`` is mapped to 1 ('Placed') /
    0 ('Not Placed'), the five score columns are kept as they are and the
    categorical columns are one-hot encoded with the first level of each
    column dropped.

    Args:
        df: DataFrame holding ``status`` plus the numeric and categorical
            columns listed in the body. ``sl_no`` is optional.

    Returns:
        Tuple ``(X, y)``: the feature DataFrame and the integer target
        Series. Both carry a fresh 0..n-1 index so they stay row-aligned
        even when ``df`` has a non-default index.

    Raises:
        ValueError: if ``status`` contains anything other than 'Placed' or
            'Not Placed' after trimming (missing values included).
    """
    data = df.copy()

    # Drop sl_no when present (presumably a serial number; it is not a feature)
    if 'sl_no' in data.columns:
        data = data.drop(columns=['sl_no'])

    # Trim whitespace on text columns, covering both object dtype and the
    # dedicated pandas string dtypes
    for col in data.columns:
        series = data[col]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            data[col] = series.str.strip()

    # Map status to binary and fail with a readable message on unknown labels
    # instead of a NaN-to-int cast error further down
    status_bin = data['status'].map({'Placed': 1, 'Not Placed': 0})
    unmapped = status_bin.isna()
    if unmapped.any():
        bad_values = sorted(data.loc[unmapped, 'status'].astype(str).unique())
        raise ValueError(f"Unexpected values in 'status': {bad_values}")
    data['status_bin'] = status_bin

    # Define features
    numeric_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
    cat_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']

    # Prepare X and y; every piece is re-indexed so rows line up positionally
    X_num = data[numeric_cols].reset_index(drop=True)
    X_cat = pd.get_dummies(data[cat_cols].astype(str), drop_first=True).reset_index(drop=True)
    X = pd.concat([X_num, X_cat], axis=1)
    y = data['status_bin'].astype(int).reset_index(drop=True)

    return X, y

# Build the classification features and target from the raw frame, then report sizes
X, y = preprocess_classification(df)
for label, part in (("Feature", X), ("Target", y)):
    print(f"{label} shape: {part.shape}")

## 4. Classification Models (Predicting Placement Status)

In [None]:
# Hold out 20% of the rows for testing; stratify=y is passed so the split is
# stratified on the target labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
for split_name, split_X in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name} set: {split_X.shape}")

In [None]:
# Logistic regression behind a StandardScaler, bundled in one pipeline
scale_step = ('scaler', StandardScaler())
clf_step = ('clf', LogisticRegression(max_iter=1000, random_state=42))
log_model = Pipeline(steps=[scale_step, clf_step])
log_model.fit(X_train, y_train)

# Test-set predictions are kept for the confusion-matrix cell
log_pred = log_model.predict(X_test)
log_accuracy = log_model.score(X_test, y_test)

print("=== Logistic Regression ===")
print(classification_report(y_test, log_pred, digits=4))
print(f"Accuracy: {log_accuracy:.4f}")

In [None]:
# Random forest classifier with 200 trees, trained on the unscaled features
rf_model = RandomForestClassifier(random_state=42, n_estimators=200)
rf_model.fit(X_train, y_train)

# Test-set predictions are kept for the confusion-matrix cell
rf_pred = rf_model.predict(X_test)
rf_accuracy = rf_model.score(X_test, y_test)

print("=== Random Forest ===")
print(classification_report(y_test, rf_pred, digits=4))
print(f"Accuracy: {rf_accuracy:.4f}")

In [None]:
# Side-by-side confusion matrices for both classifiers on the test set
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel: display name, test-set predictions, colour map
panels = [
    ('Logistic Regression', log_pred, 'Blues'),
    ('Random Forest', rf_pred, 'Greens'),
]
for ax, (model_name, preds, cmap_name) in zip(axes, panels):
    sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt='d', cmap=cmap_name, ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

plt.tight_layout()
plt.show()

In [None]:
# Rank the features by the random forest's importance scores and plot the top 10
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)
top_features = feature_importance.head(10)

plt.figure(figsize=(10, 6))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Importance')
plt.title('Top 10 Feature Importances (Random Forest)')
# Flip the y axis so the most important feature is drawn at the top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 5. Regression Models (Predicting Salary for Placed Students)

In [None]:
# Preprocess data for regression (only placed students)
def preprocess_regression(df):
    """Build the feature matrix and salary target for placed students.

    Works on a copy of ``df``; the caller's frame is not modified. Text
    columns are whitespace-trimmed first, then only rows whose ``status``
    equals 'Placed' are kept. ``salary`` is coerced to numeric and rows
    without a usable salary are dropped. The five score columns are kept as
    they are and the categorical columns are one-hot encoded with the first
    level of each column dropped.

    Args:
        df: DataFrame holding ``status``, ``salary`` and the numeric and
            categorical columns listed in the body. ``sl_no`` is optional.

    Returns:
        Tuple ``(X, y)``: the feature DataFrame and the float salary Series.
        Both carry a fresh 0..n-1 index so they stay row-aligned after the
        filtering steps.
    """
    data = df.copy()

    # Drop sl_no when present (presumably a serial number; it is not a feature)
    if 'sl_no' in data.columns:
        data = data.drop(columns=['sl_no'])

    # Trim whitespace BEFORE filtering on status, otherwise values such as
    # ' Placed' would be silently excluded; covers both object dtype and the
    # dedicated pandas string dtypes
    for col in data.columns:
        series = data[col]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            data[col] = series.str.strip()

    # Filter placed students only
    data = data[data['status'] == 'Placed'].copy()

    # Ensure salary is numeric; unparsable entries become NaN and are dropped
    data['salary'] = pd.to_numeric(data['salary'], errors='coerce')
    data = data.dropna(subset=['salary'])

    # Define features
    numeric_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
    cat_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']

    # Prepare X and y; every piece is re-indexed so rows line up positionally
    X_num = data[numeric_cols].reset_index(drop=True)
    X_cat = pd.get_dummies(data[cat_cols].astype(str), drop_first=True).reset_index(drop=True)
    X = pd.concat([X_num, X_cat], axis=1)
    y = data['salary'].astype(float).reset_index(drop=True)

    return X, y

# Build the salary-regression features and target, then report sizes
X_reg, y_reg = preprocess_regression(df)
for label, part in (("feature", X_reg), ("target", y_reg)):
    print(f"Regression {label} shape: {part.shape}")

In [None]:
# Hold out 20% of the regression rows for testing (no stratification here)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)
for split_name, split_X in (("Train", Xr_train), ("Test", Xr_test)):
    print(f"{split_name} set: {split_X.shape}")

In [None]:
# Random forest regressor with 200 trees for salary prediction
rf_reg = RandomForestRegressor(random_state=42, n_estimators=200)
rf_reg.fit(Xr_train, yr_train)
rf_reg_pred = rf_reg.predict(Xr_test)

# Test-set error metrics; RMSE is the square root of the mean squared error
reg_mae = mean_absolute_error(yr_test, rf_reg_pred)
reg_rmse = np.sqrt(mean_squared_error(yr_test, rf_reg_pred))
reg_r2 = r2_score(yr_test, rf_reg_pred)

print("=== Random Forest Regressor ===")
print(f"MAE: {reg_mae:.2f}")
print(f"RMSE: {reg_rmse:.2f}")
print(f"R2 Score: {reg_r2:.4f}")

In [None]:
# Scatter of predicted against actual test salaries with a y = x reference line
salary_lo, salary_hi = yr_test.min(), yr_test.max()

plt.figure(figsize=(8, 8))
plt.scatter(yr_test, rf_reg_pred, alpha=0.6)
# Points on the dashed red diagonal are predictions equal to the actual value
plt.plot([salary_lo, salary_hi], [salary_lo, salary_hi], '--', color='red', linewidth=2)
plt.xlabel('Actual Salary')
plt.ylabel('Predicted Salary')
plt.title('Predicted vs Actual Salary (Random Forest Regressor)')
plt.tight_layout()
plt.show()

## 6. Save Models

In [None]:
# Persist all three fitted models (both classifiers and the salary regressor)
import os

os.makedirs('outputs', exist_ok=True)

# (fitted model, destination file) pairs, written in this order
artifacts = (
    (log_model, 'outputs/logistic_regression_clf.joblib'),
    (rf_model, 'outputs/random_forest_clf.joblib'),
    (rf_reg, 'outputs/random_forest_reg.joblib'),
)
for fitted_model, target_path in artifacts:
    joblib.dump(fitted_model, target_path)

print("Models saved successfully!")

## 7. Summary

### Key Findings:
1. **Placement Rate**: Tỷ lệ sinh viên được tuyển dụng được tính ở mục 2 (EDA), bằng số sinh viên có `status` là `Placed` chia cho tổng số sinh viên
2. **Important Features**: Work experience, academic scores, và specialisation có ảnh hưởng quan trọng
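3. **Model Performance**: 
   - Classification: Random Forest và Logistic Regression đều cho kết quả tốt (xem classification report, accuracy và confusion matrix trên tập test ở mục 4)
   - Regression: Random Forest Regressor dự đoán mức lương khá chính xác (xem MAE, RMSE và R2 trên tập test ở mục 5)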

### Next Steps:
- Thử nghiệm thêm các mô hình khác (XGBoost, SVM, etc.)
- Hyperparameter tuning để cải thiện performance
- Feature engineering để tạo thêm các features hữu ích