# Campus Placement Analysis - EDA and Models

A comprehensive notebook for exploratory data analysis and for training models that predict campus placement outcomes.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

# XGBoost is an optional dependency: record whether it could be imported so
# later cells can skip the XGBoost models instead of failing.
try:
    from xgboost import XGBClassifier, XGBRegressor
except Exception:
    # ``except Exception`` instead of a bare ``except`` so that
    # KeyboardInterrupt / SystemExit still propagate. It is kept broader than
    # ImportError on purpose: any failure while importing the package still
    # degrades to "not available", as it did before.
    xgboost_available = False
    print("XGBoost not available")
else:
    xgboost_available = True

# Global plot appearance: seaborn's 'whitegrid' style and a 10x6 default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 2. Load Data

In [None]:
# Read the placement records from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='Placement_Data_Full_Class.csv')
print(f"Dataset shape: {df.shape}")
# Last expression of the cell: the notebook renders the first rows
df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Structural overview first (df.info() prints directly), then summary statistics
print("Dataset Info:")
df.info()

print("\nDataset Description:")
summary_stats = df.describe()
# Last expression of the cell: the notebook renders the summary table
summary_stats

In [None]:
# Missing values: per-column counts, then the grand total of those counts
missing_per_column = df.isnull().sum()

print("Missing values:")
print(missing_per_column)
print(f"\nTotal missing: {missing_per_column.sum()}")

In [None]:
# Placement status: class counts and the share of rows labelled 'Placed'
status_counts = df['status'].value_counts()
placed_share = (df['status'] == 'Placed').mean()

print("Placement Status Distribution:")
print(status_counts)
print(f"\nPlacement rate: {placed_share*100:.2f}%")

# Same counts drawn twice: bar chart on the left, pie chart on the right
status_colors = ['skyblue', 'salmon']
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
bar_ax, pie_ax = ax

status_counts.plot(kind='bar', ax=bar_ax, color=status_colors)
bar_ax.set_title('Placement Status Count')
bar_ax.set_xlabel('Status')
bar_ax.set_ylabel('Count')

status_counts.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%', colors=status_colors)
pie_ax.set_title('Placement Status Percentage')
pie_ax.set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Salary is only examined for rows whose status is 'Placed'
placed_df = df.loc[df['status'] == 'Placed'].copy()
placed_salary = placed_df['salary']

print(f"Number of placed students: {len(placed_df)}")
print("\nSalary statistics:")
print(placed_salary.describe())

# Histogram with a KDE overlay; missing salaries are dropped before plotting
plt.figure(figsize=(10, 5))
salary_ax = sns.histplot(placed_salary.dropna(), kde=True, bins=20, color='green')
salary_ax.set_title('Salary Distribution for Placed Students')
salary_ax.set_xlabel('Salary')
salary_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Gender: raw counts first
print("Gender distribution:")
print(df['gender'].value_counts())

# Row-normalised crosstab: each gender's rows sum to 100 %
gender_placement = pd.crosstab(df['gender'], df['status'], normalize='index').mul(100)
print("\nPlacement rate by gender (%):")
print(gender_placement)

gender_ax = gender_placement.plot(kind='bar', figsize=(8, 5), color=['salmon', 'skyblue'])
gender_ax.set_title('Placement Rate by Gender')
gender_ax.set_xlabel('Gender')
gender_ax.set_ylabel('Percentage')
plt.xticks(rotation=0)
gender_ax.legend(title='Status')
plt.tight_layout()
plt.show()

In [None]:
# Work experience vs. placement: row-normalised crosstab, in percent
workex_placement = pd.crosstab(df['workex'], df['status'], normalize='index').mul(100)
print("Placement rate by work experience (%):")
print(workex_placement)

workex_ax = workex_placement.plot(kind='bar', figsize=(8, 5), color=['salmon', 'skyblue'])
workex_ax.set_title('Placement Rate by Work Experience')
workex_ax.set_xlabel('Work Experience')
workex_ax.set_ylabel('Percentage')
plt.xticks(rotation=0)
workex_ax.legend(title='Status')
plt.tight_layout()
plt.show()

In [None]:
# Percentage-score columns; this list is reused by the correlation cell
numeric_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']

# One box plot per score column, split by placement status, on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for score_ax, score_col in zip(axes, numeric_cols):
    df.boxplot(column=score_col, by='status', ax=score_ax)
    score_ax.set_title(f'{score_col} by Placement Status')
    score_ax.set_xlabel('Status')
    score_ax.set_ylabel(score_col)

# Five columns on six panels: the sixth panel stays empty, so remove it
fig.delaxes(axes[5])
# Blank out the figure-level title (presumably the automatic "grouped by"
# heading that the pandas boxplot call adds)
fig.suptitle('')
plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap of the score columns plus a 0/1 placement flag.
# assign() returns a new frame, so the original df is left untouched.
df_corr = df.assign(status_bin=df['status'].map({'Placed': 1, 'Not Placed': 0}))

# Score columns followed by the binary target
corr_cols = [*numeric_cols, 'status_bin']
correlation_matrix = df_corr[corr_cols].corr()

plt.figure(figsize=(10, 8))
heat_ax = sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
heat_ax.set_title('Correlation Matrix')
plt.tight_layout()
plt.show()

## 4. Data Preprocessing

In [None]:
def preprocess_data(df, for_regression=False):
    """
    Clean the raw placement table for classification or regression.

    The input frame is not modified; all work happens on a copy. Steps:

    1. Drop the ``sl_no`` column if it is present.
    2. Strip surrounding whitespace from every object-dtype column.
    3. If a ``status`` column exists, add ``status_bin``: 1 for 'Placed',
       0 for 'Not Placed' (any other value maps to NaN).
    4. Only when ``for_regression`` is True: keep rows whose status is
       'Placed', convert ``salary`` to numeric and drop every row whose
       salary is missing or could not be parsed as a number.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. ``status`` and ``salary`` columns are required when
        ``for_regression`` is True.
    for_regression : bool, default False
        Select the regression variant described in step 4.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    data = df.copy()

    # Drop sl_no if exists
    if 'sl_no' in data.columns:
        data = data.drop(columns=['sl_no'])

    # Trim whitespace
    for c in data.select_dtypes(include='object').columns:
        data[c] = data[c].str.strip()

    # Create binary status
    if 'status' in data.columns:
        data['status_bin'] = data['status'].map({'Placed': 1, 'Not Placed': 0})

    # For regression, filter only placed students
    if for_regression:
        data = data[data['status'] == 'Placed'].copy()
        # Coerce first, then drop: errors='coerce' turns unparseable salaries
        # into NaN, so the dropna has to run afterwards or those NaNs would
        # end up in the regression target.
        data['salary'] = pd.to_numeric(data['salary'], errors='coerce')
        data = data.dropna(subset=['salary'])

    return data

def prepare_features(df, numeric_cols, cat_cols, target_col):
    """
    Build the feature matrix and target vector for modeling.

    Numeric columns are passed through unchanged. Categorical columns are cast
    to ``str`` and one-hot encoded with the first level of each column dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed data containing all referenced columns.
    numeric_cols : sequence of str
        Columns copied into the feature matrix as-is.
    cat_cols : sequence of str
        Columns to one-hot encode; may be empty.
    target_col : str
        Name of the target column.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` and ``y``, both re-indexed 0..n-1 so that they line up by label
        as well as by position.
    """
    # Reset the target's index together with the features'. Otherwise y keeps
    # the caller's (possibly filtered, non-contiguous) index while X is
    # 0..n-1, and any label-aligned operation on the pair would misalign rows.
    y = df[target_col].reset_index(drop=True)

    # Numeric features
    X_num = df[list(numeric_cols)].reset_index(drop=True)

    # Categorical features - one-hot encoding
    if cat_cols:
        X_cat = pd.get_dummies(df[list(cat_cols)].astype(str), drop_first=True)
        X_cat = X_cat.reset_index(drop=True)
    else:
        # No categorical columns: an empty frame with matching rows
        X_cat = pd.DataFrame(index=X_num.index)

    # Combine
    X = pd.concat([X_num, X_cat], axis=1)

    return X, y

## 5. Classification - Predict Placement Status

In [None]:
# Classification data: cleaned table with all students (no salary filtering)
data_class = preprocess_data(df)

# Model inputs: percentage scores plus the categorical background columns
numeric_features = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
categorical_features = [
    'gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation',
]

# Feature matrix and 0/1 target (status_bin is cast to int)
X_class, y_class = prepare_features(
    data_class,
    numeric_cols=numeric_features,
    cat_cols=categorical_features,
    target_col='status_bin',
)
y_class = y_class.astype(int)

print(f"Feature matrix shape: {X_class.shape}")
print(f"Target shape: {y_class.shape}")
print(f"\nFeature columns: {list(X_class.columns)}")

In [None]:
# 80/20 split with a fixed seed, stratified on the target so both parts keep
# the class proportions
split_options = dict(test_size=0.2, random_state=42, stratify=y_class)
X_train, X_test, y_train, y_test = train_test_split(X_class, y_class, **split_options)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print("\nClass distribution in training set:")
print(y_train.value_counts())

In [None]:
# Logistic regression with standardised inputs; scaling lives inside the
# pipeline, so it is fitted on the training data only
print("Training Logistic Regression...")
scaler_step = ('scaler', StandardScaler())
clf_step = ('clf', LogisticRegression(max_iter=1000, random_state=42))
log_model = Pipeline(steps=[scaler_step, clf_step])

# fit() returns the pipeline itself, so prediction can be chained
log_pred = log_model.fit(X_train, y_train).predict(X_test)
log_acc = accuracy_score(y_test, log_pred)

print(f"Logistic Regression Accuracy: {log_acc:.4f}")
print("\nClassification Report:")
log_report = classification_report(y_test, log_pred, target_names=['Not Placed', 'Placed'])
print(log_report)

In [None]:
# Random forest classifier: 200 trees, fixed seed, evaluated on the held-out set
print("Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)

print(f"Random Forest Accuracy: {rf_acc:.4f}")
print("\nClassification Report:")
rf_report = classification_report(y_test, rf_pred, target_names=['Not Placed', 'Placed'])
print(rf_report)

In [None]:
# XGBoost classifier; runs only when the optional import succeeded
if not xgboost_available:
    print("XGBoost not available")
else:
    print("Training XGBoost...")
    xgb_model = XGBClassifier(n_estimators=200, eval_metric='logloss', random_state=42)
    xgb_model.fit(X_train, y_train)

    xgb_pred = xgb_model.predict(X_test)
    xgb_acc = accuracy_score(y_true=y_test, y_pred=xgb_pred)

    print(f"XGBoost Accuracy: {xgb_acc:.4f}")
    print("\nClassification Report:")
    xgb_report = classification_report(y_test, xgb_pred, target_names=['Not Placed', 'Placed'])
    print(xgb_report)

In [None]:
# Confusion matrices for every fitted classifier, one panel per model
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# The target is status_bin cast to int: 0 = 'Not Placed', 1 = 'Placed'.
# Passing labels explicitly keeps every matrix 2x2 with the same row/column
# order even if one class is absent from y_test or from the predictions.
class_ids = [0, 1]
class_names = ['Not Placed', 'Placed']

cm_log = confusion_matrix(y_test, log_pred, labels=class_ids)
cm_rf = confusion_matrix(y_test, rf_pred, labels=class_ids)

# (panel title, matrix, accuracy) in left-to-right order
panels = [
    ('Logistic Regression', cm_log, log_acc),
    ('Random Forest', cm_rf, rf_acc),
]
if xgboost_available:
    cm_xgb = confusion_matrix(y_test, xgb_pred, labels=class_ids)
    panels.append(('XGBoost', cm_xgb, xgb_acc))

for panel_ax, (model_name, matrix, acc) in zip(axes, panels):
    sns.heatmap(
        matrix, annot=True, fmt='d', cmap='Blues', ax=panel_ax,
        xticklabels=class_names, yticklabels=class_names,
    )
    panel_ax.set_title(f'{model_name}\nAccuracy: {acc:.4f}')
    panel_ax.set_xlabel('Predicted')
    panel_ax.set_ylabel('Actual')

if not xgboost_available:
    # Placeholder panel. Axes-fraction coordinates keep the message centred
    # whatever the data limits are (same approach as the regression plots).
    axes[2].text(0.5, 0.5, 'XGBoost not available', ha='center', va='center',
                 transform=axes[2].transAxes)
    axes[2].set_xticks([])
    axes[2].set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
# Random-forest feature importances, largest first
feature_importance = (
    pd.DataFrame({
        'feature': X_class.columns,
        'importance': rf_model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)
top_features = feature_importance.head(10)

print("Top 10 Most Important Features:")
print(top_features)

plt.figure(figsize=(10, 6))
importance_ax = sns.barplot(data=top_features, x='importance', y='feature', palette='viridis')
importance_ax.set_title('Top 10 Feature Importances (Random Forest)')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
plt.tight_layout()
plt.show()

## 6. Regression - Predict Salary for Placed Students

In [None]:
# Regression data: placed students with a salary value
data_reg = preprocess_data(df, for_regression=True)

print(f"Number of placed students with salary data: {len(data_reg)}")

# Fewer than 5 usable rows: skip regression (later cells repeat this check)
if len(data_reg) >= 5:
    # Same feature columns as classification; the target is the salary
    X_reg, y_reg = prepare_features(
        data_reg,
        numeric_cols=numeric_features,
        cat_cols=categorical_features,
        target_col='salary',
    )
    y_reg = y_reg.astype(float)

    print(f"\nRegression feature matrix shape: {X_reg.shape}")
    print(f"Regression target shape: {y_reg.shape}")
else:
    print("Not enough data for regression. Skipping.")

In [None]:
if len(data_reg) >= 5:
    # 80/20 split with a fixed seed (not stratified: the target is continuous)
    reg_split_options = dict(test_size=0.2, random_state=42)
    Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, **reg_split_options)

    print(f"Regression training set size: {Xr_train.shape}")
    print(f"Regression test set size: {Xr_test.shape}")

In [None]:
if len(data_reg) >= 5:
    # Random forest regressor: 200 trees, fixed seed
    print("Training Random Forest Regressor...")
    rfr_model = RandomForestRegressor(n_estimators=200, random_state=42)
    rfr_model.fit(Xr_train, yr_train)
    rfr_pred = rfr_model.predict(Xr_test)

    # Held-out error metrics; RMSE is the square root of the MSE
    rfr_mae = mean_absolute_error(yr_test, rfr_pred)
    rfr_mse = mean_squared_error(yr_test, rfr_pred)
    rfr_rmse = np.sqrt(rfr_mse)
    rfr_r2 = r2_score(yr_test, rfr_pred)

    print("Random Forest Regressor:")
    print(f"  MAE: {rfr_mae:.2f}")
    print(f"  RMSE: {rfr_rmse:.2f}")
    print(f"  R² Score: {rfr_r2:.4f}")

In [None]:
if xgboost_available and len(data_reg) >= 5:
    # XGBoost regressor: needs both the optional package and enough rows
    print("Training XGBoost Regressor...")
    xgbr_model = XGBRegressor(n_estimators=200, random_state=42)
    xgbr_model.fit(Xr_train, yr_train)
    xgbr_pred = xgbr_model.predict(Xr_test)

    # Held-out error metrics; RMSE is the square root of the MSE
    xgbr_mae = mean_absolute_error(yr_test, xgbr_pred)
    xgbr_mse = mean_squared_error(yr_test, xgbr_pred)
    xgbr_rmse = np.sqrt(xgbr_mse)
    xgbr_r2 = r2_score(yr_test, xgbr_pred)

    print("XGBoost Regressor:")
    print(f"  MAE: {xgbr_mae:.2f}")
    print(f"  RMSE: {xgbr_rmse:.2f}")
    print(f"  R² Score: {xgbr_r2:.4f}")

In [None]:
if len(data_reg) >= 5:
    # Predicted vs. actual salary, one panel per regressor
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # (axes, predictions, title) for every model that was actually trained
    scatter_panels = [
        (axes[0], rfr_pred, f'Random Forest Regressor\nR² = {rfr_r2:.4f}'),
    ]
    if xgboost_available:
        scatter_panels.append(
            (axes[1], xgbr_pred, f'XGBoost Regressor\nR² = {xgbr_r2:.4f}')
        )
    else:
        # Placeholder panel when the XGBoost model does not exist
        axes[1].text(0.5, 0.5, 'XGBoost not available', ha='center', va='center', transform=axes[1].transAxes)
        axes[1].set_xticks([])
        axes[1].set_yticks([])

    for panel_ax, predicted, panel_title in scatter_panels:
        panel_ax.scatter(yr_test, predicted, alpha=0.6)
        # Red dashed y = x reference line, extended 5 % past the value range
        maxv = max(yr_test.max(), predicted.max()) * 1.05
        minv = min(yr_test.min(), predicted.min()) * 0.95
        panel_ax.plot([minv, maxv], [minv, maxv], '--', color='red', linewidth=2)
        panel_ax.set_xlabel('Actual Salary')
        panel_ax.set_ylabel('Predicted Salary')
        panel_ax.set_title(panel_title)
        panel_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 7. Summary and Conclusions

In [None]:
# Final report: recap the scores computed in earlier cells, then print the
# written takeaways. `rule` is the 60-character separator line.
rule = "=" * 60

print(rule)
print("CLASSIFICATION RESULTS (Placement Prediction)")
print(rule)
print(f"Logistic Regression Accuracy: {log_acc:.4f}")
print(f"Random Forest Accuracy: {rf_acc:.4f}")
if xgboost_available:
    print(f"XGBoost Accuracy: {xgb_acc:.4f}")

# Regression scores exist only when the regression cells actually ran
if len(data_reg) >= 5:
    print("\n" + rule)
    print("REGRESSION RESULTS (Salary Prediction for Placed Students)")
    print(rule)
    print(f"Random Forest Regressor R² Score: {rfr_r2:.4f}")
    if xgboost_available:
        print(f"XGBoost Regressor R² Score: {xgbr_r2:.4f}")

print("\n" + rule)
print("KEY INSIGHTS")
print(rule)
# Fixed text: these statements are hard-coded, not derived from the metrics
for insight in (
    "1. Academic performance (especially etest_p, mba_p) is highly correlated with placement",
    "2. Work experience significantly increases placement chances",
    "3. Tree-based models (Random Forest, XGBoost) generally perform well",
    "4. The models can effectively predict both placement status and salary",
):
    print(insight)

## 8. Next Steps

- Experiment with hyperparameter tuning (GridSearchCV, RandomizedSearchCV)
- Try ensemble methods combining multiple models
- Feature engineering: create interaction features, polynomial features
- Handle class imbalance if present (SMOTE, class weights)
- Cross-validation for more robust evaluation
- Deploy the best model for real-time predictions