In [15]:
# Install LightGBM if not available
try:
    import lightgbm as lgb
except ImportError:
    !pip install lightgbm
    import lightgbm as lgb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import (accuracy_score, classification_report, 
                           mean_squared_error, r2_score, mean_absolute_error)
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")

# 1. Load and prepare data
# NOTE(review): absolute, machine-specific path — the notebook only runs as-is
# on the original author's machine.
file_path = r'C:\Users\VIBHANSHU JAIN\Desktop\Client Project\campus-placement-analysis\EDA_Notebooks\datasets\cleaned_placement_dataset.csv'
data = pd.read_csv(file_path)


def _weighted_sum(frame, weights):
    """Return the left-to-right sum of ``frame[col] * weight`` over *weights*.

    Terms are accumulated in dict order so the floating-point result matches
    writing the expression out by hand as ``a*w1 + b*w2 + ...``.
    """
    terms = [frame[col] * weight for col, weight in weights.items()]
    total = terms[0]
    for term in terms[1:]:
        total = total + term
    return total


# Feature engineering: collapse related raw columns into composite scores.
data['tier_score'] = _weighted_sum(
    data, {'tier_1': 1.5, 'tier_2': 1.0, 'tier_3': 0.5}
)
data['core_skills'] = _weighted_sum(
    data, {'dsa': 0.5, 'web_dev': 0.3, 'mobile_dev': 0.2}
)
data['academic_strength'] = _weighted_sum(
    data, {'cgpa': 0.7, 'inter_gpa': 0.2, 'ssc_gpa': 0.1}
)

# log1p(internships) scaled by sqrt(projects + 1).
log_internships = np.log1p(data['internships'])
sqrt_projects = np.sqrt(data['no_of_projects'] + 1)
data['experience_score'] = log_internships * sqrt_projects

# Separate features and targets
feature_columns = [
    'tier_score',
    'core_skills',
    'academic_strength',
    'experience_score',
    'no_of_programming_languages',
    'is_participate_hackathon',
]
X = data[feature_columns]
y_class, y_reg = data['is_placed'], data['salary_as_fresher']

# 2. Split data
# One call splits the features and both targets together so their rows stay
# paired; stratifying on the placement label keeps the class ratio the same in
# the train and test splits.
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(
    X, y_class, y_reg, test_size=0.15, random_state=42, stratify=y_class
)

# 3. Handle class imbalance
# Resample only when the positive-class share of the training labels falls
# outside the 40%-60% band.
placed_ratio = y_class_train.mean()
if placed_ratio < 0.4 or placed_ratio > 0.6:
    print("Applying SMOTE to handle class imbalance...")
    smote = SMOTE(random_state=42)
    X_train, y_class_train = smote.fit_resample(X_train, y_class_train)

    # Align y_reg_train with the resampled X_train.
    # NOTE(review): this assumes fit_resample returns the original rows first,
    # in their original order, followed by the synthetic rows — confirm against
    # the installed imbalanced-learn version.
    X_train = X_train.reset_index(drop=True)
    y_class_train = y_class_train.reset_index(drop=True)

    # Synthetic rows have no observed salary, so pad the regression target
    # with NaN. ignore_index=True gives the result a clean 0..N-1 index;
    # without it the padding would reuse labels 0..k-1 and the target would
    # carry duplicate index labels that no longer line up with X_train.
    n_synthetic = len(X_train) - len(y_reg_train)
    y_reg_train = pd.concat(
        [
            y_reg_train.reset_index(drop=True),
            pd.Series([np.nan] * n_synthetic, dtype=float),
        ],
        ignore_index=True,
    )

# 4. Placement Classifier
print("\nTraining Placement Classifier...")

# Hyper-parameters for the binary placed / not-placed model.
classifier_params = {
    'objective': 'binary',
    'n_estimators': 200,
    'learning_rate': 0.05,
    'max_depth': 5,
    'num_leaves': 31,
    'min_child_samples': 20,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1,
}
lgb_classifier = lgb.LGBMClassifier(**classifier_params)

# Stop once the validation metric has not improved for 20 rounds and log the
# metric every 10 rounds.
classifier_callbacks = [
    lgb.early_stopping(stopping_rounds=20),
    lgb.log_evaluation(period=10),
]

# NOTE(review): the held-out test split doubles as the early-stopping
# validation set here, so the test metrics reported later are not fully
# independent of training — consider a separate validation split.
lgb_classifier.fit(
    X_train,
    y_class_train,
    eval_set=[(X_test, y_class_test)],
    eval_metric='auc',
    callbacks=classifier_callbacks,
)

# 5. Salary Prediction (Only for placed students)
# Boolean masks selecting the placed (label == 1) rows of each split.
placed_train_mask = y_class_train == 1
placed_test_mask = y_class_test == 1

# Only fit a regressor when there are more than 10 placed training rows.
if placed_train_mask.sum() > 10:
    print(f"\nTraining Salary Predictor with {placed_train_mask.sum()} samples...")

    # Prepare data.
    # Select rows by position (plain NumPy boolean arrays) instead of by index
    # label: after resampling, y_reg_train can carry duplicate index labels,
    # and label-aligned boolean indexing would then pick the wrong rows or
    # raise an IndexingError.
    train_rows = placed_train_mask.to_numpy()
    X_train_placed = X_train[train_rows].reset_index(drop=True)
    y_reg_train_placed = y_reg_train[train_rows].reset_index(drop=True)

    # Remove samples with NaN salaries (e.g. NaN-padded synthetic rows).
    valid_rows = y_reg_train_placed.notna().to_numpy()
    X_train_placed = X_train_placed[valid_rows]
    y_reg_train_placed = y_reg_train_placed[valid_rows]

    if len(y_reg_train_placed) > 10:
        # Handle outliers: cap training salaries at their 95th percentile.
        upper_clip = y_reg_train_placed.quantile(0.95)
        y_reg_train_placed = y_reg_train_placed.clip(upper=upper_clip)

        # Transform the target to an approximately normal distribution.
        # n_quantiles is bounded by the sample count; the transformer would
        # otherwise reduce it itself and emit a warning.
        qt = QuantileTransformer(
            output_distribution='normal',
            n_quantiles=min(1000, len(y_reg_train_placed)),
        )
        y_reg_train_transformed = qt.fit_transform(
            y_reg_train_placed.values.reshape(-1, 1)
        ).ravel()

        # Prepare evaluation set (placed test rows, target in transformed space).
        # NOTE(review): the test split is reused as the early-stopping set, so
        # the salary metrics reported later are not fully independent.
        X_test_placed = X_test[placed_test_mask]
        y_test_placed = y_reg_test[placed_test_mask]

        if len(y_test_placed) > 0:
            y_test_transformed = qt.transform(
                y_test_placed.values.reshape(-1, 1)
            ).ravel()
            eval_set = [(X_test_placed, y_test_transformed)]
        else:
            eval_set = None

        # Train model: small, heavily regularised trees for the small sample.
        lgb_regressor = lgb.LGBMRegressor(
            objective='regression',
            n_estimators=150,
            learning_rate=0.01,
            max_depth=3,
            num_leaves=7,
            min_child_samples=10,
            reg_alpha=1.0,
            reg_lambda=1.0,
            random_state=42,
            n_jobs=-1,
            verbose=-1
        )

        # Early stopping needs an evaluation set; without one, fit all rounds.
        fit_kwargs = {}
        if eval_set is not None:
            fit_kwargs = {
                'eval_set': eval_set,
                'callbacks': [lgb.early_stopping(stopping_rounds=20),
                              lgb.log_evaluation(period=10)],
            }
        lgb_regressor.fit(X_train_placed, y_reg_train_transformed, **fit_kwargs)

# 6. Evaluation
print("\n=== Evaluation ===")
# Placement evaluation: accuracy and per-class report on the test split.
y_class_pred = lgb_classifier.predict(X_test)
print(f"\nPlacement Accuracy: {accuracy_score(y_class_test, y_class_pred):.4f}")
print(classification_report(y_class_test, y_class_pred))

# Salary evaluation: only when the test split has placed rows and a regressor
# was actually trained.
if placed_test_mask.sum() > 0 and 'lgb_regressor' in locals():
    try:
        y_true_salary = y_reg_test[placed_test_mask]

        # Predict in the transformed space, then map back to salary units.
        y_pred_transformed = lgb_regressor.predict(X_test[placed_test_mask])
        y_pred = qt.inverse_transform(y_pred_transformed.reshape(-1, 1))

        print("\nSalary Prediction Results:")
        print(f"RMSE: {np.sqrt(mean_squared_error(y_true_salary, y_pred)):.2f}")
        salary_r2 = r2_score(y_true_salary, y_pred)
        print(f"R²: {salary_r2:.4f}")
        print(f"MAE: {mean_absolute_error(y_true_salary, y_pred):.2f}")

        if salary_r2 < 0:
            print("\nFalling back to mean prediction due to negative R²")
            # Build the constant baseline as float explicitly: np.full_like
            # would inherit the salary dtype and truncate the mean if that
            # column is integer-typed.
            mean_pred = np.full(
                len(y_true_salary), y_reg_train_placed.mean(), dtype=float
            )
            print(f"Mean predictor R²: {r2_score(y_true_salary, mean_pred):.4f}")
    except Exception as e:
        # Best-effort reporting: a failure here must not abort the rest of
        # the cell, so the error is printed instead of raised.
        print(f"\nError in salary evaluation: {str(e)}")

# Feature importance
# Print each fitted model's feature importances, highest first.
print("\nPlacement Feature Importance:")
placement_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': lgb_classifier.feature_importances_
})
placement_ranked = placement_importance.sort_values('Importance', ascending=False)
print(placement_ranked.to_string(index=False))

# The salary model exists only if the regressor was trained.
if 'lgb_regressor' in locals():
    print("\nSalary Feature Importance:")
    salary_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': lgb_regressor.feature_importances_
    })
    salary_ranked = salary_importance.sort_values('Importance', ascending=False)
    print(salary_ranked.to_string(index=False))

Applying SMOTE to handle class imbalance...

Training Placement Classifier...
Training until validation scores don't improve for 20 rounds
[10]	valid_0's auc: 0.980952	valid_0's binary_logloss: 0.493767
[20]	valid_0's auc: 0.971429	valid_0's binary_logloss: 0.387683
Early stopping, best iteration is:
[5]	valid_0's auc: 0.980952	valid_0's binary_logloss: 0.575911

Training Salary Predictor with 84 samples...
Training until validation scores don't improve for 20 rounds
[10]	valid_0's l2: 2.55109
[20]	valid_0's l2: 2.38383
[30]	valid_0's l2: 2.2746
[40]	valid_0's l2: 2.19475
[50]	valid_0's l2: 2.12448
[60]	valid_0's l2: 2.07353
[70]	valid_0's l2: 2.01893
[80]	valid_0's l2: 1.9792
[90]	valid_0's l2: 1.95289
[100]	valid_0's l2: 1.93801
[110]	valid_0's l2: 1.94186
Early stopping, best iteration is:
[99]	valid_0's l2: 1.93607

=== Evaluation ===

Placement Accuracy: 0.9091
              precision    recall  f1-score   support

         0.0       1.00      0.71      0.83         7
         1.0