In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")

# 1. Load and prepare data
# Absolute Windows path of the cleaned placement dataset (CSV).
# NOTE(review): this location is machine-specific; point it at your own copy
# of the CSV when running the notebook elsewhere.
file_path = r'C:\Users\VIBHANSHU JAIN\Desktop\Client Project\campus-placement-analysis\EDA_Notebooks\datasets\cleaned_placement_dataset.csv'
# Read the whole file into a DataFrame; every later step works on `data`.
data = pd.read_csv(filepath_or_buffer=file_path)

# 2. Enhanced Feature Engineering
def _weighted_sum(frame, weights):
    """Return the sum of ``weight * frame[column]`` over *weights*.

    *weights* maps column name -> numeric weight. Terms are accumulated left
    to right in the mapping's insertion order, so a missing value in any of
    the listed columns propagates to the result for that row.
    """
    terms = iter(weights.items())
    column, weight = next(terms)
    total = weight * frame[column]
    for column, weight in terms:
        total = total + weight * frame[column]
    return total


# Composite technical-skill score: DSA carries the largest weight.
data['skill_score'] = _weighted_sum(data, {
    'dsa': 0.4,
    'web_dev': 0.2,
    'mobile_dev': 0.2,
    'Machine Learning': 0.1,
    'cloud': 0.1,
})

# Composite academic score: the `cgpa` column dominates, then `inter_gpa`,
# then `ssc_gpa`.
data['academic_power'] = _weighted_sum(data, {
    'cgpa': 0.6,
    'inter_gpa': 0.3,
    'ssc_gpa': 0.1,
})

# Interaction term: project count scaled by number of programming languages.
project_count = data['no_of_projects']
data['project_impact'] = project_count * data['no_of_programming_languages']

# log1p dampens the internship count (and maps 0 internships to 0) before it
# is multiplied by the project count.
damped_internships = np.log1p(data['internships'])
data['experience_score'] = damped_internships * project_count

# 3. Prepare data
# Model inputs: the four engineered scores plus the tier_1/tier_2/tier_3 columns.
X = data[['skill_score', 'academic_power', 'project_impact',
          'experience_score', 'tier_1', 'tier_2', 'tier_3']]
# Regression target.
y = data['salary_as_fresher']
placed_mask = data['is_placed'] == 1
# A row is only usable for supervised learning when its target is known.
# Rows with a missing salary are excluded up front so that no label has to be
# fabricated for training, and so y_test can never carry NaN into the metric
# functions (scikit-learn's regression metrics raise on NaN targets).
# When no salary is missing this mask equals `placed_mask`, so the split is
# unchanged.
labelled_mask = placed_mask & y.notna()

# 4. Train-test split ONLY on placed students with a recorded salary
X_train, X_test, y_train, y_test = train_test_split(
    X[labelled_mask], y[labelled_mask],
    test_size=0.2,    # hold out 20% of the usable rows for evaluation
    random_state=42   # fixed seed so the split is reproducible
)

# 5. Model Training - Ensemble Approach
print(f"\nTraining with {len(X_train)} placed students...")

# Fill any missing training targets with the median of the observed ones.
# SimpleImputer works on 2-D input, hence the reshape to a single column and
# the ravel back to 1-D afterwards.
salary_column = y_train.values.reshape(-1, 1)
imputer = SimpleImputer(strategy='median')
y_train_imputed = imputer.fit_transform(salary_column).ravel()

# Hyper-parameters are kept in dicts so each model's configuration can be read
# (and tweaked) in one place; both models use the same fixed seed.
rf_params = dict(
    n_estimators=300,
    max_depth=7,
    min_samples_leaf=5,
    max_features=0.6,
    random_state=42,
)
gbr_params = dict(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    min_samples_leaf=5,
    random_state=42,
)

# Model 1: Random Forest
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train_imputed)

# Model 2: Gradient Boosting
gbr = GradientBoostingRegressor(**gbr_params)
gbr.fit(X_train, y_train_imputed)

# 6. Ensemble Prediction
# Score the held-out rows with each fitted model.
rf_pred = rf.predict(X_test)
gbr_pred = gbr.predict(X_test)

# Fixed-weight blend of the two prediction vectors; gradient boosting gets the
# larger share.
gbr_weight, rf_weight = 0.7, 0.3
ensemble_pred = gbr_weight * gbr_pred + rf_weight * rf_pred

# 7. Comprehensive Evaluation
# R² on the held-out rows for each individual model and for the blend.
print("\n=== Evaluation Results ===")
rf_r2 = r2_score(y_test, rf_pred)
print(f"Random Forest R²: {rf_r2:.4f}")
gbr_r2 = r2_score(y_test, gbr_pred)
print(f"Gradient Boosting R²: {gbr_r2:.4f}")
ensemble_r2 = r2_score(y_test, ensemble_pred)
print(f"Ensemble R²: {ensemble_r2:.4f}")

# Residuals of the blended prediction (actual minus predicted), reported next
# to the median held-out salary so the error magnitude can be judged in context.
print("\nError Analysis:")
errors = y_test - ensemble_pred
median_salary = np.median(y_test)
print(f"Median Salary: {median_salary:.2f}")
ensemble_mae = mean_absolute_error(y_test, ensemble_pred)
print(f"MAE: {ensemble_mae:.2f}")
lowest_error, highest_error = errors.min(), errors.max()
print(f"Error Range: {lowest_error:.2f} to {highest_error:.2f}")

# 8. Feature Importance
print("\nTop Predictive Features:")
# The importances come from the gradient boosting model only; the random
# forest's importances are not included in this table.
importance_columns = {
    'Feature': X.columns,
    'Importance': gbr.feature_importances_,
}
importance = pd.DataFrame(importance_columns)
# Most influential feature first.
importance = importance.sort_values(by='Importance', ascending=False)
print(importance.to_string(index=False))

# 9. Final Recommendations
# Print follow-up advice only when the blended model's held-out R² is below 0.3.
final_r2 = r2_score(y_test, ensemble_pred)
if final_r2 < 0.3:
    advice_lines = (
        "\nWarning: Model still performing poorly. Recommendations:",
        "1. Collect more salary data (minimum 100 placed students recommended)",
        "2. Add more salary-determining features (company type, job role, etc.)",
        "3. Consider converting to classification (salary ranges instead of exact values)",
        "4. Verify salary data quality - remove extreme outliers if present",
    )
    # One joined print emits exactly the same text as one print per line.
    print("\n".join(advice_lines))


Training with 79 placed students...

=== Evaluation Results ===
Random Forest R²: 0.5074
Gradient Boosting R²: 0.3538
Ensemble R²: 0.4315

Error Analysis:
Median Salary: 11.50
MAE: 3.95
Error Range: -9.15 to 13.32

Top Predictive Features:
         Feature  Importance
     skill_score    0.391258
  academic_power    0.264174
  project_impact    0.101887
experience_score    0.094109
          tier_3    0.079343
          tier_1    0.046178
          tier_2    0.023051
