In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")

# 1. Load and prepare data
file_path = r'C:\Users\VIBHANSHU JAIN\Desktop\Client Project\campus-placement-analysis\EDA_Notebooks\datasets\cleaned_placement_dataset.csv'
data = pd.read_csv(file_path)

# Data Validation
print("=== Data Validation ===")
print(f"Total samples: {len(data)}")
placed = data[data['is_placed']==1]
print(f"Placed students: {len(placed)}")
print("\nSalary stats for placed students:")
print(placed['salary_as_fresher'].describe())

# 2. Enhanced Feature Engineering - Focus on Salary Determinants
data['tier_strength'] = data['tier_1']*1.5 + data['tier_2']*1.0 + data['tier_3']*0.5
data['core_skills'] = (data['dsa']*0.5 + data['web_dev']*0.3 + data['mobile_dev']*0.2)
data['academic_strength'] = (data['cgpa']*0.7 + data['inter_gpa']*0.2 + data['ssc_gpa']*0.1)
data['practical_experience'] = np.log1p(data['internships']+1) * np.sqrt(data['no_of_projects']+1)
data['coding_mastery'] = data['no_of_programming_languages'] * (data['dsa']**0.8)

# Select only the most relevant features
salary_features = [
    'tier_strength', 
    'core_skills',
    'academic_strength',
    'practical_experience',
    'coding_mastery',
    'is_participate_hackathon'
]

# 3. Prepare data - Only use placed students for salary prediction
X_salary = data[data['is_placed']==1][salary_features]
y_salary = data[data['is_placed']==1]['salary_as_fresher']

# Handle missing salaries if any
imputer = SimpleImputer(strategy='median')
y_salary = imputer.fit_transform(y_salary.values.reshape(-1, 1)).ravel()

# 4. Transform target variable - Critical Step!
qt = QuantileTransformer(output_distribution='normal')
try:
    y_transformed = qt.fit_transform(y_salary.reshape(-1, 1)).ravel()
except:
    print("QuantileTransformer failed, using log transformation instead")
    y_transformed = np.log1p(y_salary)

# 5. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_salary, y_transformed, test_size=0.2, random_state=42
)

# 6. Model Training - Start Simple
print("\n=== Model Training ===")
models = {
    "RandomForest": RandomForestRegressor(
        n_estimators=200,
        max_depth=5,
        min_samples_leaf=5,
        random_state=42
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        min_samples_leaf=7,
        random_state=42
    )
}

best_model = None
best_r2 = -np.inf

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(f"{name} R²: {r2:.4f}")
    
    if r2 > best_r2:
        best_r2 = r2
        best_model = model

# 7. Final Evaluation
print("\n=== Final Evaluation ===")
y_pred = best_model.predict(X_test)
y_pred_original = qt.inverse_transform(y_pred.reshape(-1, 1)) if 'qt' in locals() else np.expm1(y_pred)

final_r2 = r2_score(qt.inverse_transform(y_test.reshape(-1, 1)), y_pred_original) if 'qt' in locals() else r2_score(np.expm1(y_test), y_pred_original)
print(f"\nBest Model: {best_model.__class__.__name__}")
print(f"Final R²: {final_r2:.4f}")

if final_r2 < 0.3:
    print("\nWarning: Poor predictive performance. Recommendations:")
    print("1. Collect more salary data (minimum 100 placed students recommended)")
    print("2. Add more salary-determining features:")
    print("   - Company type/name")
    print("   - Specific job role")
    print("   - Location information")
    print("3. Consider converting to classification problem:")
    print("   bins = [0, 50000, 100000, np.inf]")
    print("   labels = ['low', 'medium', 'high']")
    
    # Implement binned classification fallback
    bins = [0, 50000, 100000, np.inf]
    labels = ['low', 'medium', 'high']
    y_binned = pd.cut(y_salary, bins=bins, labels=labels)
    
    from sklearn.ensemble import RandomForestClassifier
    X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
        X_salary, y_binned, test_size=0.2, random_state=42
    )
    
    clf = RandomForestClassifier(n_estimators=200, random_state=42)
    clf.fit(X_train_clf, y_train_clf)
    y_pred_clf = clf.predict(X_test_clf)
    
    print("\nBinned Classification Performance:")
    print(classification_report(y_test_clf, y_pred_clf))

=== Data Validation ===
Total samples: 145
Placed students: 99

Salary stats for placed students:
count    99.000000
mean     14.466667
std       8.155304
min       4.000000
25%       7.850000
50%      14.000000
75%      18.000000
max      40.000000
Name: salary_as_fresher, dtype: float64

=== Model Training ===
RandomForest R²: 0.4170
GradientBoosting R²: -0.2043

=== Final Evaluation ===

Best Model: RandomForestRegressor
Final R²: 0.6334
