# Machine Learning Analysis of Global College Statistics

This notebook performs comprehensive machine learning analysis on the Global College Statistics Dataset to predict placement rates and identify important factors affecting college performance.

## Table of Contents:
1. Data Loading and Preprocessing
2. Exploratory Data Analysis
3. Feature Engineering
4. Model Training and Evaluation
5. Advanced Model Tuning
6. Results Visualization

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import warnings
# Silence library warnings so they do not clutter cell output.
# NOTE(review): this blanket filter also hides deprecation warnings;
# consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

# Set style for visualizations
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# and later releases removed the old name, so use whichever one this
# installation actually provides instead of failing with OSError.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = [12, 8]

## 1. Data Loading and Preprocessing

In [None]:
# Load the dataset
# NOTE(review): the path is relative to the kernel's working directory.
df = pd.read_csv('College Data.csv')

# Display basic information (DataFrame.info() writes straight to stdout)
print("Dataset Info:")
df.info()

# Only the final expression of a cell is rendered automatically, so the
# preview must be printed explicitly; a bare `df.head()` here is discarded.
print("\nFirst few rows:")
print(df.head())

# Check for missing values (last expression, so the notebook displays it)
print("\nMissing Values:")
df.isnull().sum()

## 2. Exploratory Data Analysis

In [None]:
# --- Target distribution ---
# Histogram with a KDE overlay of the 'Placement Rate' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Placement Rate'], kde=True, ax=ax)
ax.set_title('Distribution of Placement Rates')
plt.show()

# --- Pairwise correlations ---
# Restrict to numeric columns before computing the correlation matrix.
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Create new features
# A zero denominator produces +/-inf (or NaN for 0/0); those rows are
# removed below before anything is fitted.
df['Student_Faculty_Ratio'] = df['Total Students'] / df['Faculty Count']
df['Female_Ratio'] = df['Female'] / df['Total Students']
df['Research_Per_Faculty'] = df['Research Papers Published'] / df['Faculty Count']

# Select features for modeling
features = ['CGPA', 'Research Papers Published', 'Faculty Count', 'Total Students',
           'Student_Faculty_Ratio', 'Female_Ratio', 'Research_Per_Faculty', 'Annual Family Income']
target = 'Placement Rate'

# Keep only rows that are usable for modelling: the scaler rejects infinite
# values and several of the regressors trained later reject NaN, so convert
# inf to NaN and drop every row with a missing feature or target.
model_df = df[features + [target]].replace([np.inf, -np.inf], np.nan).dropna()
n_dropped = len(df) - len(model_df)
if n_dropped:
    print(f"Dropped {n_dropped} rows with missing or non-finite feature/target values")

X = model_df[features]
y = model_df[target]

# Split the data (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: fit on the training split only, then apply the same
# transformation to the test split so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4. Model Training and Evaluation

In [None]:
def train_and_evaluate_model(model, name, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print train/test R2 and RMSE.

    Parameters:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: label used in the heading of the printed report.
        X_train, X_test: feature matrices for the training and test splits.
        y_train, y_test: matching target values.

    Returns:
        A ``(model, test_predictions)`` tuple: the fitted estimator and its
        predictions for ``X_test``.
    """
    model.fit(X_train, y_train)

    fitted_preds = model.predict(X_train)
    holdout_preds = model.predict(X_test)

    def _rmse(actual, predicted):
        # Root of the mean squared error, i.e. error in target units.
        return np.sqrt(mean_squared_error(actual, predicted))

    # All metrics are computed before anything is printed.
    report = [
        ("Train R2", r2_score(y_train, fitted_preds)),
        ("Test R2", r2_score(y_test, holdout_preds)),
        ("Train RMSE", _rmse(y_train, fitted_preds)),
        ("Test RMSE", _rmse(y_test, holdout_preds)),
    ]

    print(f"\n{name} Results:")
    for label, value in report:
        print(f"{label}: {value:.4f}")

    return model, holdout_preds

# Candidate regressors, keyed by the label used in printed reports and plots.
# The three ensemble models share the same size and random seed.
ensemble_kwargs = {'n_estimators': 100, 'random_state': 42}
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=1.0),
    'Random Forest': RandomForestRegressor(**ensemble_kwargs),
    'Gradient Boosting': GradientBoostingRegressor(**ensemble_kwargs),
    'XGBoost': xgb.XGBRegressor(**ensemble_kwargs),
}

# Fit every candidate on the scaled training split and keep both the fitted
# estimator and its test-set predictions for later comparison.
results = {}
for label, estimator in models.items():
    fitted, test_preds = train_and_evaluate_model(
        estimator, label, X_train_scaled, X_test_scaled, y_train, y_test
    )
    results[label] = dict(model=fitted, predictions=test_preds)

## 5. Advanced Model Tuning

In [None]:
# Hyper-parameter search for the random forest.
# NOTE: the choice of Random Forest is an assumption, not derived from the
# comparison above. The grid has 3 * 4 * 3 * 3 = 108 candidates, each scored
# with 5-fold cross-validation, so this cell is the slow one.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,  # use every available core
)
grid_search.fit(X_train_scaled, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)  # mean cross-validated R2

# Evaluate the winning configuration on the held-out test split
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test_scaled)

print("\nFinal Model Performance:")
final_r2 = r2_score(y_test, y_pred_best)
print(f"R2 Score: {final_r2:.4f}")
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred_best))
print(f"RMSE: {final_rmse:.4f}")

## 6. Results Visualization

In [None]:
# --- Feature importances of the tuned random forest ---
# Pair each feature name with its importance and sort, largest first.
feature_importance = (
    pd.DataFrame({'Feature': features, 'Importance': best_rf.feature_importances_})
    .sort_values('Importance', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importance (Random Forest)')
plt.show()

# --- Actual vs predicted placement rate ---
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_best, alpha=0.5)
# Dashed diagonal marks perfect predictions (predicted == actual)
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
ax.set_xlabel('Actual Placement Rate')
ax.set_ylabel('Predicted Placement Rate')
ax.set_title('Actual vs Predicted Placement Rates')
plt.show()

# --- Test-set R2 of every baseline model ---
model_scores = {
    label: r2_score(y_test, entry['predictions'])
    for label, entry in results.items()
}

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=list(model_scores), y=list(model_scores.values()), ax=ax)
ax.set_title('Model Comparison (R² Score)')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

## Conclusions and Insights

The items below are a template: complete them from the outputs of the cells above after running the notebook end to end.

1. Model Performance:
   - Compare the performance of the different models (train/test R² and RMSE)
   - Identify the best-performing model and its key parameters

2. Feature Importance:
   - List the most important features affecting placement rates
   - Discuss the relationships between features

3. Recommendations:
   - Suggest ways to improve placement rates based on the model insights
   - Identify areas for further investigation