# University Data Analysis with Machine Learning

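This notebook analyses a dataset of North American universities: it cleans the raw records, explores their distributions and correlations, predicts each university's endowment with tree-based regression models, and classifies universities into endowment tiers.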

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import (
    mean_squared_error, r2_score, mean_absolute_error,
    accuracy_score, classification_report, confusion_matrix
)
import xgboost as xgb
from scipy import stats

# Set style for better visualizations
plt.style.use('seaborn')
sns.set_palette('husl')
%matplotlib inline

## 1. Data Loading and Preprocessing

In [None]:
def _strip_thousands(series):
    """Convert numbers written with ',' thousands separators to numeric values."""
    digits = series.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(digits, errors='coerce')


def _parse_endowment(series):
    """Convert endowment strings such as '$1.5B' or '$750M' to a dollar amount."""
    text = series.astype(str).str.strip()
    # regex=False matters: on older pandas releases str.replace treats the
    # pattern as a regex by default, where '$' is an end-of-string anchor and
    # the dollar sign would silently stay in place.
    for token in ('$', ',', 'B'):
        text = text.str.replace(token, '', regex=False)
    # A trailing 'M' marks millions; previously such values failed numeric
    # conversion and the rows were dropped.
    in_millions = text.str.endswith('M', na=False).astype(bool)
    text = text.where(~in_millions, text.str[:-1])
    # Everything without an 'M' suffix is read as billions, as before.
    multiplier = np.where(in_millions, 1e6, 1e9)
    return pd.to_numeric(text, errors='coerce') * multiplier


def load_and_preprocess_data(path='NorthAmericaUniversities.csv', reference_year=2024):
    """Load the university CSV and return a cleaned, feature-enriched DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'NorthAmericaUniversities.csv'
        Location of the CSV file; it is read with latin-1 encoding.
    reference_year : int, default 2024
        Year from which 'Established' is subtracted to compute 'Age'.

    Returns
    -------
    pandas.DataFrame
        The input rows that have a parseable 'Endowment', with
        'Established', 'Academic Staff', 'Number of Students' and 'Endowment'
        converted to numbers, plus the derived columns 'Age',
        'Student_Staff_Ratio' and 'Students_per_Age'. Missing or infinite
        values in numeric columns are replaced by that column's median.
    """
    df = pd.read_csv(path, encoding='latin-1')

    # Clean numeric columns; unparseable entries become NaN
    df['Established'] = pd.to_numeric(df['Established'], errors='coerce')
    for column in ('Academic Staff', 'Number of Students'):
        df[column] = _strip_thousands(df[column])
    df['Endowment'] = _parse_endowment(df['Endowment'])

    # Feature engineering
    df['Age'] = reference_year - df['Established']
    df['Student_Staff_Ratio'] = df['Number of Students'] / df['Academic Staff']
    df['Students_per_Age'] = df['Number of Students'] / df['Age']

    # Rows without a target value are unusable; .copy() makes the result an
    # independent frame so the assignment below does not write into a view.
    df = df.dropna(subset=['Endowment']).copy()

    # A zero 'Age' or 'Academic Staff' yields +/-inf in the ratio columns.
    # fillna does not touch inf, so turn it into NaN first and let the median
    # imputation handle it together with the genuinely missing values.
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    numeric = df[numeric_columns].replace([np.inf, -np.inf], np.nan)
    df[numeric_columns] = numeric.fillna(numeric.median())

    return df

# Run the preprocessing pipeline, report the frame's dimensions,
# and preview its first five rows
df = load_and_preprocess_data()
print("Dataset Shape:", df.shape)
df.head(5)

## 2. Exploratory Data Analysis

In [None]:
# Histogram of the endowment values (30 bins)
plt.figure(figsize=(10, 6))
sns.histplot(df, x='Endowment', bins=30)
plt.xlabel('Endowment (USD)')
plt.title('Distribution of University Endowments')
plt.show()

# Pairwise correlations between the engineered features and the target
numeric_cols = [
    'Age',
    'Academic Staff',
    'Number of Students',
    'Student_Staff_Ratio',
    'Students_per_Age',
    'Endowment',
]
correlation_matrix = df.loc[:, numeric_cols].corr()

# Annotated heatmap with the colour scale centred on zero correlation
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, center=0, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

## 3. Regression Models for Endowment Prediction

In [None]:
# Prepare features for regression
features = ['Age', 'Academic Staff', 'Number of Students', 'Student_Staff_Ratio', 'Students_per_Age']
X = df[features]
y = df['Endowment']

# Split BEFORE scaling: fitting the scaler on every row lets statistics of the
# held-out rows leak into preprocessing. The scaler is therefore fitted on the
# training rows only and merely applied to the test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix in the original row order, scaled with the training-set
# statistics. Kept under this name because later cells read `X_scaled`.
X_scaled = scaler.transform(X)

# Train multiple models
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42)
}

# Fit each model on the training split and score it on the held-out split
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2}

# Display results: one row per model, columns 'MSE' and 'R2'
results_df = pd.DataFrame(results).T
print("Model Performance Comparison:")
display(results_df)

## 4. Feature Importance Analysis

In [None]:
# Select the regressor with the highest R2 in the comparison table
best_model_name = results_df['R2'].idxmax()
best_model = models[best_model_name]

# Pair every feature with its importance score and rank them, largest first
importance = best_model.feature_importances_
feature_importance = pd.DataFrame(
    {'Feature': features, 'Importance': importance}
).sort_values('Importance', ascending=False)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
sns.barplot(feature_importance, y='Feature', x='Importance')
plt.title(f'Feature Importance ({best_model_name})')
plt.show()

## 5. University Tier Classification

In [None]:
# Create tier labels: endowment terciles, ordered from lowest to highest
tier_labels = ['Lower', 'Middle', 'Upper']
df['Tier'] = pd.qcut(df['Endowment'], q=3, labels=tier_labels)

# Prepare data for classification
X_clf = X_scaled
y_clf = df['Tier']

# Split data. stratify keeps the three tiers in the same proportion in the
# training and test sets, so no tier is under-represented in the evaluation.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

# Train classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_clf, y_train_clf)

# Make predictions
y_pred_clf = clf.predict(X_test_clf)

# Print classification report
print("Classification Report:")
print(classification_report(y_test_clf, y_pred_clf))

# Plot confusion matrix. The label order is passed explicitly so that rows and
# columns are guaranteed to follow Lower -> Middle -> Upper, and the same names
# are used as tick labels instead of the default 0/1/2 positions.
cm = confusion_matrix(y_test_clf, y_pred_clf, labels=tier_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=tier_labels, yticklabels=tier_labels
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 6. Model Optimization with GridSearchCV

In [None]:
# Candidate hyper-parameters for the random forest
# (3 * 4 * 3 * 3 = 108 combinations, each evaluated with 5-fold CV)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored by R2, using every available core
base_forest = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(base_forest, param_grid, scoring='r2', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
for heading, value in (
    ("Best Parameters:", grid_search.best_params_),
    ("\nBest Score:", grid_search.best_score_),
):
    print(heading)
    print(value)

## 7. Prediction Analysis

In [None]:
# Predict the held-out rows with the estimator refitted by the grid search
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Scatter of actual vs predicted values; the dashed red diagonal marks
# where a perfect prediction would fall
diagonal = [y_test.min(), y_test.max()]
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_best, alpha=0.5)
plt.plot(diagonal, diagonal, 'r--', lw=2)
plt.title('Actual vs Predicted Endowment Values')
plt.xlabel('Actual Endowment')
plt.ylabel('Predicted Endowment')
plt.show()

# Error metrics on the test split (RMSE is in the same unit as the target)
mse = mean_squared_error(y_test, y_pred_best)
r2 = r2_score(y_test, y_pred_best)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error: ${rmse:,.2f}")
print(f"R² Score: {r2:.3f}")

## 8. Conclusions and Insights

1. Model Performance:
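   - The best-performing model for endowment prediction is the one with the highest test-set R² score in the Section 3 comparison table
   - Classification of university tiers shows good accuracy (see the classification report and confusion matrix in Section 5)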

2. Key Features:
   - Most important features affecting university endowments are shown in the feature importance plot
   - Age and size of the institution play significant roles

3. Predictions:
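   - The tuned random forest can predict university endowments with reasonable accuracy (see the RMSE and R² reported in Section 7)
   - Classification into tiers is more reliable than exact endowment prediction, because a tier only requires placing a university in the correct third of the endowment distribution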

4. Future Improvements:
   - Collect more data points
   - Include additional features like research output and alumni success
   - Try more advanced modeling techniques