In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Using cached pandas-2.3.0-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.3.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.3-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import joblib

# 1. Data Loading and Preparation
train_df = pd.read_csv('ML-CUP24-TR.csv', comment='#', header=None)
train_df.columns = ['id'] + [f'input_{i}' for i in range(1,13)] + ['target_x', 'target_y', 'target_z']

test_df = pd.read_csv('ML-CUP24-TS.csv', comment='#', header=None)
test_df = test_df.dropna(how='all')
test_df.columns = ['id'] + [f'input_{i}' for i in range(1,13)]

X = train_df.drop(['id', 'target_x', 'target_y', 'target_z'], axis=1)
y = train_df[['target_x', 'target_y', 'target_z']].values
X_test = test_df.drop('id', axis=1)

# 2. Train-Validation Split and Scaling
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# 3. Model Evaluation
def evaluate_model(model, X_train, y_train):
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = []
    r2_scores = []
    
    for train_idx, val_idx in kfold.split(X_train):
        X_tr, X_v = X_train[train_idx], X_train[val_idx]
        y_tr, y_v = y_train[train_idx], y_train[val_idx]
        
        if isinstance(model, SVR):
            model_wrapped = MultiOutputRegressor(model)
            model_wrapped.fit(X_tr, y_tr)
            y_pred = model_wrapped.predict(X_v)
        else:
            model.fit(X_tr, y_tr)
            y_pred = model.predict(X_v)
        
        mse_scores.append(mean_squared_error(y_v, y_pred))
        r2_scores.append(r2_score(y_v, y_pred))
    
    return np.mean(mse_scores), np.mean(r2_scores)

models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'SVR': SVR(),
    'MLP': MLPRegressor(random_state=42, max_iter=1000)
}

results = {}
for name, model in models.items():
    try:
        mse, r2 = evaluate_model(model, X_train_scaled, y_train)
        results[name] = {'MSE': mse, 'R2': r2}
        print(f"{name}: MSE = {mse:.4f}, R2 = {r2:.4f}")
    except Exception as e:
        print(f"Error with {name}: {str(e)}")
        results[name] = {'MSE': None, 'R2': None}

# 4. Hyperparameter Tuning for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0]
}

xgb = XGBRegressor(random_state=42)
grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_xgb = grid_search.best_estimator_
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best MSE: {-grid_search.best_score_:.4f}")

# 5. Final Evaluation
best_xgb.fit(X_train_scaled, y_train)
y_val_pred = best_xgb.predict(X_val_scaled)
val_mse = mean_squared_error(y_val, y_val_pred)
val_r2 = r2_score(y_val, y_val_pred)

print(f"\nValidation MSE: {val_mse:.4f}")
print(f"Validation R2: {val_r2:.4f}")

# 6. Feature Importance
plt.figure(figsize=(10, 6))
plt.barh(X.columns, best_xgb.feature_importances_)
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# 7. Generate Test Predictions with Custom Header and fixed newlines
test_predictions = best_xgb.predict(X_test_scaled)
output_df = test_df[['id']].copy()
output_df[['target_x', 'target_y', 'target_z']] = test_predictions

# Write to file with custom header and proper newline handling
with open('ML-CUP24-output.csv', 'w', newline='') as f:
    f.write("# Charchit Bansal, Sounak Mukopadhyay\n")
    f.write("# CharchitSounak\n")
    f.write("# ML-CUP24 V1\n")
    f.write("# 20/06/2025\n")
    output_df.to_csv(f, index=False, header=False)

print("\nProcess completed successfully!")
print("Output saved to ML-CUP24-output.csv with proper formatting")

Random Forest: MSE = 0.3900, R2 = 0.9303
Error with Gradient Boosting: y should be a 1d array, got an array of shape (160, 3) instead.
XGBoost: MSE = 0.4426, R2 = 0.9359
SVR: MSE = 2.2201, R2 = 0.9320
MLP: MSE = 0.4182, R2 = 0.9299

Best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.9}
Best MSE: 0.3829

Validation MSE: 0.2963
Validation R2: 0.9007


PermissionError: [Errno 13] Permission denied: 'ML-CUP24-output.csv'