<a href="https://colab.research.google.com/github/denxxs/MBTest/blob/main/MihirMBProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

In [2]:
# Read the training and test sets from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Columns with exactly one distinct value in the training data (NaN is not
# counted by nunique) are removed from both frames.
distinct_counts = train_df.nunique()
constant_columns = distinct_counts[distinct_counts == 1].index.tolist()
for frame in (train_df, test_df):
    frame.drop(columns=constant_columns, inplace=True)

In [4]:
# Names of the object-dtype columns; these are the ones that get one-hot encoded.
categorical_columns = list(train_df.select_dtypes(include=['object']).columns)

In [5]:
# One-hot encode the object-dtype columns and pass every other column through
# unchanged. handle_unknown='ignore' encodes categories that were not seen
# during fit as all zeros instead of raising.
# sparse_threshold=0 forces a dense result: OneHotEncoder emits a sparse matrix
# by default, and the PCA step later in the pipeline needs dense input to use
# a fractional n_components.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

In [6]:
# Gradient-boosted regressor with a squared-error objective and a fixed seed.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)

In [7]:
# A fractional n_components keeps as many leading components as are needed to
# explain that share of the variance (here 95%).
pca = PCA(
    n_components=0.95,
)

In [8]:
# Stages run in this order: encode -> reduce dimensionality -> regress.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('pca', pca),
    ('model', xgb_model),
]
pipeline = Pipeline(steps=pipeline_steps)

In [9]:
# Hyper-parameter values to search. The 'model__' prefix addresses the pipeline
# step named 'model'; 3 x 3 x 3 values give 27 combinations.
param_grid = dict(
    model__n_estimators=[50, 100, 150],
    model__max_depth=[3, 5, 7],
    model__learning_rate=[0.01, 0.1, 0.2],
)

In [10]:
# Exhaustive search over param_grid with 5-fold cross-validation. Candidates
# are ranked by negated MSE (higher is better); n_jobs=-1 uses all processors.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [11]:
# The target is the 'y' column; the features are every column except 'y' and 'ID'.
y = train_df['y']
X = train_df.drop(columns=['ID', 'y'])
grid_search.fit(X, y)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [12]:
# Pipeline with the best-scoring parameter combination; GridSearchCV refits it
# on the full training data by default (no refit argument was passed).
best_model = grid_search.best_estimator_

In [13]:
# Build the test feature matrix. 'y' is excluded as well as 'ID' because this
# cell writes the predictions back into test_df on the next line; without that,
# re-running the cell would pass the previous predictions to the model as an
# extra input column. errors='ignore' covers the first run, when 'y' is absent.
X_test = test_df.drop(columns=['ID']).drop(columns=['y'], errors='ignore')
test_df['y'] = best_model.predict(X_test)

In [14]:
# Write only the ID and predicted 'y' columns, without the index column.
submission = test_df[['ID', 'y']]
submission.to_csv('mercedes_benz_test_predictions.csv', index=False)

In [15]:
# Winning parameter combination found by the search.
best_parameters = grid_search.best_params_
# best_score_ holds the negated MSE (scoring='neg_mean_squared_error'), so flip
# the sign to report a positive MSE.
best_mse = -grid_search.best_score_
# NOTE(review): this R² is computed on the same X / y the model was fitted on,
# so it is an in-sample figure, unlike the cross-validated MSE above.
train_predictions = best_model.predict(X)
best_r2 = r2_score(y, train_predictions)

In [16]:
# Display the best parameters, the cross-validated MSE and the R² as one tuple.
(best_parameters, best_mse, best_r2)

({'model__learning_rate': 0.2,
  'model__max_depth': 3,
  'model__n_estimators': 100},
 81.05401041922467,
 0.75888549436653)