In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint
import numpy as np

In [2]:
# Load datasets
df_train = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv', index_col='id')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv', index_col='id')
df_sub = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')

In [3]:
# Define feature columns
num_features = ['model_year', 'milage']
cat_features = ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']

In [4]:
# Data cleaning
def clean_data(df):
    df['fuel_type'].fillna(df['fuel_type'].mode()[0], inplace=True)
    df[['accident', 'clean_title']] = df[['accident', 'clean_title']].fillna('missing')
    return df

In [5]:
df_train = clean_data(df_train)
df_test = clean_data(df_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fuel_type'].fillna(df['fuel_type'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fuel_type'].fillna(df['fuel_type'].mode()[0], inplace=True)


In [6]:
# Define preprocessing pipelines
num_pipeline = Pipeline([
    ('scaler', StandardScaler()),
])

In [7]:
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

In [8]:
preprocessing = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
], remainder='passthrough')

In [9]:
# Define full pipeline with preprocessing and model
full_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('model', RandomForestRegressor(random_state=1))
])

In [10]:
# Separate features and target
X_train = df_train.drop(columns=['price'])
y_train = df_train['price']
X_test = df_test

In [11]:
# Split the training data into training and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [12]:
# Train the model with RandomizedSearchCV for hyperparameter tuning
param_dist = {
    'model__n_estimators': randint(10, 200),
    'model__max_depth': randint(5, 20)
}

In [13]:
random_search = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [14]:
random_search.fit(X_train_split, y_train_split)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END .......model__max_depth=11, model__n_estimators=189; total time= 1.7min
[CV] END .......model__max_depth=11, model__n_estimators=189; total time= 1.7min
[CV] END .......model__max_depth=11, model__n_estimators=189; total time= 1.7min
[CV] END .......model__max_depth=11, model__n_estimators=189; total time= 1.7min
[CV] END ........model__max_depth=17, model__n_estimators=24; total time=  20.4s
[CV] END ........model__max_depth=17, model__n_estimators=24; total time=  19.9s
[CV] END ........model__max_depth=17, model__n_estimators=24; total time=  20.6s
[CV] END ........model__max_depth=17, model__n_estimators=24; total time=  19.3s
[CV] END ........model__max_depth=17, model__n_estimators=24; total time=  19.6s
[CV] END ........model__max_depth=15, model__n_estimators=81; total time=  57.2s
[CV] END ........model__max_depth=15, model__n_estimators=81; total time=  56.8s
[CV] END ........model__max_depth=15, model__n_

In [15]:
# Evaluate model
val_predictions = random_search.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 63016.82714388628


In [16]:
# Predict on test set and prepare submission
test_predictions = random_search.predict(X_test)
df_sub['price'] = test_predictions
df_sub.to_csv('/kaggle/working/submission.csv', index=False)