In [39]:
import itertools
import os
from typing import Optional

import pandas as pd
# Load the raw training data. Later cells read the `brand`, `model`, `model_year`,
# `milage`, `accident`, `clean_title` and `price` columns from this frame.
df = pd.read_csv('../data/train.csv')

In [40]:
def preprocess(df: pd.DataFrame, reference_year: int = 2024) -> pd.DataFrame:
    """Return a copy of `df` with the engineered feature columns added.

    Columns written on the copy (the input frame is never mutated):

    * ``clean_title``     - missing values replaced by the literal string "NaN".
    * ``is.clean_title``  - True where ``clean_title`` equals "Yes".
    * ``age``             - ``reference_year - model_year``, floored at 1.
    * ``milage_per_year`` - ``milage / age``.
    * ``had_accident``    - True where ``accident`` equals
      "At least 1 accident or damage reported".
    * ``avg_price``       - mean ``price`` of the row's (brand, model) group;
      only added when the frame has a ``price`` column.

    Args:
        df: Frame with ``clean_title``, ``model_year``, ``milage`` and
            ``accident`` columns. ``brand``, ``model`` and ``price`` are
            needed only for ``avg_price``.
        reference_year: Year that ages are measured against. Defaults to 2024,
            the value this notebook previously hard-coded.

    Returns:
        A new DataFrame holding the original columns plus the features above.
    """
    res = df.copy()
    res["clean_title"] = res["clean_title"].fillna("NaN")
    res["is.clean_title"] = res["clean_title"] == "Yes"
    # Floor at 1 so cars from the reference year (or later) do not cause a
    # division by zero or a negative age below. Missing years stay missing.
    res["age"] = (reference_year - res["model_year"]).clip(lower=1)
    res["milage_per_year"] = res["milage"] / res["age"]
    res["had_accident"] = res["accident"] == "At least 1 accident or damage reported"
    # `avg_price` is derived from the target, so it can only be computed for
    # labelled frames; unlabelled frames are returned without it instead of
    # raising KeyError.
    if "price" in res.columns:
        res["avg_price"] = res.groupby(["brand", "model"])["price"].transform("mean")
    return res

In [41]:
from sklearn.model_selection import train_test_split
preprocessed = preprocess(df)
train, val = train_test_split(preprocessed, test_size=0.3, random_state=42)


def _avg_price_from(fit_on: pd.DataFrame, frame: pd.DataFrame) -> pd.Series:
    """Encode `frame` rows with mean prices learned from `fit_on` only.

    Each row gets the mean `price` of its (brand, model) group in `fit_on`.
    Rows whose pair is absent from `fit_on` fall back to the brand mean, and
    then to the overall mean of `fit_on`.
    """
    by_model = fit_on.groupby(["brand", "model"])["price"].mean().rename("_by_model")
    by_brand = fit_on.groupby("brand")["price"].mean()
    overall = fit_on["price"].mean()

    # A left merge keeps the row order of `frame`; (brand, model) keys are
    # unique in `by_model`, so no rows are duplicated.
    keyed = frame[["brand", "model"]].merge(
        by_model.reset_index(), how="left", on=["brand", "model"]
    )
    model_level = pd.Series(keyed["_by_model"].to_numpy(), index=frame.index)
    brand_level = frame["brand"].map(by_brand)
    return model_level.fillna(brand_level).fillna(overall)


# `preprocess` computes `avg_price` from `price` over *all* rows, so after the
# split every validation row would carry a feature built from validation
# targets (target leakage) and the validation score would be optimistic.
# Recompute the feature for both partitions from the training rows only.
# `preprocessed` itself is left untouched for the cells that use it later.
train_avg_price = _avg_price_from(train, train)
val_avg_price = _avg_price_from(train, val)
train = train.assign(avg_price=train_avg_price)
val = val.assign(avg_price=val_avg_price)

In [42]:
def get_avg_price_from_brand_model(df: pd.DataFrame) -> dict:
    """Build a lookup from (brand, model) tuples to the mean `avg_price` of that pair."""
    per_pair = df.groupby(['brand', 'model'])['avg_price']
    mean_by_pair = per_pair.mean()
    return mean_by_pair.to_dict()

def get_avg_price_from_brand(df: pd.DataFrame) -> dict:
    """Build a lookup from brand to the mean `avg_price` over all rows of that brand."""
    per_brand = df.groupby('brand')['avg_price']
    mean_by_brand = per_brand.mean()
    return mean_by_brand.to_dict()

In [43]:
# Price lookup tables built from the full preprocessed training frame:
# one keyed by (brand, model) and a coarser one keyed by brand alone.
avg_price_from_brand_model, avg_price_from_brand = (
    get_avg_price_from_brand_model(preprocessed),
    get_avg_price_from_brand(preprocessed),
)

In [44]:
# Load the test set. Its `brand` and `model` columns are used below to look up
# an `avg_price` value for every row.
test = pd.read_csv('../data/test.csv')

In [45]:
def retrieve_value(
    brand: str,
    model: str,
    from_brand_model: dict,
    from_brand: dict,
    default: Optional[float] = None,
) -> Optional[float]:
    """
    Look up a value for a car, preferring the most specific key available.

    The `(brand, model)` tuple is tried in `from_brand_model` first. If that
    pair is not a key, `brand` is tried in `from_brand`. If neither key is
    present, `default` is returned.

    Args:
        brand: Brand name; used alone as the key into `from_brand`.
        model: Model name; combined with `brand` as the key into `from_brand_model`.
        from_brand_model: Mapping keyed by `(brand, model)` tuples.
        from_brand: Mapping keyed by brand.
        default: Value returned when neither lookup succeeds. Defaults to
            None, which is what this function returned before the parameter
            existed.

    Returns:
        The stored value for the most specific key found, otherwise `default`.
    """
    pair = (brand, model)
    if pair in from_brand_model:
        return from_brand_model[pair]
    if brand in from_brand:
        return from_brand[brand]

    return default

In [46]:
def _lookup_avg_price(row):
    """Resolve the `avg_price` for one test row from the brand/model lookup tables."""
    return retrieve_value(
        row['brand'],
        row['model'],
        avg_price_from_brand_model,
        avg_price_from_brand,
    )


test['avg_price'] = test.apply(_lookup_avg_price, axis=1)

In [47]:
# Fraction of test rows for which no `avg_price` could be looked up.
missing_avg_price = test['avg_price'].isnull()
missing_avg_price.sum() / len(test)

np.float64(0.0)

In [48]:
# Model inputs and target, declared once so both partitions are guaranteed to
# use the same columns in the same order.
feature_columns = ["is.clean_title", "milage_per_year", "had_accident", "avg_price", "model_year"]
target_column = "price"

X_train, y_train = train[feature_columns], train[target_column]
X_val, y_val = val[feature_columns], val[target_column]

## Use MLflow

In [49]:
from sklearn.ensemble import RandomForestRegressor

import mlflow

import numpy as np
from sklearn.metrics import mean_squared_error


In [50]:
import time

n_estimators_list = [10, 50, 100, 200]
max_depth_list = [5, 10, 20, 30]

# Tracking server location. The default is the address this notebook has always
# used; set the MLFLOW_TRACKING_URI environment variable to point somewhere
# else without editing the cell.
# NOTE(review): a 403 with an empty response body from this address suggests
# that something other than an MLflow server is answering on the port (on
# macOS, AirPlay Receiver is a frequent occupant of port 5000) - TODO confirm,
# then start the server on a free port and export MLFLOW_TRACKING_URI.
tracking_uri = os.environ.get('MLFLOW_TRACKING_URI', 'http://127.0.0.1:5000')
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment('random-forest')

# One MLflow run per (n_estimators, max_depth) combination, in the same order
# the nested loops used to visit them.
for n_estimators, max_depth in itertools.product(n_estimators_list, max_depth_list):
    run_name = f'{time.strftime("%Y%m%d-%H%M%S")}-{n_estimators}-{max_depth}'
    with mlflow.start_run(nested=True, run_name=run_name) as run:
        # Log the hyper-parameters first so a run whose fit fails is still
        # identifiable in the tracking UI.
        mlflow.log_params({
            "n_estimators": n_estimators,
            "max_depth": max_depth,
        })

        forest = RandomForestRegressor(
            n_estimators=n_estimators,
            random_state=42,
            criterion='squared_error',
            max_depth=max_depth,
        )
        forest.fit(X_train, y_train)
        y_pred = forest.predict(X_val)

        # Root of the mean squared error on the validation split.
        val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mlflow.log_metrics({
            "val_rmse": val_rmse,
        })

        mlflow.sklearn.log_model(forest, f"random-forest-{n_estimators}-{max_depth}")

MlflowException: API request to endpoint /api/2.0/mlflow/experiments/get-by-name failed with error code 403 != 200. Response body: ''