In [1]:
import pandas as pd
# Read the training CSV; later cells preprocess and split this frame.
df = pd.read_csv('../data/train.csv')

In [2]:
def preprocess(df: pd.DataFrame, reference_year: int = 2024) -> pd.DataFrame:
    """Return a copy of ``df`` with the engineered feature columns added.

    Columns read: ``clean_title``, ``model_year``, ``milage``, ``accident``,
    ``brand``, ``model`` and ``price``.

    Columns written:
      * ``clean_title``     - missing values replaced by the string ``"NaN"``.
      * ``is.clean_title``  - True where ``clean_title`` equals ``"Yes"``.
      * ``age``             - ``reference_year - model_year``, floored at 1.
      * ``milage_per_year`` - ``milage / age``.
      * ``had_accident``    - True where ``accident`` equals the
                              "At least 1 accident or damage reported" label.
      * ``avg_price``       - mean ``price`` of the row's (brand, model) group.

    Args:
        df: Input frame; it is not modified.
        reference_year: Year that vehicle age is measured against.

    Returns:
        A new DataFrame containing the original columns plus the ones above.

    Note:
        ``avg_price`` is derived from the target column over every row passed
        in, so a frame that mixes training and validation rows lets validation
        prices influence the feature.
    """
    res = df.copy()
    res["clean_title"] = res["clean_title"].fillna("NaN")
    res["is.clean_title"] = res["clean_title"] == "Yes"
    # Floor at 1 so current/future model years cannot produce a zero or
    # negative divisor for milage_per_year.
    res["age"] = (reference_year - res["model_year"]).clip(lower=1)
    res["milage_per_year"] = res["milage"] / res["age"]
    res["had_accident"] = res["accident"] == "At least 1 accident or damage reported"
    res["avg_price"] = res.groupby(["brand", "model"])["price"].transform("mean")
    return res

In [3]:
from sklearn.model_selection import train_test_split
preprocessed = preprocess(df)
train, val = train_test_split(preprocessed, test_size=0.3, random_state=42)

# `preprocess` computes `avg_price` as a mean of the target over every row, so
# the values carried into `val` were partly computed from validation prices.
# Re-derive the feature from training rows only so the validation score is not
# inflated by target leakage.
train_prices = train.groupby(["brand", "model"])["price"]
brand_model_mean = train_prices.mean().rename("avg_price")
brand_mean = train.groupby("brand")["price"].mean()

train = train.assign(avg_price=train_prices.transform("mean"))

# Validation rows look up the training means: (brand, model) first, then the
# brand mean, then the overall training mean for brands unseen in training.
val = val.drop(columns="avg_price").join(brand_model_mean, on=["brand", "model"])
val["avg_price"] = (
    val["avg_price"]
    .fillna(val["brand"].map(brand_mean))
    .fillna(train["price"].mean())
)

In [4]:
def get_avg_price_from_brand_model(df: pd.DataFrame) -> dict:
    """Map each ``(brand, model)`` pair to the mean of its ``avg_price`` values.

    Returns:
        A dict keyed by ``(brand, model)`` tuples.
    """
    by_brand_model = df.groupby(['brand', 'model'])
    mean_avg_price = by_brand_model['avg_price'].mean()
    return mean_avg_price.to_dict()

def get_avg_price_from_brand(df: pd.DataFrame) -> dict:
    """Map each ``brand`` to the mean of its rows' ``avg_price`` values.

    Returns:
        A dict keyed by brand.
    """
    by_brand = df.groupby('brand')
    mean_avg_price = by_brand['avg_price'].mean()
    return mean_avg_price.to_dict()

In [5]:
# Lookup keyed by (brand, model) tuples, built from the full preprocessed frame.
avg_price_from_brand_model = get_avg_price_from_brand_model(preprocessed)

# Coarser lookup keyed by brand only, built from the same frame.
avg_price_from_brand = get_avg_price_from_brand(preprocessed)

In [6]:
# Read the test CSV; `avg_price` is filled in for it from the lookup dicts below.
test = pd.read_csv('../data/test.csv')

In [7]:
def retrieve_value(brand: str, model: str, from_brand_model: dict, from_brand: dict) -> "float | None":
    """Look up a value for a vehicle, preferring the most specific key.

    Args:
        brand: Brand used as the fallback key.
        model: Model; combined with ``brand`` into the ``(brand, model)`` key.
        from_brand_model: Dict keyed by ``(brand, model)`` tuples.
        from_brand: Dict keyed by ``brand``.

    Returns:
        ``from_brand_model[(brand, model)]`` when that pair is present,
        otherwise ``from_brand[brand]`` when the brand is present,
        otherwise ``None``.
    """
    key = (brand, model)
    if key in from_brand_model:
        return from_brand_model[key]
    # dict.get yields None for an unknown brand, which is the documented miss value.
    return from_brand.get(brand)

In [8]:
def _lookup_avg_price(row):
    """Resolve one row's avg_price from the (brand, model) / brand lookup dicts."""
    return retrieve_value(row['brand'], row['model'], avg_price_from_brand_model, avg_price_from_brand)


test['avg_price'] = test.apply(_lookup_avg_price, axis=1)

In [9]:
# Fraction of test rows for which no avg_price could be looked up.
test['avg_price'].isna().mean()

np.float64(0.0)

In [14]:
# Single source of truth for the model inputs, shared by both splits.
FEATURE_COLUMNS = ["is.clean_title", "milage_per_year", "had_accident", "avg_price", "model_year"]
TARGET_COLUMN = "price"

X_train, y_train = train[FEATURE_COLUMNS], train[TARGET_COLUMN]
X_val, y_val = val[FEATURE_COLUMNS], val[TARGET_COLUMN]

## Track experiments with MLflow

In [16]:
from sklearn.ensemble import RandomForestRegressor

import mlflow

import numpy as np
from sklearn.metrics import mean_squared_error


In [38]:
import time

n_estimators_list = [10, 50, 100, 200]
max_depth_list = [5, 10, 20, 30]

mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('random-forest')

# Every (n_estimators, max_depth) combination, in the same order the nested
# loops would visit them: max_depth varies fastest.
param_grid = [(n, d) for n in n_estimators_list for d in max_depth_list]

for n_estimators, max_depth in param_grid:
    # Timestamp prefix keeps run names distinct across repeated executions.
    run_name = f'{time.strftime("%Y%m%d-%H%M%S")}-{n_estimators}-{max_depth}'
    with mlflow.start_run(nested=True, run_name=run_name) as run:
        forest = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            criterion='squared_error',
            random_state=42,
        )
        forest.fit(X_train, y_train)
        y_pred = forest.predict(X_val)
        val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        mlflow.log_params({"n_estimators": n_estimators, "max_depth": max_depth})
        mlflow.log_metrics({"val_rmse": val_rmse})
        mlflow.sklearn.log_model(forest, f"random-forest-{n_estimators}-{max_depth}")

2024/09/10 18:20:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run 20240910-182032-10-5 at: http://127.0.0.1:5000/#/experiments/728286377558395095/runs/74da2ac63a2349c0b863ba74a9fb269a.
2024/09/10 18:20:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/728286377558395095.
2024/09/10 18:20:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run 20240910-182036-10-10 at: http://127.0.0.1:5000/#/experiments/728286377558395095/runs/e4636783ca164813aae68eadebc6b723.
2024/09/10 18:20:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/728286377558395095.
2024/09/10 18:20:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run 20240910-182041-10-20 at: http://127.0.0.1:5000/#/experiments/728286377558395095/runs/f0224850a2c9448f8334d63b90577d79.
2024/09/10 18:20:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/expe