In [1]:
import pandas as pd
# Load the training data; the relative path is resolved against the current working directory.
df = pd.read_csv('../data/train.csv')

In [2]:
def preprocess(df: pd.DataFrame, reference_year: int = 2024) -> pd.DataFrame:
    """
    Return a copy of `df` with the engineered feature columns added.

    Columns written on the returned frame:
      - `clean_title`: missing values replaced by the literal string "NaN".
      - `is.clean_title`: True where `clean_title` equals "Yes".
      - `age`: `reference_year - model_year`, floored at 1.
      - `milage_per_year`: `milage / age`.
      - `had_accident`: True where `accident` equals
        "At least 1 accident or damage reported".
      - `avg_price`: mean of `price` over the row's (`brand`, `model`) group,
        computed within `df` itself.

    Args:
        df: Input frame. Must contain the columns `clean_title`, `model_year`,
            `milage`, `accident`, `brand`, `model` and `price`.
        reference_year: Year that `model_year` is subtracted from to obtain
            `age`. Defaults to 2024.

    Returns:
        A new DataFrame; `df` is not modified.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    res = df.copy()
    res["clean_title"] = res["clean_title"].fillna("NaN")
    res["is.clean_title"] = res["clean_title"] == "Yes"
    # Floor at 1 so the division below never sees a zero or negative age.
    res["age"] = (reference_year - res["model_year"]).clip(lower=1)
    res["milage_per_year"] = res["milage"] / res["age"]
    res["had_accident"] = res["accident"] == "At least 1 accident or damage reported"
    # NOTE(review): this feature is derived from `price` of the very rows it is
    # attached to; if `price` is the prediction target, confirm this is intended.
    res["avg_price"] = res.groupby(['brand', 'model'])['price'].transform('mean')
    return res

In [3]:
from sklearn.model_selection import train_test_split
# Engineer features on the full training frame, then hold out 30% for validation.
# NOTE(review): preprocess() derives `avg_price` from `price` before this split,
# so validation rows contribute to their own feature value - confirm this is acceptable.
preprocessed = preprocess(df)
train, val = train_test_split(
    preprocessed,
    test_size=0.3,
    random_state=42,
)

In [25]:
def get_avg_price_from_brand_model(df: pd.DataFrame) -> dict:
    """
    Build a lookup from (brand, model) to the mean of `avg_price`.

    Args:
        df: Frame containing the columns `brand`, `model` and `avg_price`.

    Returns:
        Dict keyed by `(brand, model)` tuples whose values are the mean
        `avg_price` over the rows of that group.
    """
    by_brand_model = df.groupby(['brand', 'model'])
    mean_per_pair = by_brand_model['avg_price'].mean()
    return mean_per_pair.to_dict()

def get_avg_price_from_brand(df: pd.DataFrame) -> dict:
    """
    Build a lookup from brand to the mean of `avg_price`.

    Args:
        df: Frame containing the columns `brand` and `avg_price`.

    Returns:
        Dict keyed by brand whose values are the mean `avg_price` over all
        rows of that brand.
    """
    by_brand = df.groupby('brand')
    mean_per_brand = by_brand['avg_price'].mean()
    return mean_per_brand.to_dict()

In [26]:
# Lookup tables built from `preprocessed`, i.e. every row of the training CSV
# (both the train and val portions of the split).
# (brand, model) -> mean avg_price: the primary lookup.
avg_price_from_brand_model = get_avg_price_from_brand_model(preprocessed)

# brand -> mean avg_price: the fallback when a (brand, model) pair is unknown.
avg_price_from_brand = get_avg_price_from_brand(preprocessed)

In [27]:
# Load the test set; the relative path is resolved against the current working directory.
test = pd.read_csv('../data/test.csv')

In [28]:
# Inspect the (brand, model) -> average price lookup. The bare name `avg_price`
# is never assigned in the cells above, so it fails with NameError on a fresh kernel.
avg_price_from_brand_model

{('Acura', '300 Touring'): 19500.0,
 ('Acura', '330 i xDrive'): 28500.0,
 ('Acura', '335 is'): 17000.0,
 ('Acura', 'ATS 2.0L Turbo Luxury'): 10000.0,
 ('Acura', 'Acadia SLE-1'): 25625.0,
 ('Acura', 'Air Pure'): 84995.0,
 ('Acura', 'CC Sport'): 30089.0,
 ('Acura', 'CLK-Class CLK 350'): 26399.5,
 ('Acura', 'CX-30 Base'): 11000.0,
 ('Acura', 'Cascada Base'): 20599.0,
 ('Acura', 'Corvette Stingray w/1LT'): 157500.0,
 ('Acura', 'ES 350 Base'): 23999.0,
 ('Acura', 'Forte LXS'): 13998.0,
 ('Acura', 'GT'): 47798.0,
 ('Acura', 'ILX 2.0L w/Premium Package'): 17551.08510638298,
 ('Acura', 'ILX 2.4L'): 17406.473684210527,
 ('Acura', 'ILX Premium & A-SPEC Packages'): 26527.18181818182,
 ('Acura', 'ILX Premium Package'): 24447.42528735632,
 ('Acura', 'ILX Technology Plus Package'): 15555.703296703297,
 ('Acura', 'IS 350 Base'): 15000.0,
 ('Acura', 'Integra GS-R'): 9597.166666666666,
 ('Acura', 'Integra LS'): 7772.6875,
 ('Acura', 'Integra w/A-Spec Tech Package'): 45999.0,
 ('Acura', 'LS 460 Base'): 

In [29]:
def retrieve_value(brand: str, model: str, from_brand_model: dict, from_brand: dict) -> "float | None":
    """
    Look up a value for a car, preferring the most specific key available.

    The lookup order is:
      1. `from_brand_model[(brand, model)]` if that pair is present.
      2. `from_brand[brand]` if the brand is present.
      3. `None` when neither key exists.

    Args:
        brand: Brand name used as the first element of the pair key and as the
            fallback key.
        model: Model name used as the second element of the pair key.
        from_brand_model: Dict keyed by `(brand, model)` tuples.
        from_brand: Dict keyed by brand.

    Returns:
        The stored value for the first key that matches, or `None` if neither
        dictionary contains a matching key.
    """
    pair = (brand, model)
    if pair in from_brand_model:
        return from_brand_model[pair]
    # dict.get returns None for an unknown brand, which is the documented miss value.
    return from_brand.get(brand)

In [30]:
# Attach an `avg_price` feature to each test row: use the (brand, model) lookup
# when the pair is known, otherwise fall back to the brand-level lookup.
test['avg_price'] = test.apply(
    lambda row: retrieve_value(
        row['brand'],
        row['model'],
        avg_price_from_brand_model,
        avg_price_from_brand,
    ),
    axis=1,
)

In [31]:
# Fraction of test rows for which neither lookup produced a value.
test['avg_price'].isna().mean()

np.float64(0.0)