# Get the Car Observation from the requested URL

In [None]:
def scrape_car_offer_val(url, timeout=30):
    """Scrape one car offer page into a single-row DataFrame.

    The page is downloaded and parsed with BeautifulSoup. Every ``<dt>``/``<dd>``
    pair found in the document becomes one column (newlines are stripped from
    both the label and the value). The columns ``url``, ``date``, ``time``,
    ``model`` and ``brand`` are added, and ``Barzahlungspreis`` is read from
    the price element when the ``<dt>`` labels did not already provide it.

    Args:
        url: Address of the offer page to scrape.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout ``requests`` may block indefinitely.

    Returns:
        A one-row ``pd.DataFrame`` describing the offer, or an empty
        ``pd.DataFrame`` if any step fails (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Fail early on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        doc = BeautifulSoup(response.text, "html.parser")

        curr_car_dict = {}
        attribute_exception_list = ['Some', 'Attributes', 'To', 'Exclude']  # Update this list with actual attributes to exclude

        # NOTE(review): labels and values are paired purely by position; this
        # assumes the page has exactly one <dd> per <dt> — confirm.
        for key, value in zip(doc.find_all("dt"), doc.find_all("dd")):
            # Compare the cleaned label, i.e. the name that ends up as column.
            label = key.text.replace("\n", "")
            if label not in attribute_exception_list:
                curr_car_dict[label] = value.text.replace("\n", "")

        # Take the timestamp once so date and time always belong together.
        scraped_at = datetime.now()
        curr_car_dict["url"] = url
        curr_car_dict["date"] = scraped_at.strftime("%Y-%m-%d")
        curr_car_dict["time"] = scraped_at.strftime("%H-%M-%S")
        curr_car_dict["model"] = doc.find("span", class_="StageTitle_model__EbfjC StageTitle_boldClassifiedInfo__sQb0l").get_text()
        curr_car_dict["brand"] = doc.find("span", class_="StageTitle_boldClassifiedInfo__sQb0l").get_text()

        # Check if the price column exists, if not, add it
        if 'Barzahlungspreis' not in curr_car_dict:
            price_wrapper = doc.find('div', class_='PriceInfo_wrapper__hreB_')
            price_text = price_wrapper.find('span', class_='PriceInfo_price__XU0aF').text.strip()
            # Keep everything up to and including the first "-".
            curr_car_dict['Barzahlungspreis'] = re.split(r'(?<=-)', price_text)[0]

        # Create DataFrame from the dictionary
        car_offer_df = pd.DataFrame.from_dict(curr_car_dict, orient="index").T

        return car_offer_df

    except Exception as e:
        # Deliberately broad: callers receive an empty frame for any failure
        # (network error, missing page element, ...).
        print(f"Error occurred while scraping the URL: {e}")
        return pd.DataFrame()



# Clean and Dummy-Encode the New Observation in One Step

## Match Format of New Observation with Cleaned/Dummy Transformed Data

In [None]:
def concat_with_first_columns_val(df_list):
    """Stack DataFrames using the column layout of the first one.

    Every DataFrame after the first is reindexed to the first frame's
    columns: columns it lacks are filled with NaN and columns the first frame
    does not have are dropped. The frames are then concatenated row-wise with
    a fresh 0..n-1 index.

    Args:
        df_list: List of DataFrames; the first entry defines the columns.

    Returns:
        The concatenated DataFrame, or an empty DataFrame for an empty list.
    """
    if not df_list:
        return pd.DataFrame()

    reference, *remaining = df_list
    target_columns = reference.columns

    aligned_frames = [reference]
    for frame in remaining:
        aligned_frames.append(frame.reindex(columns=target_columns))

    return pd.concat(aligned_frames, ignore_index=True)


## Preprocess Observation

In [3]:
def process_car_data_val(url, scraped_data_path, helper_data_path_cleaned, helper_data_path_dummied, output_path):
    """Scrape one car offer and bring it into the dummy-encoded helper layout.

    The new observation is scraped from ``url``, appended to the previously
    scraped rows of the same brand and model, run through the cleaning
    helpers together with them, and then reduced back to the row(s) whose
    URL equals ``url``. Finally the row is aligned to the columns of the most
    recent cleaned and dummied helper files.

    Args:
        url: Address of the car offer to process.
        scraped_data_path: CSV file with the previously scraped offers.
        helper_data_path_cleaned: Location passed to ``get_most_recent_file``
            to find the cleaned helper CSV.
        helper_data_path_dummied: Location passed to ``get_most_recent_file``
            to find the dummied helper CSV.
        output_path: Not used inside this function.

    Returns:
        DataFrame with the columns of the dummied helper file, containing the
        row(s) for ``url`` with remaining NaNs filled with 0 and a reset index.
    """
    # Set the maximum number of displayed rows and columns
    # (this changes the pandas display options globally).
    pd.set_option('display.max_rows', 1000)
    pd.set_option('display.max_columns', 1000)

    # Import scraped data
    scraped_data = pd.read_csv(scraped_data_path, low_memory=False)

    # Get new car observation from URL
    # NOTE(review): scrape_car_offer_val returns an empty DataFrame on failure;
    # in that case the "brand" lookup below raises KeyError.
    new_car_observation = scrape_car_offer_val(url)

    # Reduce scraped data to rows with the same brand and model
    brand_list = new_car_observation["brand"].tolist()
    model_list = new_car_observation["model"].tolist()

    reduced_scraped_data = pd.DataFrame()

    for curr_brand, curr_model in zip(brand_list, model_list):
        curr_df = scraped_data[(scraped_data["brand"] == curr_brand) & (scraped_data["model"] == curr_model)]
        reduced_scraped_data = pd.concat([reduced_scraped_data, curr_df])

    # Delete full scraped data to save memory. scraped_data is a local name,
    # so it is released unconditionally (a globals() lookup would not see it).
    del scraped_data
    gc.collect()

    # Append new observation to scraped data
    processing_data = pd.concat([reduced_scraped_data, new_car_observation])

    # Clean dataframe from observation where values have shifted
    # NOTE(review): allowance_dict is built but never passed on — confirm
    # whether clean_from_rows_with_shifted_columns is meant to receive it.
    allowance_dict = {"Umweltplakette": ["1 (Keine)", "4 (Grün)", "3 (Gelb)", "2 (Rot)", np.nan],
                      "Antriebsart": ['Heck', 'Front', 'Allrad', np.nan],
                      "Scheckheftgepflegt": ["Ja", "Nein", np.nan]}
    df_step2 = clean_from_rows_with_shifted_columns(processing_data)

    # Drop unnamed columns
    df_step3_1 = drop_unnamed_columns(df_step2)

    # Drop unwanted columns
    columns_to_drop = ["€ 0,-", "Pkw Zulassung", "Ladevolumen", "zul. Zuggewicht", "Achsen", "Nutzlast", "Radstand", "Laderaumhöhe",
                       "Laderaumbreite", "Laderaumlänge", "zul. Gesamtgewicht", "Sonderzahlung", "Verfügbarkeit", "Batteriebesitz",
                       "Letzter Zahnriemenwechsel", "Verfügbar ab", "Fahrzeugstand", "Energieeffizienzklasse", "CO₂-Effizienz",
                       "Bearbeitungsgebühren", "Bruttodarlehensbetrag", "Sollzins geb. p.a.", "Letzte Inspektion", "Baujahr",
                       "CO₂-Klasse", "Schlüsselnummer", "Gänge", "Angebotsnummer", "Farbe laut Hersteller",
                       "Farbe der Innenausstattung", "Andere Energieträger", "Fahrzeugzustand",
                       "Ladezeit von 10% bis 80%", "Fahrleistung p.a.", "Zulassungskosten", "HU"]
    df_step3_2 = drop_unwanted_columns(df_step3_1, columns_to_drop)

    # Always (re)create the secondary range column so the merge below finds it.
    df_step3_2["Elektrische Reichweite7"] = 0
    # Any string value and any missing value in the owner column becomes 1.
    df_step3_2["Fahrzeughalter"] = df_step3_2["Fahrzeughalter"].apply(lambda x: 1 if isinstance(x, str) else x)
    df_step3_2["Fahrzeughalter"] = df_step3_2["Fahrzeughalter"].fillna(1)

    # Keep wanted columns
    wanted_columns = [
        'Barzahlungspreis', 'brand', 'model', 'Kilometerstand', 'Leistung', 'Kraftstoff', 'Antriebsart', 'Karosserieform',
        'Fahrzeugart', 'Sitzplätze', 'Türen', 'Fahrzeughalter', 'Getriebe', 'Hubraum', 'Zylinder', 'Kraftstoffverbrauch',
        'Schadstoffklasse', 'Umweltplakette', 'Komfort', 'Extras', 'Außenfarbe', 'Innenausstattung', 'url',
        'Nichtraucherfahrzeug', 'Leergewicht', 'Unterhaltung/Media', 'Sicherheit', 'CO₂-Emissionen', 'Lackierung',
        'Scheckheftgepflegt', 'Garantie', 'Taxi oder Mietwagen', 'Stromverbrauch', 'Elektrische Reichweite',
        "Elektrische Reichweite7", 'date', 'Erstzulassung', 'time']
    df_step3_2 = df_step3_2[wanted_columns]

    # Merge columns
    df_step3_3_1 = merge_columns(df_step3_2, "Elektrische Reichweite", "Elektrische Reichweite7")

    # Convert strings to floats/integers
    replacement_dict_garantie = {"Nein": "0", "Ja": "12"}
    df_step4_1_1 = replace_values_single(df_step3_3_1, "Garantie", replacement_dict_garantie)

    replacement_dict_brand = {"porsche": "Porsche", "audi": "Audi", "opel": "Opel", "skoda": "Skoda", "toyota": "Toyota"}
    df_step4_1_1 = replace_values_single(df_step4_1_1, "brand", replacement_dict_brand)

    replacement_dict = {"Stromverbrauch": {",": "."}, "Leistung": {",": "."}, "Hubraum": {",": "."},
                        "Leergewicht": {",": "."}, "CO₂-Emissionen": {",": "."}, "Kilometerstand": {".": ""}}
    df_step4_1_2 = replace_values_multiple(df_step4_1_1, replacement_dict)

    column_names = ["brand", "model"]
    df_step4_1_3 = remove_trailing_whitespace(df_step4_1_2, column_names)

    df_step4_1_4 = transform_fuel_types(df_step4_1_3, "Kraftstoff")

    split_dict_simple = {"Garantie": " ", "Stromverbrauch": " ", "Leistung": " ", "Hubraum": " ", "Leergewicht": " "}
    df_step4_2_1 = split_string_to_1integer(df_step4_1_4, split_dict_simple)

    # NOTE(review): on pandas versions where astype(str) turns NaN into the
    # string "nan", the fillna calls below have nothing left to fill —
    # confirm whether the defaults are meant to be applied before the cast.
    df_step4_2_1["CO₂-Emissionen"] = df_step4_2_1["CO₂-Emissionen"].astype(str)
    df_step4_2_1["CO₂-Emissionen"] = df_step4_2_1["CO₂-Emissionen"].fillna("20")

    df_step4_2_1["Kilometerstand"] = df_step4_2_1["Kilometerstand"].astype(str)
    df_step4_2_1["Kilometerstand"] = df_step4_2_1["Kilometerstand"].fillna("2000km")

    df_step4_2_1["Elektrische Reichweite"] = df_step4_2_1["Elektrische Reichweite"].astype(str)
    df_step4_2_1["Elektrische Reichweite"] = df_step4_2_1["Elektrische Reichweite"].fillna("0km")

    split_dict_complex = {"CO₂-Emissionen": " ", "Kilometerstand": " ", "Elektrische Reichweite": " "}
    df_step4_2_2 = split_string_to_1integer_complex(df_step4_2_1, split_dict_complex)

    df_step4_3_1 = convert_fuel_consumption(df_step4_2_2, "Kraftstoffverbrauch")
    df_step4_3_2 = convert_barzahlungspreis_to_float(df_step4_3_1, "Barzahlungspreis")

    # Fill missing values with defaults, medians and group modes
    replacement_dict_default = {"Scheckheftgepflegt": ["Nein"], "Nichtraucherfahrzeug": ["Nein"], "Garantie": [0],
                                "Taxi oder Mietwagen": ["Nein"], "Stromverbrauch": [0], "Elektrische Reichweite": [0],
                                "Lackierung": ["Keine Angabe"], "Außenfarbe": ["keine"]}
    df_step5_1 = missing_values_to_default(df_step4_3_2, replacement_dict_default)

    df_step5_2_1 = get_median_value_based_on_erstzulassung(df_step5_1, "Fahrzeughalter", "Erstzulassung")

    replacement_columns_2groups = ["Antriebsart", "Sitzplätze", "Türen"]
    df_step5_2_2 = fill_missing_with_mode_by_2groups(df_step5_2_1, replacement_columns_2groups, "brand", "model")

    replacement_columns_3groups = ["Schadstoffklasse", "Umweltplakette", "Getriebe", "Hubraum",
                                   "CO₂-Emissionen", "Zylinder", "Leergewicht", "Kraftstoff"]
    df_step5_2_3 = fill_missing_with_mode_by_3groups(df_step5_2_2, replacement_columns_3groups, "brand", "model", "Leistung")

    # Split multi-feature columns
    features_to_columns = ["Sicherheit", "Innenausstattung", "Unterhaltung/Media", "Extras", "Komfort"]
    df_step6 = create_feature_columns(df_step5_2_3, features_to_columns)

    columns_to_drop_features = ["Sicherheit", "Innenausstattung", "Unterhaltung/Media", "Extras", "Komfort"]
    df_step6 = drop_unwanted_columns(df_step6, columns_to_drop_features)

    # Date columns
    df_step7_1_1 = convert_to_timestamp(df_step6, "date", "time")
    df_step7_1_2 = drop_unwanted_columns(df_step7_1_1, ["time", "date"])
    df_step7_2_1 = split_month_year(df_step7_1_2, "Erstzulassung")
    df_step7_2_2 = convert_zulassung_to_age(df_step7_2_1, "Erstzulassung_Jahr", "Erstzulassung_Monat")
    df_step7_2_3 = drop_unwanted_columns(df_step7_2_2, ["Erstzulassung", "Erstzulassung_Monat"])

    # Rename columns (first to the German target names, then to English)
    rename_dict = {"Barzahlungspreis": "Preis", "Leistung": "Leistung_PS", "CO₂-Emissionen": "CO2-Emissionen",
                   "model": "Modell", "brand": "Hersteller", "url": "URL"}
    df_step10_1 = df_step7_2_3.rename(columns=rename_dict)

    translation_dict = {
        'Preis': 'Price', 'Hersteller': 'Manufacturer', 'Modell': 'Model', 'Kilometerstand': 'Mileage',
        'Leistung_PS': 'Power_HP', 'Kraftstoff': 'Fuel', 'Antriebsart': 'Drive_type', 'Karosserieform': 'Body_type',
        'Fahrzeugart': 'Vehicle_type', 'Sitzplätze': 'Seats', 'Türen': 'Doors', 'Fahrzeughalter': 'Owners',
        'Getriebe': 'Transmission', 'Hubraum': 'Displacement', 'Zylinder': 'Cylinders', 'Kraftstoffverbrauch': 'Fuel_consumption',
        'Schadstoffklasse': 'Emission_class', 'Umweltplakette': 'Environmental_sticker', 'Komfort': 'Comfort', 'Extras': 'Extras',
        'Außenfarbe': 'Exterior_color', 'Innenausstattung': 'Interior_features', 'URL': 'URL',
        'Nichtraucherfahrzeug': 'Non_smoker_vehicle', 'Leergewicht': 'Curb_weight', 'Unterhaltung/Media': 'Entertainment/Media',
        'Sicherheit': 'Safety', 'CO2-Emissionen': 'CO2_emissions', 'Lackierung': 'Paint', 'Scheckheftgepflegt': 'Full_service_history',
        'Garantie': 'Warranty', 'Taxi oder Mietwagen': 'Taxi_or_rental', 'Stromverbrauch': 'Electricity_consumption',
        'Elektrische Reichweite': 'Electric_range', 'date_scraped': 'Date_scraped', 'Alter': 'Age'}
    df_step10_2 = df_step10_1.rename(columns=translation_dict)

    # Rearrange columns
    first_columns = ["Price", "Manufacturer", "Model", "Mileage", "Power_HP", "Fuel", "Drive_type"]
    df_step11 = rearrange_columns(df_step10_2, first_columns)

    # Reduce data back to wanted observation
    # NOTE(review): if the scraped CSV already holds this URL, more than one
    # row survives this filter — confirm that this is acceptable.
    cleaned_new_observation = df_step11[df_step11["URL"] == url]

    # Fill rest of NaNs
    cleaned_new_observation = cleaned_new_observation.fillna(0)

    # Use helper data to get the right shape
    most_recent_file = get_most_recent_file(helper_data_path_cleaned, 1)
    helper_data_cleaned = pd.read_csv(most_recent_file, low_memory=False)
    concatenated_df_cleaned = concat_with_first_columns_val([helper_data_cleaned, cleaned_new_observation])
    concatenated_df_cleaned = concatenated_df_cleaned[concatenated_df_cleaned["URL"] == url]

    # Dummied helper data
    columns_to_integers = ["Seats", "Power_HP", "Displacement", "Cylinders", "Warranty"]
    columns_to_date = ["Date_scraped"]
    columns_to_floats = []
    columns_to_categorical = []
    columns_to_drop = []

    data_type_transformed = data_type_transformer(concatenated_df_cleaned, columns_to_integers, columns_to_date,
                                                  columns_to_floats, columns_to_categorical, columns_to_drop)

    # Dummy-encode every object column except the identifier/date columns
    columns_to_dummy = list(data_type_transformed.select_dtypes(include=['object']).columns)
    columns_to_exclude = ["URL", "Date_scraped"]

    for curr_column in columns_to_exclude:
        if curr_column in columns_to_dummy:
            columns_to_dummy.remove(curr_column)

    data_dummy_transformed = pd.get_dummies(data_type_transformed, columns=columns_to_dummy,
                                            drop_first=False, dtype=float)

    # Align with the dummied helper data; dummy columns the new observation
    # does not produce come back as NaN and are set to 0 below.
    most_recent_file_dummied = get_most_recent_file(helper_data_path_dummied, 1)
    helper_data_dummied = pd.read_csv(most_recent_file_dummied, low_memory=False)
    concatenated_df_dummied = concat_with_first_columns_val([helper_data_dummied, data_dummy_transformed])
    concatenated_df_dummied = concatenated_df_dummied[concatenated_df_dummied["URL"] == url]

    dummied_new_observation = concatenated_df_dummied.fillna(0)

    dummied_new_observation.reset_index(drop = True, inplace = True)

    return dummied_new_observation


# Get Prediction Results on New Observation (All in One)

In [1]:
def get_prediction_results_val(url, scraped_data_path, helper_data_path_cleaned, helper_data_path_dummied, output_path,
                          fixed_procentual_error, target_folder_model):
    """Value the car offer at ``url`` with the most recent saved model.

    The offer is preprocessed with ``process_car_data_val``, the newest model
    file found for ``target_folder_model`` is loaded with ``joblib`` and used
    to predict a price for the first row of the preprocessed observation.

    Args:
        url: Address of the car offer.
        scraped_data_path: Forwarded to ``process_car_data_val``.
        helper_data_path_cleaned: Forwarded to ``process_car_data_val``.
        helper_data_path_dummied: Forwarded to ``process_car_data_val``.
        output_path: Forwarded to ``process_car_data_val``.
        fixed_procentual_error: Relative error as a fraction; the bounds are
            ``fair_price * (1 - error)`` and ``fair_price * (1 + error)``.
        target_folder_model: Passed to ``get_data_path`` to locate the model
            files.

    Returns:
        Tuple ``(real_price, fair_price, lower_bound, upper_bound)`` where
        ``real_price`` is the offer's ``Price`` as a float and ``fair_price``
        is the model's first prediction.
    """
    # get the dummied new observation from URL using the prior defined function
    dummied_new_observation = process_car_data_val(url, scraped_data_path, helper_data_path_cleaned, helper_data_path_dummied, output_path)

    # define if you want the most recent file (1), second most recent (2), ...
    n_most_recent_file = 1

    # get the input path
    input_path_model = get_data_path(target_folder_model)

    # load the model
    # NOTE(review): joblib.load unpickles the file; only load model files from
    # a trusted location.
    most_recent_model_file = get_most_recent_file(input_path_model, n_most_recent_file)
    xgb_model = joblib.load(most_recent_model_file)

    # Select the first row explicitly, matching the [0] taken from the
    # prediction; calling float() on a whole Series is deprecated in pandas.
    real_price = float(dummied_new_observation["Price"].iloc[0])
    model_input = dummied_new_observation.drop(columns=["Price", "URL", "Date_scraped"])
    fair_price = xgb_model.predict(model_input)[0]

    lower_bound = fair_price*(1-fixed_procentual_error)
    upper_bound = fair_price*(1+fixed_procentual_error)

    print("Car Valuation Done!")

    return real_price, fair_price, lower_bound, upper_bound

In [None]:
def get_prediction_results_preloaded_model_val(url, scraped_data_path, helper_data_path_cleaned, helper_data_path_dummied, output_path,
                          fixed_procentual_error, model):
    """Value the car offer at ``url`` with an already loaded model.

    The offer is preprocessed with ``process_car_data_val`` and ``model`` is
    used to predict a price for the first row of the preprocessed observation.

    Args:
        url: Address of the car offer.
        scraped_data_path: Forwarded to ``process_car_data_val``.
        helper_data_path_cleaned: Forwarded to ``process_car_data_val``.
        helper_data_path_dummied: Forwarded to ``process_car_data_val``.
        output_path: Forwarded to ``process_car_data_val``.
        fixed_procentual_error: Relative error as a fraction; the bounds are
            ``fair_price * (1 - error)`` and ``fair_price * (1 + error)``.
        model: Object with a ``predict`` method that accepts the preprocessed
            observation without the ``Price``, ``URL`` and ``Date_scraped``
            columns.

    Returns:
        Tuple ``(real_price, fair_price, lower_bound, upper_bound)`` where
        ``real_price`` is the offer's ``Price`` as a float and ``fair_price``
        is the model's first prediction.
    """
    # get the dummied new observation from URL using the prior defined function
    dummied_new_observation = process_car_data_val(url, scraped_data_path, helper_data_path_cleaned, helper_data_path_dummied, output_path)
    xgb_model = model

    # Select the first row explicitly, matching the [0] taken from the
    # prediction; calling float() on a whole Series is deprecated in pandas.
    real_price = float(dummied_new_observation["Price"].iloc[0])
    model_input = dummied_new_observation.drop(columns=["Price", "URL", "Date_scraped"])
    fair_price = xgb_model.predict(model_input)[0]

    lower_bound = fair_price*(1-fixed_procentual_error)
    upper_bound = fair_price*(1+fixed_procentual_error)

    print("Car Valuation Done!")

    return real_price, fair_price, lower_bound, upper_bound