# Get the Car Observation from the requested URL

In [None]:
def scrape_car_offer(url, timeout=30):
    """Scrape one car-offer page into a single-row DataFrame.

    Every ``<dt>``/``<dd>`` pair on the page becomes a column (newlines
    stripped from both), and the columns ``url``, ``date``, ``time``,
    ``model`` and ``brand`` are added. If the page has no
    ``Barzahlungspreis`` attribute, the price is read from the price-info
    element instead.

    Args:
        url: Address of the offer page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        A one-row ``pandas.DataFrame`` with the scraped attributes, or an
        empty DataFrame if anything goes wrong (the error is printed, not
        raised, so a batch of URLs can continue after one failure).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Fail early on 4xx/5xx instead of parsing an error page as an offer.
        response.raise_for_status()
        doc = BeautifulSoup(response.text, "html.parser")

        curr_car_dict = {}
        attribute_exception_list = ['Some', 'Attributes', 'To', 'Exclude']  # Update this list with actual attributes to exclude

        # <dt> holds the attribute name, the <dd> at the same position its value.
        for key, value in zip(doc.find_all("dt"), doc.find_all("dd")):
            if key.text not in attribute_exception_list:
                curr_car_dict[key.text.replace("\n", "")] = value.text.replace("\n", "")

        # Take one timestamp so date and time always describe the same instant.
        scraped_at = datetime.now()
        curr_car_dict["url"] = url
        curr_car_dict["date"] = scraped_at.strftime("%Y-%m-%d")
        curr_car_dict["time"] = scraped_at.strftime("%H-%M-%S")
        curr_car_dict["model"] = doc.find("span", class_="StageTitle_model__EbfjC StageTitle_boldClassifiedInfo__sQb0l").get_text()
        curr_car_dict["brand"] = doc.find("span", class_="StageTitle_boldClassifiedInfo__sQb0l").get_text()

        # Fall back to the price element when the attribute list has no price.
        if 'Barzahlungspreis' not in curr_car_dict:
            price_text = doc.find('div', class_='PriceInfo_wrapper__hreB_').find('span', class_='PriceInfo_price__XU0aF').text.strip()
            # Keep everything up to and including the first "-".
            curr_car_dict['Barzahlungspreis'] = re.split(r'(?<=-)', price_text)[0]

        # Dict keys become columns; the result has exactly one row.
        car_offer_df = pd.DataFrame.from_dict(curr_car_dict, orient="index").T

        return car_offer_df

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty frame.
        print(f"Error occurred while scraping the URL: {e}")
        return pd.DataFrame()



# Clean and Dummy-Encode New Observations in One Step

## Match Format of New Observations with Cleaned/Dummy Transformed Data

In [None]:
def concat_with_first_columns(df_list):
    """Stack DataFrames vertically using the first frame's column layout.

    Every frame after the first is reindexed to the first frame's columns,
    so extra columns are dropped and missing ones are filled with NaN.

    Args:
        df_list: Sequence of DataFrames; the first one defines the columns.

    Returns:
        The concatenated DataFrame with a fresh 0..n-1 index, or an empty
        DataFrame when ``df_list`` is empty.
    """
    if not df_list:
        return pd.DataFrame()

    reference, *remaining = df_list
    target_columns = reference.columns

    aligned = [reference]
    for frame in remaining:
        # Force each later frame into the reference layout.
        aligned.append(frame.reindex(columns=target_columns))

    return pd.concat(aligned, ignore_index=True)


## Preprocess Observations

In [3]:
def process_car_data(urls, scraped_data_path, helper_data_path_cleaned, helper_data_path_dummied, output_path):
    """Scrape the given offers and turn them into dummy-encoded rows.

    The new offers are appended to the previously scraped rows of the same
    brand/model, run through the cleaning helpers, reduced back to the
    requested URLs, and reindexed to the column layout of the helper files
    returned by ``get_most_recent_file``.

    Args:
        urls: Offer URLs to scrape; also used to select the final rows.
        scraped_data_path: CSV file with previously scraped offers.
        helper_data_path_cleaned: Passed to ``get_most_recent_file`` to
            locate the cleaned helper CSV.
        helper_data_path_dummied: Passed to ``get_most_recent_file`` to
            locate the dummy-encoded helper CSV.
        output_path: CSV path the final rows are written to.

    Returns:
        DataFrame with the dummy-encoded rows for ``urls``, NaNs replaced by
        0 and a fresh 0..n-1 index.

    Side effects:
        Changes the global pandas display options and writes ``output_path``.
    """
    # Set the maximum number of displayed rows and columns
    pd.set_option('display.max_rows', 1000)
    pd.set_option('display.max_columns', 1000)

    # Import scraped data
    scraped_data = pd.read_csv(scraped_data_path, low_memory=False)

    # Get new car observations from URLs. The frames are concatenated once
    # instead of growing a DataFrame inside the loop, which would copy all
    # accumulated rows on every iteration.
    scraped_offers = [scrape_car_offer(curr_url) for curr_url in urls]
    new_car_observations = pd.concat(scraped_offers) if scraped_offers else pd.DataFrame()

    # Reduce scraped data to the brand/model combinations of the new offers.
    # Each pair is looked up only once (first-seen order) so that two URLs
    # for the same model do not duplicate the historical rows.
    brand_list = new_car_observations["brand"].tolist()
    model_list = new_car_observations["model"].tolist()
    brand_model_pairs = dict.fromkeys(zip(brand_list, model_list))

    matching_frames = [
        scraped_data[(scraped_data["brand"] == curr_brand) & (scraped_data["model"] == curr_model)]
        for curr_brand, curr_model in brand_model_pairs
    ]
    reduced_scraped_data = pd.concat(matching_frames) if matching_frames else pd.DataFrame()

    # Delete full scraped data to save memory. scraped_data is a local name,
    # so it is removed unconditionally (a globals() check never sees it).
    del scraped_data
    gc.collect()

    # Append new observations to scraped data
    processing_data = pd.concat([reduced_scraped_data, new_car_observations])

    # Clean dataframe from observations where values have shifted
    # NOTE(review): allowance_dict is built but never used in this function —
    # confirm whether clean_from_rows_with_shifted_columns should receive it.
    allowance_dict = {"Umweltplakette": ["1 (Keine)", "4 (Grün)", "3 (Gelb)", "2 (Rot)", np.nan],
                      "Antriebsart": ['Heck', 'Front', 'Allrad', np.nan],
                      "Scheckheftgepflegt": ["Ja", "Nein", np.nan]}
    df_step2 = clean_from_rows_with_shifted_columns(processing_data)

    # Drop unnamed columns
    df_step3_1 = drop_unnamed_columns(df_step2)

    # Drop unwanted columns
    columns_to_drop = ["€ 0,-", "Pkw Zulassung", "Ladevolumen", "zul. Zuggewicht", "Achsen", "Nutzlast", "Radstand", "Laderaumhöhe",
                       "Laderaumbreite", "Laderaumlänge", "zul. Gesamtgewicht", "Sonderzahlung", "Verfügbarkeit", "Batteriebesitz",
                       "Letzter Zahnriemenwechsel", "Verfügbar ab", "Fahrzeugstand", "Energieeffizienzklasse", "CO₂-Effizienz",
                       "Bearbeitungsgebühren", "Bruttodarlehensbetrag", "Sollzins geb. p.a.", "Letzte Inspektion", "Baujahr",
                       "CO₂-Klasse", "Schlüsselnummer", "Gänge", "Angebotsnummer", "Farbe laut Hersteller",
                       "Farbe der Innenausstattung", "Andere Energieträger", "Fahrzeugzustand",
                       "Ladezeit von 10% bis 80%", "Fahrleistung p.a.", "Zulassungskosten", "HU"]
    df_step3_2 = drop_unwanted_columns(df_step3_1, columns_to_drop)

    # Any string owner value is mapped to 1, and missing values default to 1.
    df_step3_2["Elektrische Reichweite7"] = 0
    df_step3_2["Fahrzeughalter"] = df_step3_2["Fahrzeughalter"].apply(lambda x: 1 if isinstance(x, str) else x)
    df_step3_2["Fahrzeughalter"] = df_step3_2["Fahrzeughalter"].fillna(1)

    # Keep wanted columns
    wanted_columns = [
        'Barzahlungspreis', 'brand', 'model', 'Kilometerstand', 'Leistung', 'Kraftstoff', 'Antriebsart', 'Karosserieform',
        'Fahrzeugart', 'Sitzplätze', 'Türen', 'Fahrzeughalter', 'Getriebe', 'Hubraum', 'Zylinder', 'Kraftstoffverbrauch',
        'Schadstoffklasse', 'Umweltplakette', 'Komfort', 'Extras', 'Außenfarbe', 'Innenausstattung', 'url',
        'Nichtraucherfahrzeug', 'Leergewicht', 'Unterhaltung/Media', 'Sicherheit', 'CO₂-Emissionen', 'Lackierung',
        'Scheckheftgepflegt', 'Garantie', 'Taxi oder Mietwagen', 'Stromverbrauch', 'Elektrische Reichweite', 
        "Elektrische Reichweite7", 'date', 'Erstzulassung', 'time']
    df_step3_2 = df_step3_2[wanted_columns]

    # Merge columns
    df_step3_3_1 = merge_columns(df_step3_2, "Elektrische Reichweite", "Elektrische Reichweite7")

    # Convert strings to floats/integers
    replacement_dict_garantie = {"Nein": "0", "Ja": "12"}
    df_step4_1_1 = replace_values_single(df_step3_3_1, "Garantie", replacement_dict_garantie)

    replacement_dict_brand = {"porsche": "Porsche", "audi": "Audi", "opel": "Opel", "skoda": "Skoda", "toyota": "Toyota"}
    df_step4_1_1 = replace_values_single(df_step4_1_1, "brand", replacement_dict_brand)

    replacement_dict = {"Stromverbrauch": {",": "."}, "Leistung": {",": "."}, "Hubraum": {",": "."},
                        "Leergewicht": {",": "."}, "CO₂-Emissionen": {",": "."}, "Kilometerstand": {".": ""}}
    df_step4_1_2 = replace_values_multiple(df_step4_1_1, replacement_dict)

    column_names = ["brand", "model"]
    df_step4_1_3 = remove_trailing_whitespace(df_step4_1_2, column_names)

    df_step4_1_4 = transform_fuel_types(df_step4_1_3, "Kraftstoff")

    split_dict_simple = {"Garantie": " ", "Stromverbrauch": " ", "Leistung": " ", "Hubraum": " ", "Leergewicht": " "}
    df_step4_2_1 = split_string_to_1integer(df_step4_1_4, split_dict_simple)

    # NOTE(review): astype(str) turns NaN into the string "nan", so the
    # fillna() calls below never replace anything. Left unchanged because
    # swapping the order would alter what the downstream helpers receive —
    # confirm the intended defaults before fixing.
    df_step4_2_1["CO₂-Emissionen"] = df_step4_2_1["CO₂-Emissionen"].astype(str)
    df_step4_2_1["CO₂-Emissionen"] = df_step4_2_1["CO₂-Emissionen"].fillna("20")
    
    df_step4_2_1["Kilometerstand"] = df_step4_2_1["Kilometerstand"].astype(str)
    df_step4_2_1["Kilometerstand"] = df_step4_2_1["Kilometerstand"].fillna("2000km")
    
    df_step4_2_1["Elektrische Reichweite"] = df_step4_2_1["Elektrische Reichweite"].astype(str)
    df_step4_2_1["Elektrische Reichweite"] = df_step4_2_1["Elektrische Reichweite"].fillna("0km")
    
    split_dict_complex = {"CO₂-Emissionen": " ", "Kilometerstand": " ", "Elektrische Reichweite": " "}
    df_step4_2_2 = split_string_to_1integer_complex(df_step4_2_1, split_dict_complex)

    df_step4_3_1 = convert_fuel_consumption(df_step4_2_2, "Kraftstoffverbrauch")
    df_step4_3_2 = convert_barzahlungspreis_to_float(df_step4_3_1, "Barzahlungspreis")

    # Fill missing values: fixed defaults first, then group-based statistics.
    replacement_dict_default = {"Scheckheftgepflegt": ["Nein"], "Nichtraucherfahrzeug": ["Nein"], "Garantie": [0],
                                "Taxi oder Mietwagen": ["Nein"], "Stromverbrauch": [0], "Elektrische Reichweite": [0],
                                "Lackierung": ["Keine Angabe"], "Außenfarbe": ["keine"]}
    df_step5_1 = missing_values_to_default(df_step4_3_2, replacement_dict_default)

    df_step5_2_1 = get_median_value_based_on_erstzulassung(df_step5_1, "Fahrzeughalter", "Erstzulassung")

    replacement_columns_2groups = ["Antriebsart", "Sitzplätze", "Türen"]
    df_step5_2_2 = fill_missing_with_mode_by_2groups(df_step5_2_1, replacement_columns_2groups, "brand", "model")

    replacement_columns_3groups = ["Schadstoffklasse", "Umweltplakette", "Getriebe", "Hubraum",
                                   "CO₂-Emissionen", "Zylinder", "Leergewicht", "Kraftstoff"]
    df_step5_2_3 = fill_missing_with_mode_by_3groups(df_step5_2_2, replacement_columns_3groups, "brand", "model", "Leistung")

    # Split multi-feature columns
    features_to_columns = ["Sicherheit", "Innenausstattung", "Unterhaltung/Media", "Extras", "Komfort"]
    df_step6 = create_feature_columns(df_step5_2_3, features_to_columns)

    columns_to_drop_features = ["Sicherheit", "Innenausstattung", "Unterhaltung/Media", "Extras", "Komfort"]
    df_step6 = drop_unwanted_columns(df_step6, columns_to_drop_features)

    # Date columns
    df_step7_1_1 = convert_to_timestamp(df_step6, "date", "time")
    df_step7_1_2 = drop_unwanted_columns(df_step7_1_1, ["time", "date"])
    df_step7_2_1 = split_month_year(df_step7_1_2, "Erstzulassung")
    df_step7_2_2 = convert_zulassung_to_age(df_step7_2_1, "Erstzulassung_Jahr", "Erstzulassung_Monat")
    df_step7_2_3 = drop_unwanted_columns(df_step7_2_2, ["Erstzulassung", "Erstzulassung_Monat"])

    # Rename columns
    rename_dict = {"Barzahlungspreis": "Preis", "Leistung": "Leistung_PS", "CO₂-Emissionen": "CO2-Emissionen",
                   "model": "Modell", "brand": "Hersteller", "url": "URL"}
    df_step10_1 = df_step7_2_3.rename(columns=rename_dict)

    translation_dict = {
        'Preis': 'Price', 'Hersteller': 'Manufacturer', 'Modell': 'Model', 'Kilometerstand': 'Mileage',
        'Leistung_PS': 'Power_HP', 'Kraftstoff': 'Fuel', 'Antriebsart': 'Drive_type', 'Karosserieform': 'Body_type',
        'Fahrzeugart': 'Vehicle_type', 'Sitzplätze': 'Seats', 'Türen': 'Doors', 'Fahrzeughalter': 'Owners',
        'Getriebe': 'Transmission', 'Hubraum': 'Displacement', 'Zylinder': 'Cylinders', 'Kraftstoffverbrauch': 'Fuel_consumption',
        'Schadstoffklasse': 'Emission_class', 'Umweltplakette': 'Environmental_sticker', 'Komfort': 'Comfort', 'Extras': 'Extras',
        'Außenfarbe': 'Exterior_color', 'Innenausstattung': 'Interior_features', 'URL': 'URL',
        'Nichtraucherfahrzeug': 'Non_smoker_vehicle', 'Leergewicht': 'Curb_weight', 'Unterhaltung/Media': 'Entertainment/Media',
        'Sicherheit': 'Safety', 'CO2-Emissionen': 'CO2_emissions', 'Lackierung': 'Paint', 'Scheckheftgepflegt': 'Full_service_history',
        'Garantie': 'Warranty', 'Taxi oder Mietwagen': 'Taxi_or_rental', 'Stromverbrauch': 'Electricity_consumption',
        'Elektrische Reichweite': 'Electric_range', 'date_scraped': 'Date_scraped', 'Alter': 'Age'}
    df_step10_2 = df_step10_1.rename(columns=translation_dict)

    # Rearrange columns
    first_columns = ["Price", "Manufacturer", "Model", "Mileage", "Power_HP", "Fuel", "Drive_type"]
    df_step11 = rearrange_columns(df_step10_2, first_columns)

    # Reduce data back to wanted observations
    cleaned_new_observations = df_step11[df_step11["URL"].isin(urls)]

    # Fill rest of NaNs
    cleaned_new_observations = cleaned_new_observations.fillna(0)

    # Use helper data to get the right shape: the helper frame goes first so
    # its columns define the layout, then only the requested rows are kept.
    most_recent_file = get_most_recent_file(helper_data_path_cleaned, 1)
    helper_data_cleaned = pd.read_csv(most_recent_file, low_memory=False)
    concatenated_df_cleaned = concat_with_first_columns([helper_data_cleaned, cleaned_new_observations])
    concatenated_df_cleaned = concatenated_df_cleaned[concatenated_df_cleaned["URL"].isin(urls)]

    # Dummied helper data
    columns_to_integers = ["Seats", "Power_HP", "Displacement", "Cylinders", "Warranty"]
    columns_to_date = ["Date_scraped"]
    columns_to_floats = []
    columns_to_categorical = []
    columns_to_drop = []

    data_type_transformed = data_type_transformer(concatenated_df_cleaned, columns_to_integers, columns_to_date,
                                                  columns_to_floats, columns_to_categorical, columns_to_drop)

    # Dummy-encode every remaining object column except the identifiers.
    columns_to_dummy = list(data_type_transformed.select_dtypes(include=['object']).columns)
    columns_to_exclude = ["URL", "Date_scraped"]

    for curr_column in columns_to_exclude:
        if curr_column in columns_to_dummy:
            columns_to_dummy.remove(curr_column)

    data_dummy_transformed = pd.get_dummies(data_type_transformed, columns=columns_to_dummy,
                                            drop_first=False, dtype=float)

    # Same layout trick as above, this time against the dummy-encoded helper file.
    most_recent_file_dummied = get_most_recent_file(helper_data_path_dummied, 1)
    helper_data_dummied = pd.read_csv(most_recent_file_dummied, low_memory=False)
    concatenated_df_dummied = concat_with_first_columns([helper_data_dummied, data_dummy_transformed])
    concatenated_df_dummied = concatenated_df_dummied[concatenated_df_dummied["URL"].isin(urls)]

    # Dummy columns that the new rows do not have are filled with 0.
    dummied_new_observations = concatenated_df_dummied.fillna(0)

    # Save depreciation data
    dummied_new_observations.to_csv(output_path, index=False)
    
    dummied_new_observations.reset_index(drop = True, inplace = True)

    return dummied_new_observations


# Calculate Depreciation

In [None]:
def _decode_one_hot(frame, prefix):
    """Return, per row, the label of the first ``prefix`` dummy column equal to 1 ('' if none)."""
    dummy_columns = [col for col in frame.columns if isinstance(col, str) and col.startswith(prefix)]
    labels = []
    for _, row in frame[dummy_columns].iterrows():
        active_column = next((col for col in dummy_columns if row[col] == 1), "")
        # Strip the prefix so that e.g. "Manufacturer_Audi" becomes "Audi".
        labels.append(active_column[len(prefix):])
    return labels


def calculate_depreciation(depreciation_data, model, planned_km_per_year, planned_years):
    """Estimate how much each car loses in value over the planned usage.

    For every row the model predicts the price three times: unchanged, with
    ``Mileage`` increased by the planned total kilometres, and with ``Age``
    increased by the planned years. The two price differences are reported
    separately and summed into the total depreciation, which is then added to
    the listed ``Price``.

    Args:
        depreciation_data: Dummy-encoded car rows. Must contain ``Price``,
            ``URL``, ``Date_scraped``, ``Mileage``, ``Age``, ``Power_HP`` and
            ``Erstzulassung_Jahr``; ``Manufacturer_*`` / ``Model_*`` dummy
            columns are decoded back into labels. The frame is not modified.
        model: Object with a ``predict(features)`` method returning one
            price per row.
        planned_km_per_year: Kilometres the car will be driven per year.
        planned_years: Number of years the car will be kept.

    Returns:
        DataFrame with one row per input row (index 0..n-1) and the columns
        Manufacturer, Model, Mileage, Power, Initial Approval, Price, Mileage
        Depreciation, Age Depreciation, Total Depreciation and Resulting Car
        Value. Monetary, mileage and power values are formatted strings.
    """
    planned_km_total = planned_km_per_year * planned_years

    ### Make Predictions for Original Data ###
    print("Making Price Predictions for Original Data...")

    # Work on a positionally indexed copy so the per-row results line up with
    # their cars even when the caller's frame has a non-default index.
    cars = depreciation_data.reset_index(drop=True)

    # Columns that describe the offer but are not model features.
    non_feature_columns = ["Price", "URL", "Date_scraped"]

    def predict_price(car):
        return model.predict(car.drop(columns=non_feature_columns))[0]

    original_prices = []
    mileage_price_diffs = []
    age_price_diffs = []

    # One-row slices keep the DataFrame shape that model.predict receives.
    for position in range(len(cars)):
        car_option = cars.iloc[position:position + 1]
        original_price = predict_price(car_option)
        original_prices.append(original_price)

        # Same car after driving the planned distance
        car_option_mileage = car_option.copy()
        car_option_mileage["Mileage"] += planned_km_total
        mileage_price_diffs.append(predict_price(car_option_mileage) - original_price)

        # Same car after ageing the planned number of years
        car_option_age = car_option.copy()
        car_option_age["Age"] += planned_years
        age_price_diffs.append(predict_price(car_option_age) - original_price)

    results = pd.DataFrame({
        "Car Option": [f"Option {position + 1}" for position in range(len(cars))],
        "Original Predicted Price": original_prices,
        "Mileage Depreciation": mileage_price_diffs,
        "Age Depreciation": age_price_diffs,
    }, index=cars.index)

    results_df = pd.concat([cars, results], axis=1)
    results_df["Total Depreciation"] = results_df["Mileage Depreciation"] + results_df["Age Depreciation"]
    # The differences are added as-is: a predicted loss is a negative number.
    results_df["Resulting Car Value"] = results_df["Price"] + results_df["Total Depreciation"]

    results_df = results_df.rename(columns={"Erstzulassung_Jahr": "Initial Approval", "Power_HP": "Power"})
    result_df_rounded = results_df.round(2)

    price_columns = ["Price", "Mileage Depreciation", "Age Depreciation", "Resulting Car Value", "Original Predicted Price", "Total Depreciation"]

    # Adding the € sign to the specified columns
    for col in price_columns:
        result_df_rounded[col] = result_df_rounded[col].apply(lambda x: f"{x:.2f}€")

    result_df_rounded["Mileage"] = result_df_rounded["Mileage"].apply(lambda x: f"{x:.2f}km")
    result_df_rounded["Power"] = result_df_rounded["Power"].apply(lambda x: f"{x:.2f}HP")

    # Turn the one-hot manufacturer/model columns back into readable labels.
    result_df_rounded['Manufacturer'] = _decode_one_hot(result_df_rounded, 'Manufacturer_')
    result_df_rounded['Model'] = _decode_one_hot(result_df_rounded, 'Model_')

    result_reduced = result_df_rounded[["Manufacturer", "Model", "Mileage", "Power", "Initial Approval", "Price", "Mileage Depreciation", "Age Depreciation", "Total Depreciation", "Resulting Car Value"]]

    return result_reduced


# Calculate Depreciation in One Step

In [None]:
def get_depreciation_in_one(urls, scraped_data_path, helper_data_path_cleaned, helper_data_path_dummied, output_path, model, planned_km_per_year, planned_years):
    """Run preprocessing and the depreciation estimate in a single call.

    The first five arguments are forwarded to ``process_car_data``; its
    result, together with ``model``, ``planned_km_per_year`` and
    ``planned_years``, is forwarded to ``calculate_depreciation``.

    Returns:
        Whatever ``calculate_depreciation`` returns for the processed rows.
    """
    # Step 1: scrape and preprocess the offers behind the URLs.
    prepared_observations = process_car_data(
        urls,
        scraped_data_path,
        helper_data_path_cleaned,
        helper_data_path_dummied,
        output_path,
    )

    # Step 2: estimate the depreciation for the prepared rows.
    return calculate_depreciation(
        prepared_observations,
        model,
        planned_km_per_year,
        planned_years,
    )