# Get Best Offers

## Make Predictions on a Subselection

In [4]:
def predict_subselection_xgb(df, manufacturer, car_model, max_mileage, initial_approval, n_days, model):
    """Predict prices for the recently scraped listings of one car model.

    Rows of ``df`` are kept when the one-hot columns
    ``Manufacturer_<manufacturer>`` and ``Model_<car_model>`` both equal 1,
    ``Erstzulassung_Jahr`` is at least ``initial_approval``, ``Mileage`` is at
    most ``max_mileage`` and ``Date_scraped`` lies within the last ``n_days``
    days (measured from ``datetime.now()``).

    Args:
        df: Listings DataFrame containing the columns named above plus
            ``Price`` and ``URL``. It is not modified.
        manufacturer: Suffix of the manufacturer one-hot column.
        car_model: Suffix of the model one-hot column.
        max_mileage: Upper bound (inclusive) for ``Mileage``.
        initial_approval: Lower bound (inclusive) for ``Erstzulassung_Jahr``.
        n_days: Size of the look-back window in days.
        model: Object exposing ``predict``; it receives the selected rows
            without the ``Price``, ``URL`` and ``Date_scraped`` columns.

    Returns:
        A DataFrame sorted by ``"Price Diff"`` (predicted minus listed price)
        in descending order, with ``Price``, ``Predicted Price``,
        ``Price Diff``, ``URL``, ``Manufacturer`` and ``Model`` appended, or
        the string ``"no data"`` when no row survives the filters.
    """
    manufacturer_column = "Manufacturer_" + str(manufacturer)
    model_column = "Model_" + str(car_model)

    ### 1) subselect the specified car model as well as max mileage and initial approval ###
    matches = (
        (df[manufacturer_column] == 1)
        & (df[model_column] == 1)
        & (df["Erstzulassung_Jahr"] >= initial_approval)
        & (df["Mileage"] <= max_mileage)
    )
    # Work on a copy so the caller's frame is never touched.
    candidates = df[matches].copy()

    ### 2) convert Date_scraped to timestamps ###
    # Plain column assignment replaces the column and therefore its dtype;
    # `.loc[:, col] = ...` writes into the existing column in place.
    candidates["Date_scraped"] = pd.to_datetime(candidates["Date_scraped"])

    ### 3) only respect data scraped in the last n days ###
    date_n_days_ago = datetime.now() - timedelta(days=n_days)
    recent = candidates[candidates["Date_scraped"] >= date_n_days_ago]

    # Guard on the final selection: it is empty both when nothing matched the
    # car filters and when every match is older than the look-back window.
    if recent.empty:
        return "no data"

    ### 4) Make Predictions ###
    print("Making Price Predictions...")

    # The model must not see the target, the URL or the scrape date.
    features = recent.drop(columns=["Price", "URL", "Date_scraped"])
    predicted_y = model.predict(features)

    # put the frame back together and calculate a price diff
    merged_df = features.copy()
    merged_df["Price"] = recent["Price"]
    merged_df["Predicted Price"] = predicted_y
    merged_df["Price Diff"] = merged_df["Predicted Price"] - merged_df["Price"]
    merged_df["URL"] = recent["URL"]
    merged_df["Manufacturer"] = manufacturer
    merged_df["Model"] = car_model

    # good offers (Predicted Price >> Price) come first
    sorted_prediction_df = merged_df.sort_values(["Price Diff"], ascending=False)

    print(f"Predictions done! Found {len(sorted_prediction_df)} observations for {manufacturer} {car_model} and max mileage of {max_mileage}km and Initial Approval >= {initial_approval}!")

    return sorted_prediction_df

## Check Best Offers on Availability and Create Result DataFrame

In [7]:
def get_best_offers_efficiently_xgb(df, n_best_offers, area_input, change_vpn, sleep_time):
    """Collect the first ``n_best_offers`` rows whose listing page is still reachable.

    ``df`` is walked from top to bottom. For every row the ``URL`` is
    requested; the row is kept when the response has status 200 and an image
    URL can be derived from the first tag inside the
    ``<picture class="ImageWithBadge_picture__XJG24">`` element. The walk
    stops once ``n_best_offers`` rows were kept or the frame is exhausted.

    Args:
        df: DataFrame with at least the columns ``URL``, ``Price``,
            ``Predicted Price`` and ``Price Diff``. It is not modified.
        n_best_offers: Maximum number of rows to return.
        area_input: Passed to ``initialize_VPN`` when ``change_vpn`` is truthy.
        change_vpn: When truthy, the VPN is initialised and rotated before
            any page is requested.
        sleep_time: Seconds to pause after every request attempt.

    Returns:
        A DataFrame of the kept rows with an added ``Image URL`` column, the
        price columns renamed to ``Fair Price`` / ``Offer Price`` /
        ``Savings`` and rendered as strings such as ``"1234.50€"``. An empty
        DataFrame is returned when no row could be kept.
    """
    if change_vpn:
        instructions_vpn = initialize_VPN(area_input=area_input)
        rotate_VPN(instructions_vpn)

    print("Checking Availability and Creating Result DataFrame!")

    # Kept rows are gathered in a list and concatenated once at the end.
    available_entries = []

    # Iterating by position keeps the walk inside the frame; running past the
    # last row would yield an empty slice with no URL to read.
    for position in range(len(df)):
        if len(available_entries) >= n_best_offers:
            break

        curr_entry = df.iloc[position:position + 1].copy()
        curr_url = curr_entry["URL"].iloc[0]

        try:
            response = requests.get(curr_url, timeout=10)
        except requests.RequestException:
            # unreachable page: treat the offer as unavailable
            response = None

        if response is not None and response.status_code == 200:
            try:
                doc = BeautifulSoup(response.text, "html.parser")
                img_tag = doc.find('picture', class_='ImageWithBadge_picture__XJG24').find()
                img_url = "/".join(str(img_tag).split()[3].split("=")[1][1:-1].split("/")[:-1])
            except Exception:
                # Deliberate best effort: a page without the expected picture
                # markup is skipped instead of aborting the whole run.
                pass
            else:
                curr_entry["Image URL"] = img_url
                available_entries.append(curr_entry)

        # let the scraper sleep after every attempt to remain undetected
        time.sleep(sleep_time)

    if not available_entries:
        print("Found no available Offers! [2nd function]")
        # Nothing to format; the price columns do not exist on an empty result.
        return pd.DataFrame()

    result_df = pd.concat(available_entries)
    print(f"Best Offers DataFrame created with {len(result_df)} Offers!")

    ## Formatting results
    result_df = result_df.rename(
        columns={"Predicted Price": "Fair Price", "Price": "Offer Price", "Price Diff": "Savings"}
    )
    result_df_rounded = result_df.round(2)

    # add the € sign to the price columns
    for col in ("Fair Price", "Offer Price", "Savings"):
        result_df_rounded[col] = result_df_rounded[col].apply(lambda x: f"{x:.2f}€")

    return result_df_rounded

## All in One Function

In [None]:
def get_best_offers_in_one_efficiently_xgb(df, manufacturer, car_model, max_mileage, initial_approval, n_days, area_input,
                                           sleep_time, change_vpn, model, n_best_offers):
    """Run the prediction step and the availability check back to back.

    First calls ``predict_subselection_xgb`` with the filter arguments and
    ``model``. When that call yields a DataFrame, it is handed to
    ``get_best_offers_efficiently_xgb`` together with ``n_best_offers``,
    ``area_input``, ``change_vpn`` and ``sleep_time``.

    Returns:
        Whatever ``get_best_offers_efficiently_xgb`` returns, or ``None``
        when the prediction step did not return a DataFrame.
    """
    predictions = predict_subselection_xgb(
        df, manufacturer, car_model, max_mileage, initial_approval, n_days, model
    )

    # blank line separating the output of the two steps
    print("")

    # happy path: a DataFrame came back, so check which offers are still online
    if isinstance(predictions, pd.DataFrame):
        return get_best_offers_efficiently_xgb(
            predictions, n_best_offers, area_input, change_vpn, sleep_time
        )

    # anything else (the prediction step's sentinel) means there is nothing to check
    print("No Data found [1st Function]!")
    return None



# Improved Solution

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
import time

def predict_subselection_xgb(df, manufacturer, car_model, max_mileage, initial_approval, n_days, model):
    """Predict prices for the recently scraped listings of one car model.

    Rows of ``df`` are kept when the one-hot columns
    ``Manufacturer_<manufacturer>`` and ``Model_<car_model>`` both equal 1,
    ``Erstzulassung_Jahr`` is at least ``initial_approval``, ``Mileage`` is at
    most ``max_mileage`` and ``Date_scraped`` lies within the last ``n_days``
    days (measured from ``datetime.now()``).

    Args:
        df: Listings DataFrame containing the columns named above plus
            ``Price`` and ``URL``. It is not modified.
        manufacturer: Suffix of the manufacturer one-hot column.
        car_model: Suffix of the model one-hot column.
        max_mileage: Upper bound (inclusive) for ``Mileage``.
        initial_approval: Lower bound (inclusive) for ``Erstzulassung_Jahr``.
        n_days: Size of the look-back window in days.
        model: Object exposing ``predict``; it receives the selected rows
            without the ``Price``, ``URL`` and ``Date_scraped`` columns.

    Returns:
        A DataFrame sorted by ``"Price Diff"`` (predicted minus listed price)
        in descending order, with ``Price``, ``Predicted Price``,
        ``Price Diff``, ``URL``, ``Manufacturer`` and ``Model`` appended. An
        empty DataFrame is returned when no row survives the filters.
    """
    manufacturer_column = "Manufacturer_" + str(manufacturer)
    model_column = "Model_" + str(car_model)

    ### 1) subselect the specified car model as well as max mileage and initial approval ###
    matches = (
        (df[manufacturer_column] == 1)
        & (df[model_column] == 1)
        & (df["Erstzulassung_Jahr"] >= initial_approval)
        & (df["Mileage"] <= max_mileage)
    )
    # Work on a copy so the caller's frame is never touched.
    candidates = df[matches].copy()

    if candidates.empty:
        print("No data found after subselecting car model.")
        return pd.DataFrame()

    ### 2) convert Date_scraped to timestamps ###
    # Plain column assignment replaces the column and therefore its dtype;
    # `.loc[:, col] = ...` writes into the existing column in place.
    candidates["Date_scraped"] = pd.to_datetime(candidates["Date_scraped"])

    ### 3) only respect data scraped in the last n days ###
    date_n_days_ago = datetime.now() - timedelta(days=n_days)
    recent = candidates[candidates["Date_scraped"] >= date_n_days_ago]

    if recent.empty:
        print("No data found in the last n days.")
        return pd.DataFrame()

    ### 4) Make Predictions ###
    print("Making Price Predictions...")

    # The model must not see the target, the URL or the scrape date.
    features = recent.drop(columns=["Price", "URL", "Date_scraped"])
    predicted_y = model.predict(features)

    # put the frame back together and calculate a price diff
    merged_df = features.copy()
    merged_df["Price"] = recent["Price"]
    merged_df["Predicted Price"] = predicted_y
    merged_df["Price Diff"] = merged_df["Predicted Price"] - merged_df["Price"]
    merged_df["URL"] = recent["URL"]
    merged_df["Manufacturer"] = manufacturer
    merged_df["Model"] = car_model

    # good offers (Predicted Price >> Price) come first
    sorted_prediction_df = merged_df.sort_values(["Price Diff"], ascending=False)

    print(f"Predictions done! Found {len(sorted_prediction_df)} observations for {manufacturer} {car_model} and max mileage of {max_mileage}km and Initial Approval >= {initial_approval}!")

    return sorted_prediction_df

def get_best_offers_efficiently_xgb(df, n_best_offers, sleep_time):
    """Collect the first ``n_best_offers`` rows whose listing page is still reachable.

    ``df`` is walked from top to bottom. For every row the ``URL`` is
    requested; the row is kept when the response has status 200 and an image
    URL can be derived from the first tag inside the
    ``<picture class="ImageWithBadge_picture__XJG24">`` element. The walk
    stops once ``n_best_offers`` rows were kept or the frame is exhausted.

    Args:
        df: DataFrame with at least the columns ``URL``, ``Price``,
            ``Predicted Price`` and ``Price Diff``. It is not modified.
        n_best_offers: Maximum number of rows to return.
        sleep_time: Seconds to pause after every request attempt.

    Returns:
        A DataFrame of the kept rows with an added ``Image URL`` column, the
        price columns renamed to ``Fair Price`` / ``Offer Price`` /
        ``Savings`` and rendered as strings such as ``"1234.50€"``. An empty
        DataFrame is returned when ``df`` is empty or no row could be kept.
    """
    if df.empty:
        print("Input dataframe is empty. No offers to check.")
        return pd.DataFrame()

    print("Checking Availability and Creating Result DataFrame!")

    # Kept rows are gathered in a list and concatenated once at the end.
    available_entries = []

    # Iterating by position keeps the walk inside the frame, so no guard
    # against an empty trailing slice is needed.
    for position in range(len(df)):
        if len(available_entries) >= n_best_offers:
            break

        curr_entry = df.iloc[position:position + 1].copy()
        curr_url = curr_entry["URL"].iloc[0]

        try:
            response = requests.get(curr_url, timeout=10)
        except requests.RequestException:
            # unreachable page: treat the offer as unavailable
            response = None

        if response is not None and response.status_code == 200:
            try:
                doc = BeautifulSoup(response.text, "html.parser")
                img_tag = doc.find('picture', class_='ImageWithBadge_picture__XJG24').find()
                img_url = "/".join(str(img_tag).split()[3].split("=")[1][1:-1].split("/")[:-1])
            except Exception:
                # Deliberate best effort: a page without the expected picture
                # markup is skipped instead of aborting the whole run.
                pass
            else:
                curr_entry["Image URL"] = img_url
                available_entries.append(curr_entry)

        # let the scraper sleep after every attempt to remain undetected
        time.sleep(sleep_time)

    if not available_entries:
        print("Found no available Offers! [2nd function]")
        # Nothing to format; the price columns do not exist on an empty result.
        return pd.DataFrame()

    result_df = pd.concat(available_entries)
    print(f"Best Offers DataFrame created with {len(result_df)} Offers!")

    ## Formatting results
    result_df = result_df.rename(
        columns={"Predicted Price": "Fair Price", "Price": "Offer Price", "Price Diff": "Savings"}
    )
    result_df_rounded = result_df.round(2)

    # add the € sign to the price columns
    for col in ("Fair Price", "Offer Price", "Savings"):
        result_df_rounded[col] = result_df_rounded[col].apply(lambda x: f"{x:.2f}€")

    return result_df_rounded

def get_best_offers_in_one_efficiently_xgb(df, manufacturer, car_model, max_mileage, initial_approval, n_days,
                                           sleep_time, model, n_best_offers):
    """Run the prediction step and the availability check back to back.

    Calls ``predict_subselection_xgb`` with the filter arguments and
    ``model``; a non-empty result is passed on to
    ``get_best_offers_efficiently_xgb`` with ``n_best_offers`` and
    ``sleep_time``.

    Returns:
        The frame produced by ``get_best_offers_efficiently_xgb`` with its
        index reset to 0..n-1, or an empty DataFrame when the prediction step
        produced no rows.
    """
    predictions = predict_subselection_xgb(
        df, manufacturer, car_model, max_mileage, initial_approval, n_days, model
    )

    if not predictions.empty:
        offers = get_best_offers_efficiently_xgb(predictions, n_best_offers, sleep_time)
        # discard the original row labels so the result is numbered from zero
        return offers.reset_index(drop=True)

    # the prediction step found nothing to rank
    print("No Data found [1st Function]!")
    return pd.DataFrame()
