# Adding predictions

## Load and validate data

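Add the project's root directory (the parent of the notebook's working directory) to `sys.path` so that the `src` package can be imported from this notebook.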

In [1]:
import sys
import os

# The project root is the parent of the current working directory;
# register it on the import path exactly once.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [None]:
import pandas as pd

# comment="#" makes pandas ignore everything after a "#", so a leading
# "# description" line in the CSV is skipped instead of being read as the header.
df = pd.read_csv(
    "../datasets/with_sale_prices/test_data_15_2608.csv",
    comment="#",
)

df.head()

Ensure that the data is in the correct format by validating every row against the `SaleRow` Pydantic model.

In [None]:
from src.models import SaleRow
from pydantic import ValidationError


def validate_df(df, max_reported=10):
    """
    Validates every row of the dataframe against the SaleRow Pydantic model.

    Each row is validated independently, so all invalid rows are detected
    (not only the first one) and every report names the offending index label.

    Arguments:
      df: DataFrame whose rows are passed to SaleRow as keyword arguments
      max_reported: maximum number of invalid rows printed in detail;
                    the remaining ones are only counted

    Returns:
      A list of SaleRow instances (one per row) when every row is valid
      (an empty list for an empty dataframe), otherwise None.
    """
    validated_rows = []
    invalid_count = 0

    for label, record in zip(df.index, df.to_dict(orient='records')):
        try:
            validated_rows.append(SaleRow(**record))
        except ValidationError as e:
            invalid_count += 1
            # Cap the detailed output so a systematically broken file does not flood the cell
            if invalid_count <= max_reported:
                print(f"Row {label} is invalid:\n{e}")

    if invalid_count:
        if invalid_count > max_reported:
            print(f"... and {invalid_count - max_reported} more invalid rows")
        return None

    return validated_rows

# usage
validated_rows = validate_df(df)
# validate_df returns None on failure; an empty list (dataframe without rows)
# is still a successful validation, so test for None instead of truthiness
if validated_rows is not None:
    print("All rows are valid")
else:
    print("Some rows are invalid")

## Add predictions

In [4]:
# Multiplier applied to the predicted price (predicted_price * sale_price_commision)
# to account for the sale commission: 0.87 keeps 87% of the prediction (13% commission),
# a 2% commission would be 0.98, and 1 means there is no commission.
sale_price_commision = 0.87

In [None]:
from src.ml.get_predicted_price_from_models import get_predicted_data, PredictedData
import pandas as pd
from IPython.display import clear_output


def apply_models_predictions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a commission-adjusted model prediction to every row that has 10 valid prices.

    For each row the prices price_1 ... price_10 are passed to `get_predicted_data`;
    the returned `mean_1` is multiplied by the module-level `sale_price_commision`,
    rounded to 3 decimals and stored in the 'predict_1_models_mean' column.

    Arguments:
      df: DataFrame with columns price_1 ... price_10. It is modified in place:
          the 'predict_1_models_mean' column is created (filled with 0.0) when
          missing and the predictions are written into it.

    Returns:
      A filtered DataFrame that keeps only the rows whose 'predict_1_models_mean'
      is not 0.0. The passed-in `df` itself keeps all of its rows.

    Note:
      According to the original documentation, the prediction is the average of
      the CatBoost and LSTM model predictions (computed inside `get_predicted_data`).
    """
    prediction_column = 'predict_1_models_mean'
    price_columns = [f"price_{i}" for i in range(1, 11)]

    if prediction_column not in df.columns:
        df[prediction_column] = 0.0  # 0.0 doubles as the "no prediction" marker

    # Count rows by position so the periodic output clean-up also works
    # when the index labels are not plain integers
    for position, (idx, row) in enumerate(df.iterrows()):
        if position % 200 == 0:
            clear_output(wait=True)

        prices = [row.get(column, None) for column in price_columns]

        # Check if all prices are present (skip if any NaN)
        if any(pd.isna(price) for price in prices):
            print(f"{idx} - failed to retrieve valid prices!")
            continue

        predicted_item_data: PredictedData = get_predicted_data(prices=prices)

        # Store the prediction only when it passes its own validation
        try:
            predicted_item_data.validate()
            df.at[idx, prediction_column] = round(predicted_item_data.mean_1 * sale_price_commision, 3)
            print(f"{idx} - successfully added prediction for\n {prices}\n --> {predicted_item_data.mean_1}")
        except ValueError as e:
            print(f"Validation failed for index {idx} with prices {prices}: {e}")

    # Remove rows where predictions remained 0.0 (indicating no valid predictions)
    df = df[df[prediction_column] != 0.0]

    return df


# Keep the returned frame: apply_models_predictions returns a filtered copy without
# the rows that received no prediction; discarding it would leave those rows in `df`
# and they would end up in the saved CSV.
df = apply_models_predictions(df)
df



## Save Data

In [9]:
# Destination path of the dataset with the added predictions
csv_name = "../datasets/ready_to_work/test_data_15_2608.csv"
# Free-text description that is written as a "# "-prefixed line above the CSV header
description = "test data of 15 prices and timestamps with a masked name with a sold prices and predicted_mean (catboost + lstm mean) (origin - not specified)\n"


def save_csv_with_a_description(df: pd.DataFrame, csv_name: str, description: str) -> None:
    """
    Writes the dataframe to a CSV file preceded by a commented description.

    Every line of the description is prefixed with "# " and terminated with a
    newline, so the header can be skipped with `pd.read_csv(..., comment="#")`
    and never runs into the CSV column header.

    Arguments:
      df: DataFrame to save (the index is not written)
      csv_name: path of the CSV file to create or overwrite
      description: free text placed at the top of the file; may span several
                   lines and may or may not end with a newline
    """
    # splitlines() drops the line endings, so each line gets exactly one "\n" back;
    # an empty description still produces a single "# " line
    description_lines = description.splitlines() or [""]
    header = "".join(f"# {line}\n" for line in description_lines)

    # Single pass: write the commented description first, then the data
    with open(csv_name, 'w', encoding='utf-8', newline='') as f:
        f.write(header)
        df.to_csv(f, index=False)


# Persist the dataframe together with its commented description header
save_csv_with_a_description(df=df, csv_name=csv_name, description=description)