In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import psycopg
from pgvector.psycopg import register_vector

# Database connection helper
def connect_db():
    """Open a psycopg connection to the ``shoes`` database.

    Connects as user ``postgres`` to ``localhost`` on port ``5430`` and then
    calls ``register_vector`` (from ``pgvector.psycopg``) on the connection.

    Returns:
        The open connection. The caller is responsible for closing it.

    Raises:
        Any exception raised by ``psycopg.connect`` or ``register_vector``.
        If ``register_vector`` fails, the freshly opened connection is closed
        before the exception propagates.
    """
    conn = psycopg.connect(
        dbname="shoes",
        user="postgres",
        host="localhost",
        port="5430"
    )
    try:
        register_vector(conn)
    except Exception:
        # Do not leak the open connection when registration fails
        conn.close()
        raise
    return conn

# Load the filtered rows from the database
def load_data():
    """Fetch embedding, brand, category and price from the ``shoes`` table.

    Row filter (applied in SQL):
      * ``in_stock = 'В наличии'``, ``price`` is not NULL and ``price > 0``;
      * ``price`` lies between the 0.001 and 0.999 quantiles of the prices of
        the rows passing the filter above (trims extreme outliers). Note that
        the lower bound is aliased ``p1`` in the query although it is the
        0.1th percentile, not the 1st.

    Returns:
        pandas.DataFrame with columns ``embedding``, ``brand``, ``category``
        and ``price``, one row per fetched record.

    Raises:
        Any exception raised by ``connect_db`` or by executing the query; the
        cursor and connection are closed in either case.
    """
    # Select only in-stock items with a positive price, without price outliers
    query = """
        WITH price_stats AS (
            SELECT 
                PERCENTILE_CONT(0.001) WITHIN GROUP (ORDER BY price) AS p1,
                PERCENTILE_CONT(0.999) WITHIN GROUP (ORDER BY price) AS p999
            FROM shoes
            WHERE in_stock = 'В наличии' AND price IS NOT NULL AND price > 0
        )
        SELECT 
            b.embedding, 
            b.brand, 
            b.category, 
            b.price
        FROM shoes b, price_stats ps
        WHERE b.in_stock = 'В наличии' 
        AND b.price IS NOT NULL 
        AND b.price > 0
        AND b.price BETWEEN ps.p1 AND ps.p999
    """

    conn = connect_db()
    try:
        # The cursor context manager closes the cursor even if the query fails
        with conn.cursor() as cur:
            cur.execute(query)
            data = cur.fetchall()
    finally:
        # Always release the connection, including on errors
        conn.close()

    # Convert the fetched tuples to a DataFrame
    df = pd.DataFrame(data, columns=['embedding', 'brand', 'category', 'price'])

    return df

# Feature preprocessing
def preprocess_data(df):
    """Turn the loaded frame into a feature matrix and a target vector.

    Features are laid out column-wise as
    ``[embedding components | brand label code | one-hot category]``.
    The embedding columns and the brand code are standardized with
    ``StandardScaler``; the one-hot columns are left as they are.

    Side effect: a ``brand_encoded`` column is added to (or overwritten in)
    the ``df`` that was passed in.

    NOTE(review): both encoders and the scaler are fitted on every row of
    ``df``. If ``df`` still contains rows that later become the test set,
    their statistics influence the fitted transformers.

    Args:
        df: DataFrame with ``embedding``, ``brand``, ``category`` and
            ``price`` columns.

    Returns:
        Tuple ``(X_scaled, y, scaler, brand_encoder, category_encoder)``:
        the feature matrix, the float target vector, and the three fitted
        transformers.
    """
    # Brand encoding (LabelEncoder is kept because there may be many brands)
    brand_encoder = LabelEncoder()
    df['brand_encoded'] = brand_encoder.fit_transform(df['brand'])

    # One-hot encoding of the category
    category_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    category_encoded = category_encoder.fit_transform(df[['category']])

    # Stack the per-row embeddings into one 2-D numpy array
    embeddings = np.array(df['embedding'].tolist())

    # Concatenate all feature groups
    X = np.hstack([
        embeddings,
        df[['brand_encoded']].values,
        category_encoded
    ])

    # Target variable. Cast to float so that non-float price objects (for
    # example Decimal values coming from the database driver) do not yield an
    # object-dtype array that breaks arithmetic against float predictions.
    y = df['price'].astype(float).to_numpy()

    # Scale only the leading columns: embedding components + brand code.
    # The trailing one-hot columns are not scaled.
    scaler = StandardScaler()
    n_scaled = embeddings.shape[1] + 1

    X_scaled = X.copy()
    X_scaled[:, :n_scaled] = scaler.fit_transform(X[:, :n_scaled])

    return X_scaled, y, scaler, brand_encoder, category_encoder

In [28]:
# Load the data from the database into a DataFrame
# (the bare `df` on the last line shows the frame as the cell output)
df = load_data()
df

Unnamed: 0,embedding,brand,category,price
0,"[0.03810836, 0.01570246, -0.0015478154, 0.0038...",Stels,Детский велосипед,9980.00
1,"[0.042307913, 0.010306882, -0.0020949498, -0.0...",Royal Baby,Детский велосипед,21400.00
2,"[0.03810836, 0.01570246, -0.0015478154, 0.0038...",Novatrack,Детский велосипед,12710.00
3,"[0.03810836, 0.01570246, -0.0015478154, 0.0038...",Stels,Детский велосипед,9790.00
4,"[0.04768627, -0.015802076, -0.0057999515, -0.0...",Stels,Складной велосипед,12850.00
...,...,...,...,...
948,"[0.0072933077, 0.06085926, -0.0033684813, -0.0...",KTM,Электровелосипед,467800.00
949,"[0.011151808, -0.003582016, -0.002112051, -0.0...",Sunpeed,Горный велосипед,75400.00
950,"[0.0072933077, 0.06085926, -0.0033684813, -0.0...",KTM,Электровелосипед,467800.00
951,"[0.025245624, 0.018310096, -0.0012464457, -0.0...",Electra,Детский велосипед,45000.00


In [29]:
# Build the feature matrix / target, then split into train and test sets
# (80/20, fixed random_state for a reproducible split).
# NOTE(review): preprocess_data fits the encoders and the scaler on the whole
# frame before this split, so test rows influence the fitted transformers.
# The trailing `df` displays the frame, which preprocess_data has extended
# in place with the brand_encoded column.
X, y, scaler, brand_encoder, category_encoder = preprocess_data(df)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
df

Unnamed: 0,embedding,brand,category,price,brand_encoded
0,"[0.03810836, 0.01570246, -0.0015478154, 0.0038...",Stels,Детский велосипед,9980.00,33
1,"[0.042307913, 0.010306882, -0.0020949498, -0.0...",Royal Baby,Детский велосипед,21400.00,26
2,"[0.03810836, 0.01570246, -0.0015478154, 0.0038...",Novatrack,Детский велосипед,12710.00,20
3,"[0.03810836, 0.01570246, -0.0015478154, 0.0038...",Stels,Детский велосипед,9790.00,33
4,"[0.04768627, -0.015802076, -0.0057999515, -0.0...",Stels,Складной велосипед,12850.00,33
...,...,...,...,...,...
948,"[0.0072933077, 0.06085926, -0.0033684813, -0.0...",KTM,Электровелосипед,467800.00,16
949,"[0.011151808, -0.003582016, -0.002112051, -0.0...",Sunpeed,Горный велосипед,75400.00,35
950,"[0.0072933077, 0.06085926, -0.0033684813, -0.0...",KTM,Электровелосипед,467800.00,16
951,"[0.025245624, 0.018310096, -0.0012464457, -0.0...",Electra,Детский велосипед,45000.00,4


In [30]:
# Candidate regressors, keyed by the label shown in the results table
models = {
    "Linear Regression": LinearRegression(),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "XGBoost": XGBRegressor(random_state=42, max_depth=10),
}

# Fit every model on the training split and score it on the test split
results = []
for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction chain
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Quality metrics on the held-out data
    r2, mae, rmse = (
        r2_score(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
        np.sqrt(mean_squared_error(y_test, y_pred)),
    )

    # One summary row per model
    results.append({'Model': name, 'R^2': r2, 'MAE': mae, 'RMSE': rmse})

# Summary table, best R^2 first
results_df = pd.DataFrame(results).sort_values('R^2', ascending=False)
print(results_df)

               Model       R^2           MAE          RMSE
2            XGBoost  0.840407  21429.585938  51861.205539
1                KNN  0.826832  24881.748691  54021.771138
0  Linear Regression  0.607304  43011.874278  81351.161765
