In [1]:
# --- Importuj niezbędne biblioteki ---
import pandas as pd
import boto3
import io
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import streamlit as st
from sklearn.preprocessing import StandardScaler
import numpy as np

# --- Configure access to Digital Ocean Spaces from secrets.toml ---
# Every connection setting is read from the "do_spaces" section of the
# Streamlit secrets.
do_spaces_secrets = st.secrets["do_spaces"]
aws_access_key_id = do_spaces_secrets["access_key"]
aws_secret_access_key = do_spaces_secrets["secret_key"]
region_name = do_spaces_secrets["region"]
endpoint_url = do_spaces_secrets["endpoint_url"]
bucket_name = do_spaces_secrets["bucket_name"]

# Object keys inside the bucket: the input dataset and the trained model.
file_key = 'used_cars_clean_v3.csv'
model_output_key = 'model_samochody.pkl'

# The S3-compatible client is pointed at the Spaces endpoint from the secrets.
session = boto3.session.Session(
    region_name=region_name,
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
)
s3 = session.client(service_name='s3', endpoint_url=endpoint_url)

# --- Load the data from Digital Ocean Spaces ---
# Download the CSV object, decode it as UTF-8 text and parse it with pandas.
response = s3.get_object(Key=file_key, Bucket=bucket_name)
raw_csv_bytes = response['Body'].read()
csv_data = raw_csv_bytes.decode('utf-8')
csv_buffer = io.StringIO(csv_data)
df = pd.read_csv(csv_buffer)
print("Kolumny w df po wczytaniu:", df.columns.tolist())

# Remove the rows for cars whose gearbox is "Transmission w/Dual Shift Mode"
keep_rows = df['transmission'] != "Transmission w/Dual Shift Mode"
df = df[keep_rows]

# Data processing: discard rows that miss any of the required attributes
required_columns = [
    'brand', 'model', 'model_year', 'milage', 'fuel_type',
    'transmission', 'ext_col', 'accident', 'clean_title',
]
df.dropna(subset=required_columns, inplace=True)

# Strip the " mi." suffix and the thousands separators, then parse as float
milage_text = df['milage'].astype(str)
milage_text = milage_text.str.replace(' mi.', '', regex=False)
milage_text = milage_text.str.replace(',', '', regex=False)
df['milage'] = milage_text.astype(float)

# Strip the "$" sign and the thousands separators, then parse as float
price_text = df['price'].astype(str)
price_text = price_text.str.replace('$', '', regex=False)
price_text = price_text.str.replace(',', '', regex=False)
df['price'] = price_text.astype(float)

# Values that became NaN during the conversion are dropped as well
df.dropna(subset=['milage', 'price', 'model_year'], inplace=True)

# Car-age features, measured against a fixed reference year
current_year = 2025
car_age = current_year - df['model_year'].astype(int)
df['wiek'] = car_age
df['wiek_kwadrat'] = car_age ** 2           # squared age
df['wiek_przebieg'] = car_age * df['milage']  # age x mileage interaction
# The raw year is no longer needed once the age columns exist
df.drop(columns=['model_year'], inplace=True)

# Group rare models: any model seen fewer than `rare_models_threshold` times
# is relabelled as 'Other'
model_counts = df['model'].value_counts()
rare_models_threshold = 10
is_rare_count = model_counts < rare_models_threshold
rare_models = model_counts[is_rare_count].index
df['model'] = df['model'].mask(df['model'].isin(rare_models), 'Other')

# Remove the 'int_col' column (skipped silently when it is not present)
df.drop(columns=['int_col'], errors='ignore', inplace=True)

def categorize_transmission(transmission_str: object) -> "str | None":
    """Classify a raw transmission description as 'Automat' or 'Manual'.

    Args:
        transmission_str: Raw value taken from the 'transmission' column.
            Normally a string, but it may be missing (None / NaN).

    Returns:
        'Automat' when the text contains 'A/T', 'CVT' or 'overdrive';
        'Manual' when it contains 'M/T' and none of the automatic markers;
        None for missing values, non-string values and every other gearbox
        description. Matching is case-sensitive.
    """
    # Anything that is not text (None, NaN, a stray number) cannot be matched.
    # Returning None here avoids a TypeError from the substring checks below.
    if not isinstance(transmission_str, str):
        return None
    automatic_markers = ('A/T', 'CVT', 'overdrive')
    if any(marker in transmission_str for marker in automatic_markers):
        return 'Automat'
    if 'M/T' in transmission_str:
        return 'Manual'
    # Other gearbox types are deliberately ignored for this two-way split.
    return None

# Label every row as 'Automat' / 'Manual'; rows with any other gearbox type
# come back as None and are removed
df['transmission_type'] = df['transmission'].map(categorize_transmission)
df.dropna(subset=['transmission_type'], inplace=True)

# Remove the original 'transmission' column
df.drop(columns=['transmission'], inplace=True)

# Handle the categorical variables with one-hot encoding
categorical_features = [
    'brand',
    'model',
    'fuel_type',
    'transmission_type',
    'ext_col',
    'engine_fuel',
    'accident',
    'clean_title',
]
df = pd.get_dummies(
    df,
    columns=categorical_features,
    dummy_na=False,
    drop_first=True,
)

# Separate the prediction target from the feature matrix
target = 'price'
X = df.drop(columns=[target])
y = df[target]

# --- Inspect the data types ---
print("Typy danych w X:", X.dtypes)

# --- Split the data into training and test sets (20% held out, fixed seed) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# --- Check the training data for NaN values ---
features_have_nan = X_train.isna().any().any()
target_has_nan = y_train.isna().any()
print("Czy są NaN w X_train:", features_have_nan)
print("Czy są NaN w y_train:", target_has_nan)

# --- Feature scaling ---
# 'engine_cylinders' was removed from this list
numerical_features = ['milage', 'wiek', 'wiek_kwadrat', 'wiek_przebieg', 'engine_hp']
scaler = StandardScaler()
# The scaler is fitted on the training rows only; the test rows reuse that fit
train_scaled = scaler.fit_transform(X_train[numerical_features])
X_train[numerical_features] = train_scaled
test_scaled = scaler.transform(X_test[numerical_features])
X_test[numerical_features] = test_scaled

# --- Save the scaler ---
# Pickled to a local file first; that file is what gets uploaded below
scaler_filename = 'scaler_samochody.pkl'
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# --- Upload the scaler to Digital Ocean Spaces ---
# Best effort: a failed upload is reported and the script carries on
scaler_key = 'scaler_samochody.pkl'
try:
    s3.upload_file(Filename=scaler_filename, Bucket=bucket_name, Key=scaler_key)
except Exception as upload_error:
    print(f"Wystąpił błąd podczas zapisywania scalera: {upload_error}")
else:
    print(f"Scaler został pomyślnie zapisany do {bucket_name}/{scaler_key}")

# --- Train the model ---
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# --- Evaluate the model on the held-out test set ---
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# --- Feature-importance analysis ---
# One row per feature column, ordered from most to least important
feature_importances = model.feature_importances_
importances_df = pd.DataFrame(
    {'feature': X.columns, 'importance': feature_importances}
)
importances_df = importances_df.sort_values(by='importance', ascending=False)
print("\n--- Ważność cech ---")
print(importances_df)

# --- Save the model ---
# Pickled to a local file first; that file is what gets uploaded below
model_filename = 'model_samochody.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# --- Upload the model to Digital Ocean Spaces ---
# Best effort: a failed upload is reported and the script carries on
try:
    s3.upload_file(Filename=model_filename, Bucket=bucket_name, Key=model_output_key)
except Exception as upload_error:
    print(f"Wystąpił błąd podczas zapisywania modelu: {upload_error}")
else:
    print(f"Model został pomyślnie zapisany do {bucket_name}/{model_output_key}")

Kolumny w df po wczytaniu: ['brand', 'model', 'model_year', 'milage', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title', 'price', 'wiek', 'engine_hp', 'engine_cylinders', 'engine_fuel', 'engine_liters_log']
Typy danych w X: milage                    float64
wiek                        int32
engine_hp                 float64
engine_cylinders          float64
engine_liters_log         float64
                           ...   
ext_col_Yellow               bool
engine_fuel_Electric         bool
engine_fuel_Flex Fuel        bool
engine_fuel_Gasoline         bool
accident_None reported       bool
Length: 90, dtype: object
Czy są NaN w X_train: False
Czy są NaN w y_train: False
Scaler został pomyślnie zapisany do usedcars/scaler_samochody.pkl
Mean Squared Error: 90577398.562999
R^2 Score: 0.8696906296768449

--- Ważność cech ---
                       feature    importance
6                wiek_przebieg  4.387973e-01
2                    engine_hp  3.017791e-01
4   