In [1]:
import pandas as pd
from pycaret.classification import setup, compare_models, finalize_model, save_model
import re
import numpy as np

# Load the input CSV file(s) and stack them into a single DataFrame
files = ["used_cars_clean.csv"]
dfs = list(map(pd.read_csv, files))
df = pd.concat(dfs, ignore_index=True)

# Vehicle age ('wiek') relative to a fixed reference year; 'model_year' is kept
current_year = 2025
df['wiek'] = df['model_year'].astype(int).rsub(current_year)

# Mileage: strip the ' mi.' unit suffix and thousands separators, then cast to float
milage_text = df['milage'].astype(str)
for token in (' mi.', ','):
    milage_text = milage_text.str.replace(token, '', regex=False)
df['milage'] = milage_text.astype(float)

# Price: strip the '$' sign and thousands separators, then cast to float
price_text = df['price'].astype(str)
for token in ('$', ','):
    price_text = price_text.str.replace(token, '', regex=False)
df['price'] = price_text.astype(float)

def extract_engine_features(engine_str):
    """Parse a free-text engine description into separate features.

    Parameters
    ----------
    engine_str : str
        Engine description such as
        "300.0HP 3.7L V6 Cylinder Engine Gasoline Fuel". Any non-string
        value (e.g. NaN for a missing description) is accepted and yields
        all-missing features instead of raising ``TypeError``.

    Returns
    -------
    pandas.Series
        Entries ``engine_hp`` (float), ``engine_liters`` (float),
        ``engine_cylinders`` (int) and ``engine_fuel`` (str, spelled as it
        appears in the text). An entry is missing (None/NaN) when the
        corresponding pattern is not found.
    """
    features = {
        'engine_hp': None,
        'engine_liters': None,
        'engine_cylinders': None,
        'engine_fuel': None,
    }

    # Missing descriptions arrive as float NaN; re.search would raise on them.
    if not isinstance(engine_str, str):
        return pd.Series(features)

    # One pattern covers "300HP", "300.0HP" and "301.5HP". A bare (\d+)HP
    # fallback would capture only the digits after the decimal point.
    hp_match = re.search(r'(\d+(?:\.\d+)?)HP', engine_str)
    if hp_match:
        features['engine_hp'] = float(hp_match.group(1))

    # Displacement directly followed by "L"; the decimal part is optional and
    # may have any number of digits ("3L", "3.7L", "3.75L").
    liters_match = re.search(r'(\d+(?:\.\d+)?)L', engine_str)
    if liters_match:
        features['engine_liters'] = float(liters_match.group(1))

    # Keywords are tried in priority order and the first one found wins.
    # "<n> Cylinder" also covers "<n> Cylinder Engine".
    for keyword in ('Cylinder', 'Straight', 'Flat', 'Rotary'):
        cylinder_match = re.search(r'(\d+) ' + keyword, engine_str)
        if cylinder_match:
            features['engine_cylinders'] = int(cylinder_match.group(1))
            break

    fuel_type_match = re.search(
        r'(Gasoline|Flex Fuel|Hybrid|Plug-In Hybrid|Diesel|Electric|Hydrogen)',
        engine_str,
        re.IGNORECASE,
    )
    if fuel_type_match:
        features['engine_fuel'] = fuel_type_match.group(1)

    return pd.Series(features)

# Expand the free-text 'engine' column into parsed feature columns, then drop the raw text
df[['engine_hp', 'engine_liters', 'engine_cylinders', 'engine_fuel']] = df['engine'].apply(extract_engine_features)
df = df.drop(columns='engine')

# Impute missing horsepower with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the df['engine_hp'] selection is chained assignment, which pandas 2.x warns
# about and which does not update df under Copy-on-Write.
median_engine_hp = df['engine_hp'].median()
df['engine_hp'] = df['engine_hp'].fillna(median_engine_hp)

# Remove rows whose cylinder count is below 3 or above 10. The bounds are
# inclusive, so 3- and 10-cylinder engines are kept. Rows with a missing
# cylinder count fail the comparison and are removed as well.
# .copy() gives an independent frame so the column added below does not
# trigger SettingWithCopyWarning.
df = df[df['engine_cylinders'].between(3, 10)].copy()

# Log-transform displacement; the small epsilon keeps log() finite at zero
df['engine_liters_log'] = np.log(df['engine_liters'] + 1e-9)

# The raw displacement column is replaced by its log-transformed version
df = df.drop(columns='engine_liters')

# Drop rows that are missing the fuel type or the log displacement
df = df.dropna(subset=['engine_fuel', 'engine_liters_log'])

# Save the cleaned DataFrame to a CSV file
cleaned_filename = 'used_cars_clean_v2.csv'
df.to_csv(cleaned_filename, index=False)
print(f"Wyczyszczony DataFrame został zapisany do {cleaned_filename}")


Wyczyszczony DataFrame został zapisany do used_cars_clean_v2.csv
