Dataset link (Kaggle, "Vehicle dataset from CarDekho", file `car details v4.csv`): https://www.kaggle.com/datasets/nehalbirla/vehicle-dataset-from-cardekho?select=car+details+v4.csv

In [7]:
import pandas as pd

# Read the CarDekho listings from the local dataset folder, then print the
# per-column non-null counts and dtypes as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="dataset/car details_v4.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Make                2059 non-null   object 
 1   Model               2059 non-null   object 
 2   Price               2059 non-null   int64  
 3   Year                2059 non-null   int64  
 4   Kilometer           2059 non-null   int64  
 5   Fuel Type           2059 non-null   object 
 6   Transmission        2059 non-null   object 
 7   Location            2059 non-null   object 
 8   Color               2059 non-null   object 
 9   Owner               2059 non-null   object 
 10  Seller Type         2059 non-null   object 
 11  Engine              1979 non-null   object 
 12  Max Power           1979 non-null   object 
 13  Max Torque          1979 non-null   object 
 14  Drivetrain          1923 non-null   object 
 15  Length              1995 non-null   float64
 16  Width 

In [8]:
import pandas as pd

from preprocessing import DataMiningFootprint
from preprocessing import VariableCleaningFootstep

def main(csv_path: str = "dataset/car details_v4.csv") -> None:
    """Run the variable-cleaning step on the car listings and print the outcome.

    The CSV is loaded into a DataFrame, wrapped in a ``DataMiningFootprint``,
    and passed through a ``VariableCleaningFootstep`` configured with a column
    rename map and the dtypes expected after renaming. The resulting dtypes,
    the head of the cleaned frame, and the preprocessing log are printed.

    Args:
        csv_path: Location of the CSV file to load. The default points at the
            ``dataset/`` folder, the same location the earlier notebook cell
            reads this file from.
    """
    # 1. Load the Kaggle dataset
    df = pd.read_csv(csv_path)

    print("Original columns:")
    print(list(df.columns))

    # 2. Wrap it into our generic footprint
    fp = DataMiningFootprint(raw_df=df)

    # 3. Configure renaming + expected dtypes
    #    (expected_dtypes refers to the column NAMES AFTER the rename!)
    rename_map = {
        "Make": "make",
        "Model": "model",
        "Price": "price",
        "Year": "year",
        "Kilometer": "kilometers",
        "Fuel Type": "fuel_type",
        "Transmission": "transmission",
        "Location": "location",
        "Color": "color",
        "Owner": "owner",
        "Seller Type": "seller_type",
    }

    # NOTE(review): "make" and "model" are renamed above but have no entry
    # here — confirm the omission is intentional.
    expected_dtypes = {
        "price": "float64",       # int -> float (more convenient)
        "year": "int64",
        "kilometers": "int64",
        "fuel_type": "string",
        "transmission": "string",
        "location": "string",
        "color": "string",
        "owner": "string",
        "seller_type": "string",
    }

    cleaning_step = VariableCleaningFootstep(
        rename_map=rename_map,
        expected_dtypes=expected_dtypes,
        strict=False,   # to start with: log problems instead of blowing up
    )

    # 4. Run the step on the footprint
    fp = cleaning_step.run(fp)

    # 5. Inspect results
    print("\n=== dtypes after cleaning ===")
    print(fp.cleaned_df.dtypes)

    print("\n=== head of cleaned_df ===")
    print(fp.cleaned_df.head())

    print("\n=== preprocessing log ===")
    for line in fp.preprocessing_log:
        print(" -", line)


# Run the demo only when this code executes as the top-level module
# (a notebook cell or a directly invoked script), not when it is imported.
if __name__ == "__main__":
    main()


ImportError: attempted relative import with no known parent package