In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv('../../../data_set/train-data.csv', index_col=0)

In [26]:
df_clean = df.copy()

# Formattiamo bene 'Mileage'
df_clean['Mileage'] = df_clean['Mileage'].str.split(' ').str[0]
df_clean['Mileage'] = pd.to_numeric(df_clean['Mileage'], errors='coerce')

# Formattiamo bene 'Power'
df_clean['Power'] = df_clean['Power'].str.split(' ').str[0]
df_clean['Power'] = pd.to_numeric(df_clean['Power'], errors='coerce')

# Formattiamo bene 'New_Price'
df_clean['New_Price'] = df_clean['New_Price'].str.split(' ').str[0]
df_clean['New_Price'] = pd.to_numeric(df_clean['New_Price'], errors='coerce')

# Formattiamo bene 'New_Price'
df_clean['Engine'] = df_clean['Engine'].str.split(' ').str[0]
df_clean['Engine'] = pd.to_numeric(df_clean['Engine'], errors='coerce')

In [27]:
# Fattori di conversione da km/l a km/kg
conversion_factors = {
    'Petrol': 1/0.74,   # approssimativo
    'Diesel': 1/0.85,   # approssimativo
    'LPG': 1/0.51,      # approssimativo
    'CNG': 1,            # già in km/kg
    'Electric': np.nan   # non convertibile
}

# Funzione di conversione per riga
def convert_kmpl_to_kmpkg(row):
    fuel = row['Fuel_Type']
    mileage = row['Mileage']
    # Se il valore non è valido o carburante non convertibile
    if pd.isna(mileage) or fuel not in conversion_factors:
    # if pd.isna(mileage) or mileage <= 0 or fuel not in conversion_factors:
        return np.nan
    factor = conversion_factors[fuel]
    return mileage / factor

# Applico la conversione
df_clean['Mileage_kmpkg'] = df_clean.apply(convert_kmpl_to_kmpkg, axis=1)

# Calcolo la mediana dei valori validi
median_value = df_clean['Mileage_kmpkg'].median()

# Riempio i valori NaN (inclusi quelli inizialmente NaN o <=0)
df_clean['Mileage_kmpkg'] = df_clean['Mileage_kmpkg'].fillna(median_value)

df_clean['Mileage'] = df_clean['Mileage_kmpkg']

df_clean.drop(columns=['Mileage_kmpkg'], inplace=True)

In [28]:
df_clean = df_clean.drop('New_Price', axis=1)

cols_to_impute = ['Power', 'Mileage', 'Engine', 'Seats']
df[cols_to_impute] = df_clean[cols_to_impute].replace(0.0, np.nan)

for col in cols_to_impute:
    median_value = df_clean[col].median()
    df_clean[col] = df_clean[col].fillna(median_value)

df_clean['Engine'] = df_clean['Engine'].astype('int64')
df_clean['Seats'] = df_clean['Seats'].astype('int64')

df_clean['Brand'] = df_clean['Name'].str.split(' ').str[0]
df_clean['Model'] = df_clean['Name'].str.split(' ').str[1]
df_clean = df_clean.drop('Name', axis=1)

df_clean.drop_duplicates(inplace=True)

In [29]:
first_cols = ['Brand', 'Model']

remaining_cols = [col for col in df_clean.columns if col not in first_cols]

df_clean = df_clean[first_cols + remaining_cols]

df_clean.head()

Unnamed: 0,Brand,Model,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti,Wagon,Mumbai,2010,72000,CNG,Manual,First,26.6,998,58.16,5,1.75
1,Hyundai,Creta,Pune,2015,41000,Diesel,Manual,First,16.7195,1582,126.2,5,12.5
2,Honda,Jazz,Chennai,2011,46000,Petrol,Manual,First,13.468,1199,88.7,5,4.5
3,Maruti,Ertiga,Chennai,2012,87000,Diesel,Manual,First,17.6545,1248,88.76,7,6.0
4,Audi,A4,Coimbatore,2013,40670,Diesel,Automatic,Second,12.92,1968,140.8,5,17.74
