In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import re


In [71]:
# Load the raw car-details CSV; the path is relative (one directory up, inside `datasets/`).
data = pd.read_csv('../datasets/Car details v3.csv')

# Primer paso: dividir el conjunto de datos en train y test, simulando un escenario real

Nuestro objetivo es predecir el selling_price

In [72]:
# Features: every column except the target.
X = data.drop(columns='selling_price')
# Target: the price we want to predict.
y = data['selling_price']

Dividimos nuestro conjunto de datos en train y test, dejando el 30% para hacer pruebas

In [73]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Cleaning

## Tipo de datos

In [74]:
# Print column dtypes and non-null counts to spot text columns and missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5689 entries, 6783 to 7270
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          5689 non-null   object 
 1   year          5689 non-null   int64  
 2   km_driven     5689 non-null   int64  
 3   fuel          5689 non-null   object 
 4   seller_type   5689 non-null   object 
 5   transmission  5689 non-null   object 
 6   owner         5689 non-null   object 
 7   mileage       5538 non-null   object 
 8   engine        5538 non-null   object 
 9   max_power     5541 non-null   object 
 10  torque        5537 non-null   object 
 11  seats         5538 non-null   float64
dtypes: float64(1), int64(2), object(9)
memory usage: 577.8+ KB


In [75]:
# Split `name` at its first space only: the leading word goes to `make`, the rest to `model`.
X_train[['make', 'model']] = X_train['name'].str.split(pat=' ', n=1, expand=True)

In [76]:
def impute_mileage_with_equivalent_car(df):
  """Replace '0.0 kmpl' placeholder mileages with the mileage of an equivalent car.

  An equivalent car is another row with the same `name`, `year` and `fuel`
  whose own mileage is neither missing nor the '0.0 kmpl' placeholder.
  When several donors exist, the first one in row order is used; when none
  exists the placeholder is left untouched.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'name', 'year', 'fuel' and 'mileage'.

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in; it is modified in place.
  """
  placeholder = '0.0 kmpl'
  needs_imputation = df['mileage'] == placeholder

  # Only rows carrying a real value may act as donors. Without this filter the
  # lookup always matches the placeholder row itself (and any sibling
  # placeholders), so the "imputed" value could again be '0.0 kmpl'.
  is_usable = df['mileage'].notna() & ~needs_imputation

  for idx, row in df[needs_imputation].iterrows():
    donors = df.loc[
      is_usable
      & (df['name'] == row['name'])
      & (df['year'] == row['year'])
      & (df['fuel'] == row['fuel']),
      'mileage',
    ]

    if not donors.empty:
      df.at[idx, 'mileage'] = donors.iloc[0]
  return df

# The helper edits the frame it receives and returns that same frame, so this
# re-assigns X_train's own (already updated) `mileage` column.
X_train['mileage'] = impute_mileage_with_equivalent_car(X_train)['mileage']

In [77]:
def convert_to_kmpl(row):
    """Return the row's mileage as a float expressed in kmpl, or None.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'fuel' and 'mileage' (e.g. a DataFrame row).

    Returns
    -------
    float or None
        * Diesel / Petrol: the number that precedes "kmpl".
        * CNG / LPG: the number that precedes "km/kg", multiplied by 1.39.
        * None when fuel or mileage is missing / not a string, when the fuel
          is none of the four above, or when the unit does not match the fuel.
    """
    kmpl_rx = r"(\d*\.?\d+)\s*(kmpl)"
    kmkg_rx = r"(\d*\.?\d+)\s*(km/kg)"

    # Factor applied to km/kg figures to express them in kmpl.
    conversion_factor = 1.39

    fuel = row['fuel']
    mileage = row['mileage']

    # Missing values show up as NaN/None rather than text; return None instead
    # of letting `.lower()` or `re.search` raise on a non-string.
    if not isinstance(fuel, str) or not isinstance(mileage, str):
        return None

    fuel = fuel.lower()
    mileage_value = None

    # Diesel or Petrol quoted in kmpl: just extract the number.
    if fuel in ['diesel', 'petrol']:
        match = re.search(kmpl_rx, mileage)
        if match:
            mileage_value = float(match.group(1))

    # CNG or LPG quoted in km/kg: extract the number and convert it to kmpl.
    elif fuel in ['cng', 'lpg']:
        match = re.search(kmkg_rx, mileage)
        if match:
            kmkg_value = float(match.group(1))
            mileage_value = kmkg_value * conversion_factor

    return mileage_value

# Convert only the rows that actually have a mileage string; on assignment the
# result is aligned by index, so the remaining rows end up as NaN.
mileage_non_null = X_train.loc[X_train['mileage'].notna()].copy()
X_train['mileage_kmpl'] = mileage_non_null.apply(convert_to_kmpl, axis=1)

# Display-only preview: the result is not assigned, so X_train keeps `mileage` here.
X_train.drop(columns=['mileage'])

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,engine,max_power,torque,seats,make,model,mileage_kmpl
6783,Maruti Alto LXi BSIII,2006,120000,Petrol,Individual,Manual,Third Owner,796 CC,46.3 bhp,62Nm@ 3000rpm,5.0,Maruti,Alto LXi BSIII,19.70
1073,Toyota Fortuner 2.8 2WD AT BSIV,2018,100000,Diesel,Dealer,Automatic,First Owner,2755 CC,174.5 bhp,450Nm@ 1600-2400rpm,7.0,Toyota,Fortuner 2.8 2WD AT BSIV,12.90
7756,BMW 3 Series 320d GT Luxury Line,2017,39000,Diesel,Dealer,Automatic,First Owner,1995 CC,187.74 bhp,380Nm@ 1750-2750rpm,5.0,BMW,3 Series 320d GT Luxury Line,19.59
144,Toyota Corolla Altis 1.8 G CVT,2013,39000,Petrol,Dealer,Automatic,First Owner,1798 CC,138.03 bhp,173Nm@ 4000rpm,5.0,Toyota,Corolla Altis 1.8 G CVT,14.28
6424,Maruti Swift Dzire VDI,2013,70000,Diesel,Individual,Manual,Second Owner,1248 CC,74 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift Dzire VDI,23.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,Mahindra Scorpio 2006-2009 VLX 2WD 7 Str BSIII,2009,120000,Diesel,Individual,Manual,First Owner,2179 CC,120 bhp,290Nm@ 1800-2800rpm,7.0,Mahindra,Scorpio 2006-2009 VLX 2WD 7 Str BSIII,12.05
5390,Maruti Swift Dzire VDI,2014,80000,Diesel,Individual,Manual,Second Owner,1248 CC,74 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift Dzire VDI,23.40
860,Hyundai i20 Asta Option 1.2,2016,35000,Petrol,Individual,Manual,First Owner,1197 CC,81.83 bhp,114.7Nm@ 4000rpm,5.0,Hyundai,i20 Asta Option 1.2,18.60
7603,Maruti Swift Dzire VDI,2019,27000,Diesel,Individual,Manual,First Owner,1248 CC,74.02 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift Dzire VDI,28.40


In [79]:
# Number of distinct values in `make`, with a missing-value entry (if any) included in the count.
X_train['make'].nunique(dropna=False)


30

In [80]:
# Number of distinct values in `model`, with a missing-value entry (if any) included in the count.
X_train['model'].nunique(dropna=False)

1782

In [57]:
def impute_max_power_with_equivalent_car(df):
  """Replace placeholder `max_power` values with the value of an equivalent car.

  Rows whose `max_power` is exactly 'bhp' or '0' are treated as placeholders.
  An equivalent car is another row with the same `name`, `year` and `fuel`
  whose own `max_power` is neither missing nor a placeholder. When several
  donors exist, the first one in row order is used; when none exists the
  placeholder is left untouched.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns 'name', 'year', 'fuel' and 'max_power'.

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in; it is modified in place.
  """
  needs_imputation = df['max_power'].isin(['bhp', '0'])

  # Only rows carrying a real value may act as donors. Without this filter the
  # lookup always matches the placeholder row itself, so the "imputed" value
  # could again be a placeholder.
  is_usable = df['max_power'].notna() & ~needs_imputation

  for idx, row in df[needs_imputation].iterrows():
    donors = df.loc[
      is_usable
      & (df['name'] == row['name'])
      & (df['year'] == row['year'])
      & (df['fuel'] == row['fuel']),
      'max_power',
    ]

    if not donors.empty:
      df.at[idx, 'max_power'] = donors.iloc[0]
  return df

# The helper edits the frame it receives and returns that same frame, so this
# re-assigns X_train's own (already updated) `max_power` column.
X_train['max_power'] = impute_max_power_with_equivalent_car(X_train)['max_power']

In [58]:
def standardize_torque(torque_str):
    """Parse a free-text torque specification into a value and an RPM.

    The text is lower-cased and tried against a list of patterns in order;
    the first pattern that matches wins.

    Parameters
    ----------
    torque_str : str or NaN
        Raw torque text such as "190Nm@ 2000rpm" or "22.4 kgm at 1750-2750rpm".

    Returns
    -------
    dict
        ``{'torque_peak_power': ..., 'torque_peak_speed': ...}`` where
        * 'torque_peak_power' is the leading number as a float, multiplied by
          9.81 whenever "kgm" appears anywhere in the text;
        * 'torque_peak_speed' is the largest integer found in the RPM part
          (commas removed), or None when that part holds no digits.
        Both entries are NaN for a missing input and None when no pattern matches.
    """
    if pd.isna(torque_str):
        return {'torque_peak_power': np.nan, 'torque_peak_speed': np.nan}

    # Regex patterns for the different torque formats, tried in this order.
    patterns = [
        r"(\d*\.?\d+)\s*(kgm|nm)?\s*@\s*([-\d\s,]+)\s*(rpm)?",
        r"(\d*\.?\d+)\s*(kgm|nm)?\s*at\s*([-\d\s,]+)\s*(rpm)?",
        r"(\d*\.?\d+)\s*@\s*([-\d\s,]+)\s*\(kgm@\s*rpm\)",
        r"(\d*\.?\d+)\s*kgm\s*at\s*([-\d\s,]+)\s*(rpm)?",
        r"(\d*\.?\d+)\s*\((\d*\.?\d+)\s*kgm\)\s*@\s*([-\d\s,]+)\s*(rpm)?",
        r"(\d*\.?\d+)\s*\((\d*\.?\d+)\)\s*@\s*([-\d\s,]+)\s*(rpm)?",
        r"(\d*\.?\d+)\s*(kgm|nm)?\s*/\s*([-\d\s,]+)\s*(rpm)?",
        r"(\d*\.?\d+)\s*/\s*([-\d\s,]+)"
    ]

    torque_peak_power = None
    torque_peak_speed = None
    text = torque_str.lower()

    for pattern in patterns:
        match = re.findall(pattern, text)
        if not match:
            continue

        # Unpack according to how many capture groups this pattern has.
        groups = match[0]
        if len(groups) == 4:
            value, unit, rpm_range, _ = groups
        elif len(groups) == 3:
            value, unit, rpm_range = groups
        elif len(groups) == 2:
            value, rpm_range = groups
            unit = None
        else:
            continue

        value = float(value)

        # Convert kgm to Nm when the text (or the captured unit) mentions kgm.
        if 'kgm' in text or (unit and 'kgm' in unit):
            value *= 9.81

        torque_peak_power = value

        # The RPM capture may be "2000", "1,750-2,750", or degenerate text such
        # as a lone space or a dangling dash. Collect the digit runs rather than
        # calling int() on the raw pieces, which raised ValueError on the
        # degenerate cases; for a range, keep the upper bound.
        rpm_values = [int(digits) for digits in re.findall(r'\d+', rpm_range.replace(',', ''))]
        if rpm_values:
            torque_peak_speed = max(rpm_values)

        break  # Stop after the first matching pattern

    return {'torque_peak_power': torque_peak_power, 'torque_peak_speed': torque_peak_speed}

In [59]:
# Keep only the digits and dots of each engine string and turn the result
# into a number; rows without an engine value are left as NaN by index alignment.
engine_non_null = X_train.loc[X_train['engine'].notna()].copy()
X_train['engine_cc'] = pd.to_numeric(
    engine_non_null['engine'].str.replace(r'[^0-9.]+', '', regex=True),
    errors='coerce',
)

In [60]:
# Keep only the digits and dots of each max_power string and store the result
# as a number; anything that is not numeric afterwards is coerced to NaN.
max_power_non_null = X_train[X_train['max_power'].notnull()].copy()
# Read from max_power_non_null (built from the current X_train). The previous
# source, mileage_non_null, was filtered on `mileage` and copied before the
# max_power imputation, so it carried stale values and the wrong row subset.
X_train['max_power_bhp'] = max_power_non_null['max_power'].str.replace(r'[^0-9.]+', '', regex=True)
X_train['max_power_bhp'] = pd.to_numeric(X_train['max_power_bhp'], errors='coerce')

In [61]:
# Parse every raw torque string once; each result is a dict with two entries.
torque_results = X_train['torque'].apply(standardize_torque)

# Unpack the two dict entries into their own columns.
X_train['torque_peak_power'] = torque_results.map(lambda parsed: parsed['torque_peak_power'])
X_train['torque_peak_speed'] = torque_results.map(lambda parsed: parsed['torque_peak_speed'])

In [62]:
# Discard the raw text columns now that derived columns have been built from them.
X_train = X_train.drop(
    ['name', 'mileage', 'engine', 'max_power', 'torque'],
    axis=1,
)

In [63]:
# Ordinal code for each ownership label, numbered from 1 in the order listed.
owner_mapping = {
    label: code
    for code, label in enumerate(
        [
            'First Owner',
            'Second Owner',
            'Third Owner',
            'Fourth & Above Owner',
            'Test Drive Car',
        ],
        start=1,
    )
}

# Labels that are not keys of the mapping become NaN.
X_train['owner'] = X_train['owner'].map(owner_mapping)

In [None]:
# Round seat counts to whole numbers but keep the float dtype: `seats` still
# contains NaN at this point, and casting NaN with `.astype(int)` raises.
# The missing values are left in place for the imputation step further down.
X_train['seats'] = X_train['seats'].round()

In [None]:
# Display the frame after the cleaning steps above.
X_train

## Completar nulls categóricos

In [40]:
# Text-valued feature columns that get one-hot encoded.
categorical_cols = ['fuel', 'seller_type', 'transmission', 'make']

In [42]:
# Dense one-hot encoding; the first level of every feature is dropped.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Learn the categories and encode the categorical columns in one step.
encoded_columns = encoder.fit_transform(X_train[categorical_cols])

# Wrap the array in a frame that shares X_train's index so the concat lines up row by row.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X_train.index,
)

# Swap three of the raw categorical columns for their encoded versions;
# the raw `make` column is kept here alongside its encoded columns.
X_train = pd.concat(
    [X_train.drop(columns=['fuel', 'seller_type', 'transmission']), encoded_df],
    axis=1,
)

## Completar nulls numéricos

In [41]:
# Numeric feature columns to be scaled. 'selling_price' is intentionally not
# listed: it is the prediction target and was removed from X, so selecting it
# from the feature frame would fail.
numerical_cols = ['year', 'km_driven', 'owner', 'seats', 'mileage_kmpl', 'engine_cc', 'max_power_bhp']

In [None]:
# Work on a copy of X_train without the two text columns `model` and `make`.
data_for_imputation = X_train.drop(columns=['model', 'make'])

data_for_imputation

In [None]:
# Initialize and fit the imputer
imputer = IterativeImputer(random_state=42, max_iter=10, imputation_order='random')
X_train_imputed = imputer.fit_transform(data_for_imputation)

# Convert the imputed NumPy array back to a DataFrame
X_train_imputed_df = pd.DataFrame(X_train_imputed, columns=data_for_imputation.columns, index=X_train.index)

for column in ['mileage_kmpl', 'engine_cc', 'max_power_bhp', 'seats']:
    X_train[f'{column}_was_imputed'] = X_train[column].isnull()

# Update the original X_train with the imputed values
X_train.update(X_train_imputed_df)

X_train

# Feature engineering

# Feature transformation

Generar un pipeline con todos los pasos previos para aplicarlos a nuevos datos y también escalar los datos

In [None]:
# Column-wise preprocessing: scale the numeric features and one-hot encode the
# categorical ones. The encoder uses `sparse_output=False`, the same spelling as
# the standalone OneHotEncoder earlier in this notebook; the older `sparse=`
# keyword is no longer accepted by current scikit-learn releases.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Scale the numeric variables
        ('cat', OneHotEncoder(sparse_output=False, drop='first'), categorical_cols)  # One-hot encode the categorical variables
    ]
)