# IA PARA CIENCIA DE DATOS

## Contenido

1. Packaging
1. Read file and load data
1. ML methods
1. Metrics

## 1. Packaging

In [1]:
# Core libraries
import warnings
warnings.filterwarnings("ignore")
import numpy as np # to use numpy arrays instead of lists
import pandas as pd # DataFrame (table)

# Matplotlib and seaborn for charting
import matplotlib.pyplot as plt # to plot
import seaborn as sns # to plot

In [2]:
# sklearn for ML
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.neural_network import MLPRegressor

# Sklearn regression model evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

In [3]:
# Custom analysis modules
import sys
sys.path.append('../models')
from analysis_modules import *

## 2. Read file and load data

### 2.1 Read file

In [4]:
# Load data
data_file = "C:/Users/dark_/OneDrive/Documentos/01_Escuela/09_Septimo semestre/llamenadios/data/raw/BMW sales data (2010-2024.csv"
datos = pd.read_csv(data_file, header=0)

In [5]:
# View data information
describeData(datos)

      Model  Year         Region  Color Fuel_Type Transmission  Engine_Size_L  \
0  5 Series  2016           Asia    Red    Petrol       Manual            3.5   
1        i8  2013  North America    Red    Hybrid    Automatic            1.6   
2  5 Series  2022  North America   Blue    Petrol    Automatic            4.5   
3        X3  2024    Middle East   Blue    Petrol    Automatic            1.7   
4  7 Series  2020  South America  Black    Diesel       Manual            2.1   

   Mileage_KM  Price_USD  Sales_Volume Sales_Classification  
0      151748      98740          8300                 High  
1      121671      79219          3428                  Low  
2       10991     113265          6994                  Low  
3       27255      60971          4047                  Low  
4      122131      49898          3080                  Low  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                Non

### 2.2 Preprocessing

In [6]:
def processData(dataSet=0):
    """
    Normalize  data
    """
    newData = pd.DataFrame()

    newData['Year'] = dataSet['Year']
    newData['Fuel_Type'] = dataSet['Fuel_Type']
    newData['Engine_Size_L'] = dataSet['Engine_Size_L']
    newData['Mileage_KM'] = dataSet['Mileage_KM']
    newData['age'] = 2025 - dataSet['Year']
    newData['KM_per_year'] = dataSet['Mileage_KM'] / newData['age']
    


    newDataset = pd.get_dummies(newData, columns=['Fuel_Type'], drop_first=False)

    scaler = preprocessing.MinMaxScaler()
    scaler.fit(newDataset)

    # Transform the data using the fitted scaler
    np_dataSet = scaler.transform(newDataset)

    # Convert the numpy array back to a DataFrame
    newDataset = pd.DataFrame(np_dataSet, columns=newDataset.columns, index=newDataset.index)

    # Fill NaN values after rolling mean
    newDataset = newDataset.fillna(method='bfill')

    return newDataset

In [7]:
# Preprocess data
misDatos = processData(datos)
print(misDatos.head())
print()

       Year  Engine_Size_L  Mileage_KM       age  KM_per_year  \
0  0.428571       0.571429    0.758752  0.571429     0.084307   
1  0.214286       0.028571    0.608361  0.785714     0.050698   
2  0.857143       0.857143    0.054942  0.142857     0.018318   
3  1.000000       0.057143    0.136265  0.000000     0.136280   
4  0.714286       0.171429    0.610661  0.285714     0.122136   

   Fuel_Type_Diesel  Fuel_Type_Electric  Fuel_Type_Hybrid  Fuel_Type_Petrol  
0               0.0                 0.0               0.0               1.0  
1               0.0                 0.0               1.0               0.0  
2               0.0                 0.0               0.0               1.0  
3               0.0                 0.0               0.0               1.0  
4               1.0                 0.0               0.0               0.0  



### 2.3 Correlation analysis

### 2.4 Split Data

In [9]:
# --------------------
# split data
def splitDataSet(dataSet=0, test_size=.2, randSplit=True):
    """
    Split data in train and test sets
    """

    train, test = train_test_split(dataSet, test_size=test_size, shuffle=randSplit, random_state=0)

    return [train, test]

In [11]:
# Split data into training and test sets
ts_size = .3
[trainSet, testSet] = splitDataSet(misDatos, test_size=ts_size, randSplit=True)
print("Train set")
print(trainSet.head())
print()
print('trainSet shape: ', trainSet.shape)
print()

print("Test set")
print(testSet.head())
print()
print('testSet shape: ', testSet.shape)
print()

Train set
           Year  Engine_Size_L  Mileage_KM       age  KM_per_year  \
17967  0.642857       0.171429    0.891541  0.357143     0.148593   
32391  0.214286       0.000000    0.643863  0.785714     0.053656   
9341   0.928571       0.571429    0.973364  0.071429     0.486694   
7929   0.785714       0.942857    0.096698  0.214286     0.024178   
46544  0.500000       0.742857    0.712880  0.500000     0.089112   

       Fuel_Type_Diesel  Fuel_Type_Electric  Fuel_Type_Hybrid  \
17967               0.0                 0.0               0.0   
32391               0.0                 0.0               0.0   
9341                1.0                 0.0               0.0   
7929                0.0                 0.0               1.0   
46544               0.0                 0.0               0.0   

       Fuel_Type_Petrol  
17967               1.0  
32391               1.0  
9341                0.0  
7929                0.0  
46544               1.0  

trainSet shape:  (35000, 9)

## 3. ML methods

In [12]:
# Create dataframes for training and test sets
# with the target variables only
yVar = 'KM_per_year'
xVar = list(misDatos.columns)
xVar.remove(yVar)
#xVar = ['Year', 'Mileage_km']
print("xVar: ", xVar)
print("yVar: ", yVar)
print()

trainModel = pd.DataFrame()
testModel = pd.DataFrame()
trainModel[yVar] = trainSet[yVar]
testModel[yVar] = testSet[yVar]
print(trainModel.head())
print()
print(testModel.head())

xVar:  ['Year', 'Engine_Size_L', 'Mileage_KM', 'age', 'Fuel_Type_Diesel', 'Fuel_Type_Electric', 'Fuel_Type_Hybrid', 'Fuel_Type_Petrol']
yVar:  KM_per_year

       KM_per_year
17967     0.148593
32391     0.053656
9341      0.486694
7929      0.024178
46544     0.089112

       KM_per_year
11841     0.067671
19602     0.131018
45519     0.136830
25747     0.440044
42642     0.058975


In [25]:
model_name = 'DT'
columnsName = [model_name + ' ' + yVar]
print("columnsName: ", columnsName)

columnsName:  ['DT KM_per_year']


In [13]:
# Perform regression with DT
classifier_dt = DecisionTreeRegressor(max_depth=5,random_state=0)
classifier_dt.fit(trainSet[xVar], trainSet[yVar])
print("Estimated Parameters:")
print(f"Max depth of trees (max_depth): {classifier_dt.get_depth()}")
print(f"Number of leaves (n_leaves): {classifier_dt.get_n_leaves()}")
print("\n")

trainModel['DT KM_per_year'] =classifier_dt.predict(trainSet[xVar])
print(trainModel)
print()
testModel['DT KM_per_year'] = classifier_dt.predict(testSet[xVar])
print(testModel)


Estimated Parameters:
Max depth of trees (max_depth): 5
Number of leaves (n_leaves): 32


       KM_per_year  DT KM_per_year
17967     0.148593        0.146803
32391     0.053656        0.058723
9341      0.486694        0.438365
7929      0.024178        0.024097
46544     0.089112        0.084211
...            ...             ...
21243     0.085144        0.064461
45891     0.036686        0.034403
42613     0.095912        0.084211
43567     0.012238        0.007570
2732      0.053548        0.034403

[35000 rows x 2 columns]

       KM_per_year  DT KM_per_year
11841     0.067671        0.084211
19602     0.131018        0.110124
45519     0.136830        0.110124
25747     0.440044        0.438365
42642     0.058975        0.034403
...            ...             ...
38344     0.014855        0.021522
49984     0.040469        0.021522
32624     0.264819        0.315507
46437     0.070432        0.064461
35070     0.066247        0.084211

[15000 rows x 2 columns]


In [14]:
# Perform regression with RF
classifier_rf = RandomForestRegressor(n_estimators=3, max_depth=5, random_state=0)
classifier_rf.fit(trainSet[xVar], trainSet[yVar])

trainModel['RF KM_per_year'] = classifier_rf.predict(trainSet[xVar])
print(trainModel)
print()

testModel['RF KM_per_year'] = classifier_rf.predict(testSet[xVar])
print(testModel)

       KM_per_year  DT KM_per_year  RF KM_per_year
17967     0.148593        0.146803        0.143559
32391     0.053656        0.058723        0.056750
9341      0.486694        0.438365        0.432878
7929      0.024178        0.024097        0.022605
46544     0.089112        0.084211        0.088260
...            ...             ...             ...
21243     0.085144        0.064461        0.064082
45891     0.036686        0.034403        0.034273
42613     0.095912        0.084211        0.080691
43567     0.012238        0.007570        0.007522
2732      0.053548        0.034403        0.034273

[35000 rows x 3 columns]

       KM_per_year  DT KM_per_year  RF KM_per_year
11841     0.067671        0.084211        0.088260
19602     0.131018        0.110124        0.106427
45519     0.136830        0.110124        0.106427
25747     0.440044        0.438365        0.432878
42642     0.058975        0.034403        0.034273
...            ...             ...             ...
3834

In [15]:
# Perform regression with SVM
classifier_svm = svm.SVR(kernel='rbf', degree=3)
classifier_svm.fit(trainSet[xVar], trainSet[yVar])

trainModel['SVM KM_per_year'] = classifier_svm.predict(trainSet[xVar])
print(trainModel)
print()
testModel['SVM KM_per_year'] = classifier_svm.predict(testSet[xVar])
print(testModel)


       KM_per_year  DT KM_per_year  RF KM_per_year  SVM KM_per_year
17967     0.148593        0.146803        0.143559         0.049277
32391     0.053656        0.058723        0.056750         0.134069
9341      0.486694        0.438365        0.432878         0.596500
7929      0.024178        0.024097        0.022605        -0.039412
46544     0.089112        0.084211        0.088260         0.047351
...            ...             ...             ...              ...
21243     0.085144        0.064461        0.064082        -0.014352
45891     0.036686        0.034403        0.034273         0.113342
42613     0.095912        0.084211        0.080691         0.101666
43567     0.012238        0.007570        0.007522        -0.036611
2732      0.053548        0.034403        0.034273         0.080072

[35000 rows x 4 columns]

       KM_per_year  DT KM_per_year  RF KM_per_year  SVM KM_per_year
11841     0.067671        0.084211        0.088260         0.052336
19602     0.131018   

In [16]:
# Perform regression with MLP
classifier_mlp = MLPRegressor(hidden_layer_sizes=(130,30), solver='sgd', max_iter=400, random_state=0)
classifier_mlp.fit(trainSet[xVar], trainSet[yVar])

trainModel['MLP KM_per_year'] = classifier_mlp.predict(trainSet[xVar])
print(trainModel)
print()
testModel['MLP KM_per_year'] = classifier_mlp.predict(testSet[xVar])
print(testModel)

       KM_per_year  DT KM_per_year  RF KM_per_year  SVM KM_per_year  \
17967     0.148593        0.146803        0.143559         0.049277   
32391     0.053656        0.058723        0.056750         0.134069   
9341      0.486694        0.438365        0.432878         0.596500   
7929      0.024178        0.024097        0.022605        -0.039412   
46544     0.089112        0.084211        0.088260         0.047351   
...            ...             ...             ...              ...   
21243     0.085144        0.064461        0.064082        -0.014352   
45891     0.036686        0.034403        0.034273         0.113342   
42613     0.095912        0.084211        0.080691         0.101666   
43567     0.012238        0.007570        0.007522        -0.036611   
2732      0.053548        0.034403        0.034273         0.080072   

       MLP KM_per_year  
17967         0.210238  
32391         0.032033  
9341          0.470123  
7929          0.029545  
46544         0.112740

## 4. Metrics

In [17]:
# Create dataframes to store model performance metrics
methodsUsed=['DT', 'RF', 'SVM', 'MLP']
performanceHeaders=['MAE', 'MSE', 'RMSE', 'MAPE', 'R2']
trainModelMetrics = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
testModelMetrics = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)

In [18]:
def calculateMetrics(model_name, position, yVar, trainModel, testModel, trainModelMetrics, testModelMetrics):
    columnsNames = [model_name + ' KM_per_year']

    # --- Train Metrics ---
    trainModelMetrics.iloc[position, 0] = round(mean_absolute_error(trainModel[yVar], trainModel[columnsNames]), 2)
    trainModelMetrics.iloc[position, 1] = round(mean_squared_error(trainModel[yVar], trainModel[columnsNames]), 2)
    trainModelMetrics.iloc[position, 2] = round(np.sqrt(trainModelMetrics.iloc[position, 1]), 2)
    trainModelMetrics.iloc[position, 3] = round(mean_absolute_percentage_error(trainModel[yVar], trainModel[columnsNames]), 2)
    trainModelMetrics.iloc[position, 4] = round(r2_score(trainModel[yVar], trainModel[columnsNames]), 2)

    # --- Test Metrics ---
    testModelMetrics.iloc[position, 0] = round(mean_absolute_error(testModel[yVar], testModel[columnsNames]), 2)
    testModelMetrics.iloc[position, 1] = round(mean_squared_error(testModel[yVar], testModel[columnsNames]), 2)
    testModelMetrics.iloc[position, 2] = round(np.sqrt(testModelMetrics.iloc[position, 1]), 2)
    testModelMetrics.iloc[position, 3] = round(mean_absolute_percentage_error(testModel[yVar], testModel[columnsNames]), 2)
    testModelMetrics.iloc[position, 4] = round(r2_score(testModel[yVar], testModel[columnsNames]), 2)

    return trainModelMetrics, testModelMetrics

In [19]:
# DT metrics
model_name = 'DT'
position = 0  # Position for DT in the results DataFrame
trainModelMetrics, testModelMetrics = calculateMetrics(model_name, position, yVar, trainModel, testModel, trainModelMetrics, testModelMetrics)

# RF metrics
model_name = 'RF'
position = 1  # Position for RF in the results DataFrame
trainModelMetrics, testModelMetrics = calculateMetrics(model_name, position, yVar, trainModel, testModel, trainModelMetrics, testModelMetrics)

# SVM metrics
model_name = 'SVM'
position = 2  # Position for SVM in the results DataFrame
trainModelMetrics, testModelMetrics = calculateMetrics(model_name, position, yVar, trainModel, testModel, trainModelMetrics, testModelMetrics)

# MLP metrics
model_name = 'MLP'
position = 3  # Position for MLP in the results DataFrame
trainModelMetrics, testModelMetrics = calculateMetrics(model_name, position, yVar, trainModel, testModel, trainModelMetrics, testModelMetrics)

In [20]:
print("=======================================================================")
print("                      MODEL PERFORMANCE COMPARISON")
print("-----------------------------------------------------------------------")
print("                      Training")
print("-----------------------------------------------------------------------")
print(trainModelMetrics)
print("-----------------------------------------------------------------------")
print("                      Testing")
print("-----------------------------------------------------------------------")
print(testModelMetrics)
print("\n")

                      MODEL PERFORMANCE COMPARISON
-----------------------------------------------------------------------
                      Training
-----------------------------------------------------------------------
      MAE   MSE RMSE           MAPE    R2
DT   0.01   0.0  0.0   974115181.06  0.98
RF   0.01   0.0  0.0   967858265.62  0.99
SVM  0.06   0.0  0.0  7838104126.58  0.82
MLP  0.04  0.01  0.1  2345388292.11  0.78
-----------------------------------------------------------------------
                      Testing
-----------------------------------------------------------------------
      MAE   MSE RMSE  MAPE    R2
DT   0.01   0.0  0.0   0.6  0.98
RF   0.01   0.0  0.0  0.58  0.99
SVM  0.06   0.0  0.0  5.03  0.81
MLP  0.04  0.01  0.1  1.45  0.78


