In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna
from optuna.trial import TrialState
import joblib

In [None]:
# Load the raw listings and rename a few columns to the names used by the rest of the notebook.
column_renames = {
    "year": "entry_year",
    "title_status": "vehicle_status",
    "size": "vehicle_size",
    "type": "vehicle_type",
}
cars = pd.read_csv("./used_cars.csv").rename(columns=column_renames)
cars.head()

## Cleaning the data
### Removing duplicates and irrelevant columns

In [None]:
print(f"Length before removing duplicates: {len(cars)}")

# Irrelevant columns are dropped in one go. "paint_color" and "state" are included because
# the correlation heatmap further down showed no correlation for them.
columns_to_discard = [
    "id", "region", "VIN", "county", "lat", "long", "posting_date",
    "paint_color", "state",
]
clean_cars = cars.drop(columns=columns_to_discard).drop_duplicates(keep='first')

print(f"Length after removing duplicates: {len(clean_cars)}")

### Dealing with missing values

In [None]:
def print_null_values_count_per_column(dataframe):
    """Print one "nulls in <column>: <count>" line for every column of `dataframe`.

    Lines are separated by a comma followed by a newline; the last line has no
    trailing comma.
    """
    report_lines = [
        f"nulls in {column}: {len(dataframe[dataframe[column].isnull()])}"
        for column in dataframe.columns
    ]
    print(",\n".join(report_lines))

# Report the per-column null counts before any missing-value handling.
print_null_values_count_per_column(clean_cars)

In [None]:
print(f"Length before removing null values: {len(clean_cars)}")

# Rows without a model are dropped because the car model is essential for predicting price.
# Missing year / odometer values are hard to fill and rare, so those rows are dropped too.
clean_cars = clean_cars.dropna(subset=["model", "entry_year", "odometer"])

# Columns with few nulls are merged into their most common category;
# the others get their own "unknown" group.
fill_values = {
    "manufacturer": "unknown",
    "condition": "unknown",
    "cylinders": "unknown",
    "fuel": "gas",
    "vehicle_status": "clean",
    "transmission": "automatic",
    "drive": "unknown",
    "vehicle_size": "unknown",
    "vehicle_type": "unknown",
}
for column, filler in fill_values.items():
    clean_cars[column] = clean_cars[column].fillna(filler)

print(f"Length after removing null values: {len(clean_cars)}")

In [None]:
# Re-check the per-column null counts after the drop / fill step.
print_null_values_count_per_column(clean_cars)

### Changing string columns to numerical columns where possible

In [None]:
# Ordered categorical columns become integers; -1 stands for the "unknown" filler.
# NOTE: any value missing from a mapping becomes NaN, and re-running this cell on
# already-encoded data would therefore wipe the column.
ordinal_encodings = {
    "condition": {
        "unknown": -1,
        "salvage": 0,
        "fair": 1,
        "good": 2,
        "excellent": 3,
        "like new": 4,
        "new": 5,
    },
    "cylinders": {
        "unknown": -1,
        "other": 0,
        "3 cylinders": 3,
        "4 cylinders": 4,
        "5 cylinders": 5,
        "6 cylinders": 6,
        "8 cylinders": 8,
        "10 cylinders": 10,
        "12 cylinders": 12,
    },
    "vehicle_size": {
        "unknown": -1,
        "sub-compact": 0,
        "compact": 1,
        "mid-size": 2,
        "full-size": 3,
    },
}
for column, encoding in ordinal_encodings.items():
    clean_cars[column] = clean_cars[column].map(encoding)

# The remaining numeric columns are stored as plain integers.
for column in ("price", "entry_year", "odometer"):
    clean_cars[column] = clean_cars[column].astype(int)

## Removing outliers

In [None]:
# Keep only listings priced in [1500$, 1M$): very large prices mess with the histogram below.
# The rows are filtered out directly. Assigning a filtered Series back to `.price` would
# only blank the out-of-range prices to NaN (and turn the column into floats) while the
# rows themselves stayed in the frame.
price_in_range = (clean_cars.price >= 1500) & (clean_cars.price < 1000000)
no_outliers = clean_cars[price_in_range].copy()

In [None]:
# One histogram per column that could contain outliers.
# Two vertical lines on each plot mark mean +/- 3 standard deviations, the bounds
# used for eliminating outliers.
columns_used_for_checking_outliers = ["price", "entry_year", "odometer"]

fig, axes = plt.subplots(3, 1, figsize=(14, 10))
fig.subplots_adjust(hspace=0.9, wspace=0.2)
axes = axes.flatten()

for ax, column_name in zip(axes, columns_used_for_checking_outliers):
    column_values = no_outliers[column_name]
    ax.hist(column_values, bins=75, rwidth=0.8)

    mean = column_values.mean()
    standard_deviation = column_values.std()
    for bound in (mean - (3 * standard_deviation), mean + (3 * standard_deviation)):
        ax.axvline(x=bound, color='b')

    ax.set_xlabel(column_name)
    ax.set_ylabel("frequency")
    if column_name == "entry_year":
        ax.set_title(f"Distribution of {column_name}")
    else:
        # Every column except entry_year is drawn with a logarithmic frequency axis.
        ax.set_yscale("log")
        ax.set_title(f"Distribution of {column_name} (logarithmic scale)")

# Applies to the current (last) axes only.
plt.ticklabel_format(style='plain', axis='x')
plt.show()

In [None]:
print(f"Length before removing outliers: {len(clean_cars)}\n")

columns_used_for_removing_outliers = ["price", "entry_year", "odometer"]

# Columns are processed one after another, so each column's mean / std is computed on the
# rows that survived the previous column's filter.
for column_name in columns_used_for_removing_outliers:
    column_values = no_outliers[column_name]
    mean = column_values.mean()
    standard_deviation = column_values.std()

    lower_bound = mean - (3 * standard_deviation)
    upper_bound = mean + (3 * standard_deviation)

    outside_bounds = (column_values < lower_bound) | (column_values > upper_bound)
    percentage_removed = round((outside_bounds.sum() / len(no_outliers)) * 100, 2)

    print(f"For column {column_name}, removing a percentage of {percentage_removed}% values.")
    # `between` is inclusive on both ends and False for NaN.
    no_outliers = no_outliers[column_values.between(lower_bound, upper_bound)]

print(f"\nLength after removing outliers: {len(no_outliers)}")

In [None]:
# Same three distributions again, now on the outlier-filtered data and on a linear scale.
columns_used_for_checking_outliers = ["price", "entry_year", "odometer"]

fig, axes = plt.subplots(3, 1, figsize=(14, 10))
fig.subplots_adjust(hspace=0.9, wspace=0.2)
axes = axes.flatten()

for ax, column_name in zip(axes, columns_used_for_checking_outliers):
    ax.hist(no_outliers[column_name], bins=25, rwidth=0.8)
    ax.set_xlabel(column_name)
    ax.set_ylabel("frequency")
    ax.set_title(f"Distribution of {column_name}")

# Applies to the current (last) axes only.
plt.ticklabel_format(style='plain', axis='x')
plt.show()

### Erasing models that don't appear often #1

In [None]:
print(f"Length before removing models that don't appear often: {len(no_outliers)}\n")

# This filter runs twice. Normally it would only run once the dataset is completely cleaned,
# to remove very rare models. The cells below take a long time to complete, though, so a
# first pass is done here to shrink the data, and a second pass follows later.
model_counts = no_outliers.model.value_counts()
values_to_keep = model_counts.loc[model_counts.ge(10)].index
no_outliers = no_outliers.loc[no_outliers.model.isin(values_to_keep)]

print(f"Length after removing models that don't appear often: {len(no_outliers)}\n")
no_outliers.model.value_counts()

### Erase price outliers for each car model

In [None]:
print(f"Length before removing outliers for each car model: {len(no_outliers)}\n")

# Group the rows by car model so that price outliers can be judged per model.
car_model_groups = no_outliers.groupby('model')

def remove_outliers(group):
    """Return the rows of `group` whose price lies within two standard deviations of the mean.

    Both bounds are inclusive. The mean and the (sample) standard deviation are
    computed from `group['price']` itself. When the standard deviation is NaN
    (e.g. a single-row group) no row satisfies the bounds and an empty frame is returned.
    """
    prices = group['price']
    center = prices.mean()
    spread = prices.std()

    within_bounds = prices.between(center - (2 * spread), center + (2 * spread))
    return group[within_bounds]

# Filter every model group and stitch the survivors back together with a fresh 0..n-1 index.
# Iterating the groups explicitly (instead of `groupby.apply`) keeps the "model" column in
# every sub-frame on all pandas versions: `apply` operating on the grouping column is
# deprecated, and once groupings are excluded `reset_index(drop=True)` would throw the
# model away. Groups are visited in sorted key order, so the resulting row order is the
# same as with `apply`.
filtered_groups = [remove_outliers(group) for _, group in car_model_groups]
if filtered_groups:
    no_outliers = pd.concat(filtered_groups, ignore_index=True)
else:
    # No groups at all (empty input): keep the columns, drop nothing.
    no_outliers = no_outliers.iloc[0:0].reset_index(drop=True)

print(f"Length after removing outliers for each car model: {len(no_outliers)}\n")

### Eliminating better cars that are cheaper

In [None]:
print(f"Length before removing better cars that are cheaper: {len(no_outliers)}\n")
final_df = no_outliers.copy()

# These helper columns are needed because a car with condition 0 would normally be considered
# better than a car with condition -1, but condition -1 (unknown) is as good as, if not usually
# better than, 0 (salvage). Requiring equal "*_unknown" flags means unknown values are only
# ever compared with other unknown values.
# Once all better cars that are cheaper are erased, these columns are dropped again to
# prevent multicollinearity problems.
final_df["condition_unknown"] = np.where(final_df["condition"] == -1, 1, 0)
final_df["cylinders_unknown"] = np.where(final_df["cylinders"] == -1, 1, 0)
final_df["vehicle_size_unknown"] = np.where(final_df["vehicle_size"] == -1, 1, 0)

# Row labels of every car to remove, collected across all models and dropped once at the end.
# Relies on final_df having unique row labels (it comes from a frame with a reset index).
to_be_deleted = set()

# A single groupby pass replaces re-scanning the whole frame for every model; comparisons
# only ever happen between cars of the same model, so groups are independent of each other.
cars_by_model = final_df.groupby("model", sort=False)
for model_name, model_cars in tqdm(cars_by_model, total=cars_by_model.ngroups):
    # Walk from the most expensive car down.
    curr_cars = model_cars.sort_values(by='price', ascending=False)
    for car_index, car in curr_cars.iterrows():
        # A car already marked for deletion is not used as a reference.
        if car_index in to_be_deleted:
            continue
        # Other cars of this model that are at least as good on every attribute
        # (newer or equal year, lower or equal odometer, ...) yet cost no more than `car`.
        better_cheaper_cars = curr_cars[
            (curr_cars.manufacturer == car.manufacturer) &
            (curr_cars.entry_year >= car.entry_year) &
            (curr_cars.condition >= car.condition) &
            (curr_cars.condition_unknown == car.condition_unknown) &
            (curr_cars.cylinders >= car.cylinders) &
            (curr_cars.cylinders_unknown == car.cylinders_unknown) &
            (curr_cars.fuel == car.fuel) &
            (curr_cars.odometer <= car.odometer) &
            (curr_cars.vehicle_status == car.vehicle_status) &
            (curr_cars.transmission == car.transmission) &
            (curr_cars.drive == car.drive) &
            (curr_cars.vehicle_size >= car.vehicle_size) &
            (curr_cars.vehicle_size_unknown == car.vehicle_size_unknown) &
            (curr_cars.vehicle_type == car.vehicle_type) &
            (curr_cars.price <= car.price) &
            (curr_cars.index != car_index)
        ]

        to_be_deleted.update(better_cheaper_cars.index)

final_df = final_df.drop(index=list(to_be_deleted))

final_df = final_df.drop(["condition_unknown", "cylinders_unknown", "vehicle_size_unknown"], axis=1)
print(f"\nLength after removing better cars that are cheaper: {len(final_df)}")
final_df.head()

### Erasing models that don't appear often #2

In [None]:
print(f"Length before removing models that don't appear often: {len(final_df)}\n")

# Second pass of the rare-model filter: keep only models that still have at least 10 listings.
model_counts = final_df.model.value_counts()
values_to_keep = model_counts.loc[model_counts.ge(10)].index
final_df = final_df.loc[final_df.model.isin(values_to_keep)]

print(f"Length after removing models that don't appear often: {len(final_df)}\n")
final_df.model.value_counts()

### Correlation heatmap, used to remove irrelevant columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the label encoding used for the heatmap never leaks into final_df.
heatmap_df = final_df.copy()

# Label encode the categorical columns. The already-numeric ordinal columns are
# re-encoded too, which maps their values to consecutive integer ranks.
label_encoder = LabelEncoder()
string_columns = ['manufacturer', 'model', "condition", "cylinders", 'fuel', 'vehicle_status', 'transmission', 'drive', "vehicle_size", 'vehicle_type']
for column in string_columns:
    heatmap_df[column] = label_encoder.fit_transform(heatmap_df[column])

correlation_matrix = heatmap_df.corr()
axis_labels = correlation_matrix.columns

plt.figure(figsize=(10, 8))
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='nearest')
plt.colorbar()

# Write each coefficient into its cell (x is the column position, y the row position).
for (row, column), coefficient in np.ndenumerate(correlation_matrix.to_numpy()):
    plt.text(column, row, f"{coefficient:.2f}", ha='center', va='center', color='white', fontsize=8)

plt.xticks(range(len(axis_labels)), axis_labels, rotation=90)
plt.yticks(range(len(correlation_matrix.index)), correlation_matrix.index)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

### One hot encoding

In [None]:
# One-hot encode the nominal categorical columns. The first level of each column is
# dropped (drop_first=True), so it is represented by all of its dummies being zero.
columns_to_encode = ["manufacturer", "model", "fuel", "vehicle_status", "transmission", "drive", "vehicle_type"]

final_df = pd.get_dummies(
    data=final_df,
    columns=columns_to_encode,
    prefix=columns_to_encode,
    drop_first=True,
)

final_df.head()

## First phase hyperparameter tuning

### Test on 1000 samples and pick the top 100 configurations

In [None]:
# Draw a reproducible *random* subsample. final_df is ordered by car model (a by-product of
# the per-model outlier removal), so head(1000) would contain only a handful of models and
# the unshuffled CV folds of GridSearchCV would be contiguous blocks of it.
first_phase_df = final_df.sample(n=min(1000, len(final_df)), random_state=42)
X = first_phase_df.drop("price", axis=1)
y = first_phase_df.price

# Candidate pipelines: every regressor is preceded by a MinMaxScaler and paired with the
# hyperparameter grid to explore ("regressor__" prefixes address the pipeline step).
models = { 
    "linear_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", linear_model.LinearRegression())
        ],
        "params": {}
    },
    "knn_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", KNeighborsRegressor())
        ],
        "params": {
            "regressor__n_neighbors": [5, 10, 25],
            "regressor__weights": ["uniform", "distance"],
            "regressor__algorithm": ["auto", "ball_tree", "kd_tree"],
            "regressor__leaf_size": [20, 30, 40],
            "regressor__p": [1, 2, 3]
        }
    },
    "suppor_vector_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", SVR())
        ],
        "params": {
            "regressor__kernel": ["linear", "poly", "rbf", "sigmoid"],
            "regressor__C": [0.1, 1, 10, 100],
            "regressor__epsilon": [0.1, 0.01, 0.001],
            "regressor__gamma": ["scale", "auto", 0.1, 1],
            "regressor__degree": [2, 3]
        }
    },
    "random_forest_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", RandomForestRegressor())
        ],
        "params": {
            "regressor__n_estimators": [50, 100, 250, 500],
            "regressor__max_depth": [None, 5, 10, 20],
            "regressor__min_samples_split": [2, 5, 10],
            "regressor__min_samples_leaf": [1, 2, 5],
            "regressor__max_features": [1.0, 'sqrt', 'log2']
        }
    },
    "gradient_boosting_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", GradientBoostingRegressor())
        ],
        "params": {
            "regressor__learning_rate": [0.1, 0.01, 0.001],
            "regressor__n_estimators": [50, 100, 250, 500],
            "regressor__max_depth": [None, 5, 10, 20],
            "regressor__min_samples_split": [2, 5, 10],
            "regressor__min_samples_leaf": [1, 2, 5],
            "regressor__max_features": [1.0, 'sqrt', 'log2']
        }
    }
}

# One record per (model, hyperparameter configuration) with its mean cross-validated score.
scores = []

for model_name, options in tqdm(models.items(), desc="Hyperparameter Tuning"):
    pipeline = Pipeline(options["steps"])
    grid_search = GridSearchCV(pipeline, options["params"], cv=5, return_train_score=False, verbose = 4)
    
    grid_search.fit(X, y)

    score_results = grid_search.cv_results_['mean_test_score']
    params_results = grid_search.cv_results_['params']
    
    for score, params in zip(score_results, params_results):
        scores.append({
            'model': model_name,
            'score': score,
            'params': params
        })

# Best configurations first.
scores_df = pd.DataFrame(scores, columns=['model', 'score', 'params'])
scores_df = scores_df.sort_values('score', ascending=False)
scores_df.reset_index(drop=True, inplace=True)
scores_df

In [None]:
# scores_df is sorted by score in descending order, so these are the 100 best configurations.
scores_df.head(100)

## Second phase hyperparameter tuning

### Test winners from phase 1 on 10000 samples and pick the top 5 configurations

In [None]:
# Draw a reproducible *random* subsample. final_df is ordered by car model (a by-product of
# the per-model outlier removal), so head(10000) would cover only the first models and the
# unshuffled CV folds of GridSearchCV would be contiguous blocks of it.
second_phase_df = final_df.sample(n=min(10000, len(final_df)), random_state=42)
X = second_phase_df.drop("price", axis=1)
y = second_phase_df.price

# Narrowed grid around the gradient boosting configurations carried over from phase 1.
models = { 
    "gradient_boosting_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", GradientBoostingRegressor())
        ],
        "params": {
            "regressor__learning_rate": [0.1],
            "regressor__n_estimators": [100, 250, 500],
            "regressor__max_depth": [None, 5, 10, 20],
            "regressor__min_samples_split": [2, 5, 10],
            "regressor__min_samples_leaf": [1, 2, 5],
            "regressor__max_features": ['sqrt', 'log2']
        }
    }
}

# One record per (model, hyperparameter configuration) with its mean cross-validated score.
scores = []

for model_name, options in tqdm(models.items(), desc="Hyperparameter Tuning"):
    pipeline = Pipeline(options["steps"])
    grid_search = GridSearchCV(pipeline, options["params"], cv=5, return_train_score=False, verbose = 4)
    
    grid_search.fit(X, y)

    score_results = grid_search.cv_results_['mean_test_score']
    params_results = grid_search.cv_results_['params']
    
    for score, params in zip(score_results, params_results):
        scores.append({
            'model': model_name,
            'score': score,
            'params': params
        })

# Best configurations first.
scores_df = pd.DataFrame(scores, columns=['model', 'score', 'params'])
scores_df = scores_df.sort_values('score', ascending=False)
scores_df.reset_index(drop=True, inplace=True)
scores_df

## Final phase hyperparameter tuning

### Test winners from phase 2 on all samples and pick the best model

In [None]:
# Use every row, but in a reproducible random order. With an integer `cv`, GridSearchCV
# splits a regression problem with an unshuffled KFold; on final_df, which is ordered by car
# model, each validation fold would consist of models that never occur in its training folds.
final_phase_df = final_df.sample(frac=1, random_state=42)
X = final_phase_df.drop("price", axis=1)
y = final_phase_df.price

# Last remaining candidates from phase 2.
models = { 
    "gradient_boosting_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", GradientBoostingRegressor())
        ],
        "params": {
            "regressor__learning_rate": [0.1],
            "regressor__n_estimators": [500],
            "regressor__max_depth": [None],
            "regressor__min_samples_split": [2, 10],
            "regressor__min_samples_leaf": [2, 5],
            "regressor__max_features": ['sqrt']
        }
    }
}

# One record per (model, hyperparameter configuration) with its mean cross-validated score.
scores = []

for model_name, options in tqdm(models.items(), desc="Hyperparameter Tuning"):
    pipeline = Pipeline(options["steps"])
    grid_search = GridSearchCV(pipeline, options["params"], cv=3, return_train_score=False, verbose = 4)
    
    grid_search.fit(X, y)

    score_results = grid_search.cv_results_['mean_test_score']
    params_results = grid_search.cv_results_['params']
    
    for score, params in zip(score_results, params_results):
        scores.append({
            'model': model_name,
            'score': score,
            'params': params
        })

# Best configurations first.
scores_df = pd.DataFrame(scores, columns=['model', 'score', 'params'])
scores_df = scores_df.sort_values('score', ascending=False)
scores_df.reset_index(drop=True, inplace=True)
scores_df

### Training final model and calculating score

In [None]:
X = final_df.drop(columns="price")
y = final_df.price

# Hold out 20 % of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient boosting regressor using one configuration from the final tuning grid.
best_params = dict(
    learning_rate=0.1,
    max_depth=None,
    max_features="sqrt",
    min_samples_leaf=2,
    min_samples_split=10,
    n_estimators=500,
)
model = GradientBoostingRegressor(**best_params)
model.fit(X_train, y_train)

# R^2 of the fitted model on the held-out rows.
score = model.score(X_test, y_test)
score

In [None]:
# Residual = actual price - predicted price, for every held-out row.
y_pred = model.predict(X_test)
residuals = y_test - y_pred

# Residuals against predictions; the dashed red line marks a perfect prediction.
fig, ax = plt.subplots()
ax.scatter(y_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel("Predicted values")
ax.set_ylabel("Residuals")
ax.set_title("Residual Plot")
plt.show()

In [None]:
# Number of rows in the held-out test split.
len(X_test)

In [None]:
# Put residuals and inputs on the same positional index (0..n-1): `residuals` may carry the
# original row labels of y_test, whereas df_inputs is re-indexed from zero.
df_residuals = pd.DataFrame({'residuals': np.asarray(residuals).ravel()})
df_inputs = X_test.reset_index(drop=True)

# Find rows whose absolute residual is greater than 10000
outliers_dataframe = df_residuals[abs(df_residuals['residuals']) > 10000]

# Get the corresponding inputs for the outliers. They are looked up in df_inputs;
# selecting from outliers_dataframe itself would only return the residuals again.
outliers_inputs = df_inputs.loc[outliers_dataframe.index]
outliers_inputs

## Testing neural networks

### Using Optuna to pick the best neural network for our problem

In [None]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 128

X = final_df.drop(columns="price")
y = final_df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Features become float32 tensors; targets become float32 column vectors of shape (n, 1).
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32).to(DEVICE)
    for features in (X_train_scaled, X_test_scaled)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(targets.values.reshape(-1, 1), dtype=torch.float32).to(DEVICE)
    for targets in (y_train, y_test)
)

# Training batches are reshuffled every epoch; the validation loader keeps its order.
# NOTE: the "validation" data used during the search is the held-out test split.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

valid_dataset = TensorDataset(X_test_tensor, y_test_tensor)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)


def define_model(trial):
    """Build a feed-forward regression network whose architecture is chosen by `trial`.

    The trial suggests the number of hidden layers (1-4) and, for each layer, its
    width (16-512 units) and dropout probability (0.2-0.5). Every hidden layer is
    Linear -> ReLU -> Dropout; a final Linear layer maps to a single output.
    The input width is taken from the module-level feature frame `X`.
    """
    depth = trial.suggest_int("n_layers", 1, 4)

    width_in = len(X.columns)
    modules = []
    for layer_idx in range(depth):
        width_out = trial.suggest_int(f"n_units_l{layer_idx}", 16, 512)
        hidden = [nn.Linear(width_in, width_out), nn.ReLU()]
        drop_probability = trial.suggest_float(f"dropout_l{layer_idx}", 0.2, 0.5)
        hidden.append(nn.Dropout(drop_probability))
        modules.extend(hidden)

        # The next layer consumes this layer's output.
        width_in = width_out

    modules.append(nn.Linear(width_in, 1))
    return nn.Sequential(*modules)


def objective(trial):
    """Optuna objective: train a trial-defined network and return its validation MSE.

    The trial chooses the architecture (via `define_model`), the optimizer
    (Adam / RMSprop / SGD), the learning rate (log-uniform in [1e-4, 1e-1]) and the
    number of epochs (20-200). After every epoch the MSE over `valid_loader` is
    reported to the trial so that unpromising trials can be pruned.

    Returns:
        The mean squared error over all validation samples after the last epoch.

    Raises:
        optuna.exceptions.TrialPruned: when the pruner decides to stop the trial.
    """
    # Generate the model.
    model = define_model(trial).to(DEVICE)

    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    # Get the number of epochs as a hyperparameter from Optuna.
    EPOCHS = trial.suggest_int("epochs", 20, 200)

    # Training of the model.
    for epoch in range(EPOCHS):
        model.train()
        for data, target in train_loader:
            data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)

            optimizer.zero_grad()
            output = model(data)
            loss = F.mse_loss(output, target)
            loss.backward()
            optimizer.step()

        # Validation of the model.
        model.eval()
        squared_error_sum = 0.0
        with torch.no_grad():
            for data, target in valid_loader:
                data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)
                output = model(data)
                # Accumulate the *sum* of squared errors so that dividing by the number of
                # samples below yields the true MSE. Summing per-batch means and dividing by
                # the sample count would understate it by roughly the batch size.
                squared_error_sum += F.mse_loss(output, target, reduction="sum").item()

        mse = squared_error_sum / len(valid_loader.dataset)

        trial.report(mse, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return mse


if __name__ == "__main__":
    # Run 100 trials, minimising the value returned by `objective`.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100)

    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

    print("Study statistics: ")
    trial_groups = (
        ("finished", study.trials),
        ("pruned", pruned_trials),
        ("complete", complete_trials),
    )
    for label, trials in trial_groups:
        print(f"  Number of {label} trials: ", len(trials))

    # Summary of the best trial: its objective value and the hyperparameters it used.
    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

### Testing the best neural network and comparing it with the best ML model

In [None]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 183
BATCH_SIZE = 128

# Seed torch so weight initialisation, dropout masks and batch shuffling are repeatable
# across runs of this cell (the same seed value is used for the data split below).
torch.manual_seed(42)

X = final_df.drop("price", axis=1)
y = final_df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Features become float32 tensors; targets become float32 column vectors of shape (n, 1).
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(DEVICE)
y_train_tensor = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32).to(DEVICE)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(DEVICE)
y_test_tensor = torch.tensor(y_test.values.reshape(-1, 1), dtype=torch.float32).to(DEVICE)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

input_size = len(X.columns)

# Two hidden layers (235 and 324 units), each followed by ReLU and dropout, then one output.
model = nn.Sequential(
    nn.Linear(input_size, 235),
    nn.ReLU(),
    nn.Dropout(0.306),
    nn.Linear(235, 324),
    nn.ReLU(),
    nn.Dropout(0.265),
    nn.Linear(324, 1)
).to(DEVICE)

# Define the loss function (Mean Squared Error for regression)
loss_fn = nn.MSELoss()

# Define the optimizer (RMSprop)
learning_rate = 0.0048
optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)

for epoch in range(EPOCHS):
    ### Training
    model.train()
    
    for batch_X, batch_y in train_loader:
        # Move batch data to device (GPU if available)
        batch_X, batch_y = batch_X.to(DEVICE), batch_y.to(DEVICE)

        # 1. Forward pass
        y_pred = model(batch_X)
  
        # 2. Calculate loss
        loss = loss_fn(y_pred, batch_y)

        # 3. Optimizer zero grad
        optimizer.zero_grad()

        # 4. Loss backward
        loss.backward()

        # 5. Optimizer step
        optimizer.step()

    # Training R2 of the epoch's last mini-batch. It is computed once per epoch: only this
    # last value is ever reported, and evaluating it for every batch forces a device-to-CPU
    # copy per step.
    r2 = r2_score(batch_y.cpu().detach().numpy(), y_pred.cpu().detach().numpy())

    ### Testing
    model.eval()
    with torch.no_grad():
        # 1. Forward pass
        test_pred = model(X_test_tensor)
        # 2. Calculate test loss
        test_loss = loss_fn(test_pred, y_test_tensor)
        test_r2 = r2_score(y_test_tensor.cpu().detach().numpy(), test_pred.cpu().detach().numpy())

    # Print out what's happening every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.1f} | R2: {r2:.4f} | Test loss: {test_loss:.1f} | Test R2: {test_r2:.4f}")

In [None]:
# Predict on the test set in inference mode (dropout disabled, no gradient tracking).
model.eval()
with torch.no_grad():
    test_pred = model(X_test_tensor).cpu().detach().numpy()

# Residual = actual price - predicted price, flattened to one value per test row.
residuals = (y_test_tensor.cpu().detach().numpy() - test_pred).flatten()

# Residuals against predictions; the dashed red line marks a perfect prediction.
fig, ax = plt.subplots()
ax.scatter(test_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel("Predicted values")
ax.set_ylabel("Residuals")
ax.set_title("Residual Plot")
plt.show()

In [None]:
# Put residuals and inputs on the same positional index (0..n-1) so that a row label
# selected from one frame addresses the same test sample in the other.
df_residuals = pd.DataFrame({'residuals': np.asarray(residuals).ravel()})
df_inputs = X_test.reset_index(drop=True)

# Find rows whose absolute residual is greater than 10000
outliers_dataframe = df_residuals[abs(df_residuals['residuals']) > 10000]

# Get the corresponding inputs for the outliers. They are looked up in df_inputs;
# selecting from outliers_dataframe itself would only return the residuals again.
outliers_inputs = df_inputs.loc[outliers_dataframe.index]
outliers_inputs

### Exporting the neural network and the scaler

In [None]:
# Persist the network weights (state dict only) and the fitted feature scaler.
model_weights_path = "used_cars_price_prediction_neural_network.pt"
scaler_path = "used_cars_scaler.pkl"

torch.save(model.state_dict(), model_weights_path)
joblib.dump(scaler, scaler_path)