In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [2]:
# Load the raw listings and write a copy without the url, region_url,
# image_url and description columns to ./used_cars.csv.
cars = pd.read_csv("./vehicles.csv")
text_and_url_columns = ["url", "region_url", "image_url", "description"]
useless_removed = cars.drop(columns=text_and_url_columns)
useless_removed.to_csv("./used_cars.csv")

In [None]:
# Reload the raw listings and rename four columns to more descriptive names
# in a single chained expression.
column_renames = {
    "year": "entry_year",
    "title_status": "vehicle_status",
    "size": "vehicle_size",
    "type": "vehicle_type",
}
cars = pd.read_csv("./vehicles.csv").rename(columns=column_renames)
cars.head()

## Cleaning the data
### Removing duplicates and irrelevant columns

In [None]:
# Columns that are not used as features further down in the notebook.
irrelevant_columns = [
    "id", "url", "region", "region_url", "VIN", "image_url",
    "description", "county", "state", "lat", "long", "posting_date",
]

print(f"Length before removing duplicates: {len(cars)}")

# Duplicates are detected only after the columns above are gone, so two
# listings that differ solely in one of those columns count as duplicates.
clean_cars = cars.drop(columns=irrelevant_columns).drop_duplicates()

print(f"Length after removing duplicates: {len(clean_cars)}")

### Dealing with missing values

In [None]:
def print_null_values_count_per_column(dataframe):
    """Print one "nulls in <column>: <count>" line per column of `dataframe`.

    Lines are separated by ",\n"; the final line carries no trailing comma.
    An empty frame prints a single empty line.
    """
    report_lines = [
        f"nulls in {column}: {dataframe[column].isnull().sum()}"
        for column in dataframe.columns
    ]
    print(",\n".join(report_lines))

# Report how many null values each column has after de-duplication.
print_null_values_count_per_column(clean_cars)

In [None]:
# The original labels here were copy-pasted from the "same cars different
# price" step; this cell only handles missing values, so say so.
print(f"Length before handling missing values: {len(clean_cars)}")

# Rows are dropped when any of these columns is null:
# - model: essential for predicting price
# - entry_year / odometer: difficult to fill, and there are few of them
no_nulls = clean_cars.dropna(subset=["model", "entry_year", "odometer"])

# For columns with few null values, merge them into the most common category
# (the author's choice: gas / clean / automatic); otherwise place them in
# their own "unknown" group.
fill_values = {
    "manufacturer": "unknown",
    "condition": "unknown",
    "cylinders": "unknown",
    "fuel": "gas",
    "vehicle_status": "clean",
    "transmission": "automatic",
    "drive": "unknown",
    "vehicle_size": "unknown",
    "vehicle_type": "unknown",
    "paint_color": "unknown",
}
no_nulls = no_nulls.fillna(fill_values)

print(f"Length after handling missing values: {len(no_nulls)}")

In [None]:
# Re-run the null report on the filled frame to check what is left.
print_null_values_count_per_column(no_nulls)

### Removing all rows that describe the same car but with a different price

In [None]:
print(f"Length before removing same cars different price: {len(no_nulls)}")

# keep=False flags every member of a group of rows that are identical once
# price is ignored, so none of the conflicting listings survives.
identical_except_price = no_nulls.drop(columns="price").duplicated(keep=False)
rows_to_remove = no_nulls.index[identical_except_price]
no_nulls = no_nulls.drop(index=rows_to_remove)

print(f"Length after removing same cars different price: {len(no_nulls)}")

## Removing outliers

In [None]:
# Keep only listings priced in [500, 1_000_000): very large prices mess with
# the histogram below, and prices under 500 are dropped as well.
# The rows themselves are filtered out. Assigning a filtered Series back to
# the price column would align on the index and leave NaN prices behind
# instead of removing anything.
price_in_range = (no_nulls.price >= 500) & (no_nulls.price < 1000000)
no_outliers = no_nulls[price_in_range].copy()

In [None]:
# Create a histogram of every column that could have outliers to see which ones have outliers
# Alongside there will be plotted 2 vertical lines representing the bounds for eliminating outliers
# Histogram of every column that could have outliers, with two vertical lines
# marking the mean +/- 3 standard deviations bounds used to eliminate them.
columns_used_for_checking_outliers = ["price", "entry_year", "odometer"]

fig, axes = plt.subplots(len(columns_used_for_checking_outliers), 1, figsize=(14, 10), squeeze=False)
fig.subplots_adjust(hspace=0.9, wspace=0.2)
axes = axes.flatten()

for ax, column_name in zip(axes, columns_used_for_checking_outliers):
    ax.hist(no_outliers[column_name], bins=75, rwidth=0.8)

    mean = no_outliers[column_name].mean()
    standard_deviation = no_outliers[column_name].std()

    lower_bound = mean - (3 * standard_deviation)
    upper_bound = mean + (3 * standard_deviation)

    ax.axvline(x=lower_bound, color='b')
    ax.axvline(x=upper_bound, color='b')

    ax.set_xlabel(column_name)
    ax.set_ylabel("frequency")
    ax.set_title(f"Distribution of {column_name}")
    # plt.ticklabel_format only touches the current (last) axes, so plain
    # tick labels are requested on every panel explicitly.
    ax.ticklabel_format(style='plain', axis='x')
    if column_name != "entry_year":
        # price and odometer are heavily skewed; a log y-axis keeps the
        # sparse tails visible
        ax.set_yscale("log")
        ax.set_title(f"Distribution of {column_name} (logarithmic scale)")

plt.show()

In [None]:
print(f"Length before removing outliers: {len(no_nulls)}\n")

columns_used_for_removing_outliers = ["price", "entry_year", "odometer"]

# Columns are processed one after another: the bounds for a column are
# computed on the rows that survived the previous columns' filters.
for column_name in columns_used_for_removing_outliers:
    column_values = no_outliers[column_name]
    mean = column_values.mean()
    standard_deviation = column_values.std()

    lower_bound = mean - (3 * standard_deviation)
    upper_bound = mean + (3 * standard_deviation)

    # NaN compares False on both sides, so rows with a missing value are not
    # kept. Deriving the reported percentage from the same mask guarantees it
    # matches the rows that are actually dropped.
    keep_mask = (lower_bound <= column_values) & (column_values <= upper_bound)
    percentage_removed = round(((~keep_mask).sum() / len(no_outliers)) * 100, 2)

    print(f"For column {column_name}, removing a percentage of {percentage_removed}% values.")
    no_outliers = no_outliers[keep_mask]

### Erasing models that don't appear often

In [None]:
# Keep only the models that have at least 10 listings.
model_counts = no_outliers.model.value_counts()
values_to_keep = model_counts[model_counts.ge(10)].index
is_frequent_model = no_outliers.model.isin(values_to_keep)
no_outliers = no_outliers.loc[is_frequent_model]

# Listings per remaining model.
no_outliers.model.value_counts()

In [None]:
# Within each model, drop listings whose price lies more than 2 standard
# deviations from that model's mean price.
# A groupby transform computes every model's bounds in one pass instead of
# re-scanning the whole frame once per model. Each model's statistics only
# depend on its own rows, so the result matches the per-model loop.
price_by_model = no_outliers.groupby("model")["price"]
model_price_mean = price_by_model.transform("mean")
model_price_std = price_by_model.transform("std")

model_lower_bounds = model_price_mean - (2 * model_price_std)
model_upper_bounds = model_price_mean + (2 * model_price_std)

# Comparisons against a NaN bound are False, so such rows are kept.
is_price_outlier = (no_outliers.price < model_lower_bounds) | (no_outliers.price > model_upper_bounds)
no_outliers = no_outliers[~is_price_outlier]

print(f"\nLength after removing outliers: {len(no_outliers)}")

In [None]:
# Same three histograms as before, now on the data with outliers removed.
columns_used_for_checking_outliers = ["price", "entry_year", "odometer"]

fig, axes = plt.subplots(len(columns_used_for_checking_outliers), 1, figsize=(14, 10), squeeze=False)
fig.subplots_adjust(hspace=0.9, wspace=0.2)
axes = axes.flatten()

for ax, column_name in zip(axes, columns_used_for_checking_outliers):
    ax.hist(no_outliers[column_name], bins=25, rwidth=0.8)

    ax.set_xlabel(column_name)
    ax.set_ylabel("frequency")
    ax.set_title(f"Distribution of {column_name}")
    # plt.ticklabel_format only touches the current (last) axes, so plain
    # tick labels are requested on every panel explicitly.
    ax.ticklabel_format(style='plain', axis='x')

plt.show()

## Feature engineering

### Changing string columns to numerical columns where possible

In [None]:
final_df = no_outliers.copy()

# Ordered categories are mapped to integers; "unknown" gets -1 and is
# additionally flagged in a separate <column>_unknown indicator column.
ordinal_encodings = {
    "condition": {
        "unknown": -1,
        "salvage": 0,
        "fair": 1,
        "good": 2,
        "excellent": 3,
        "like new": 4,
        "new": 5
    },
    "cylinders": {
        "unknown": -1,
        "other": 0,
        "3 cylinders": 3,
        "4 cylinders": 4,
        "5 cylinders": 5,
        "6 cylinders": 6,
        "8 cylinders": 8,
        "10 cylinders": 10,
        "12 cylinders": 12
    },
    "vehicle_size": {
        "unknown": -1,
        "sub-compact": 0,
        "compact": 1,
        "mid-size": 2,
        "full-size": 3
    },
}

for ordinal_column, encoding in ordinal_encodings.items():
    # The indicator must be computed BEFORE the column is mapped to numbers:
    # once mapped, no value equals the string "unknown" and the flag would be
    # 0 for every row.
    final_df[f"{ordinal_column}_unknown"] = (final_df[ordinal_column] == "unknown").astype(int)
    final_df[ordinal_column] = final_df[ordinal_column].map(encoding)

final_df = final_df.astype({"price": int, "entry_year": int, "odometer": int})

final_df.head()

### One hot encoding

In [None]:
# Nominal (unordered) columns are one-hot encoded. drop_first=True removes
# the first level of each column, which avoids perfectly collinear dummies.
columns_to_encode = [
    "manufacturer",
    "model",
    "fuel",
    "vehicle_status",
    "transmission",
    "drive",
    "vehicle_type",
    "paint_color",
]

final_df = pd.get_dummies(
    final_df,
    columns=columns_to_encode,
    prefix=columns_to_encode,
    drop_first=True,
)

final_df.head()

In [None]:
X = final_df.drop("price", axis=1)
y = final_df.price

# Fixed seed for the stochastic regressors so that the grid-search scores are
# reproducible between runs (same seed as the later train/test split).
GRID_SEARCH_RANDOM_STATE = 42

# One entry per candidate model: the pipeline steps and the hyper-parameter
# grid explored by GridSearchCV ("regressor__" addresses the pipeline step).
models = {
    "linear_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", linear_model.LinearRegression())
        ],
        "params": {}
    },
    "knn_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", KNeighborsRegressor())
        ],
        "params": {
            "regressor__n_neighbors": [3, 5, 7],
            "regressor__weights": ["uniform", "distance"],
            "regressor__algorithm": ["auto", "ball_tree", "kd_tree"],
            "regressor__leaf_size": [20, 30, 40],
            "regressor__p": [1, 2, 3]
        }
    },
    "support_vector_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", SVR())
        ],
        "params": {
            "regressor__kernel": ["linear", "poly", "rbf", "sigmoid"],
            "regressor__C": [0.1, 1, 10],
            "regressor__epsilon": [0.1, 0.01],
            "regressor__gamma": ["scale", "auto", 0.1, 1],
            "regressor__degree": [2, 3]
        }
    },
    "random_forest_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", RandomForestRegressor(random_state=GRID_SEARCH_RANDOM_STATE))
        ],
        "params": {
            "regressor__n_estimators": [50, 100, 200],
            "regressor__max_depth": [None, 5, 10],
            "regressor__min_samples_split": [2, 5],
            "regressor__min_samples_leaf": [1, 2, 4],
            "regressor__max_features": [1.0, 'sqrt', 'log2']
        }
    },
    "gradient_boosting_regression": {
        "steps": [
            ("scaler", MinMaxScaler()),
            ("regressor", GradientBoostingRegressor(random_state=GRID_SEARCH_RANDOM_STATE))
        ],
        "params": {
            "regressor__learning_rate": [0.1, 0.01, 0.001],
            "regressor__n_estimators": [50, 100, 200],
            "regressor__max_depth": [None, 5, 10],
            "regressor__min_samples_split": [2, 5],
            "regressor__min_samples_leaf": [1, 2, 4],
            "regressor__max_features": [1.0, 'sqrt', 'log2']
        }
    }
}

# One record per (model, parameter combination) with its mean CV test score.
scores = []

for model_name, options in models.items():
    print(f"checking model {model_name}")
    pipeline = Pipeline(options["steps"])
    grid_search = GridSearchCV(pipeline, options["params"], cv=5, return_train_score=False, verbose = 4)

    grid_search.fit(X, y)

    score_results = grid_search.cv_results_['mean_test_score']
    params_results = grid_search.cv_results_['params']

    scores.extend(
        {'model': model_name, 'score': score, 'params': params}
        for score, params in zip(score_results, params_results)
    )

# Best-scoring combinations first, with a fresh 0..n-1 index.
scores_df = (
    pd.DataFrame(scores, columns=['model', 'score', 'params'])
    .sort_values('score', ascending=False)
    .reset_index(drop=True)
)
scores_df

In [None]:
# Show the first 25 rows of the grid-search results table.
scores_df.head(25)

In [None]:


X = final_df.drop("price", axis=1)
y = final_df.price

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a gradient boosting model with fixed hyper-parameters.
# random_state pins the estimator's internal randomness so the fitted model
# and its score are reproducible across runs.
model = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=5,
    max_features=1.0,
    min_samples_leaf=4,
    min_samples_split=5,
    n_estimators=100,
    random_state=42,
)

# Train the model using the training data
model.fit(X_train, y_train)

# Evaluate the model on the testing data (R^2 for scikit-learn regressors)
score = model.score(X_test, y_test)

In [None]:
# Display the score the fitted model obtained on the held-out test set.
score

In [None]:
# Residual = actual price minus predicted price, on the held-out test set.
y_pred = model.predict(X_test)
residuals = y_test - y_pred

# Scatter the residuals against the predictions; the dashed red line marks
# a residual of zero.
fig, ax = plt.subplots()
ax.scatter(y_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel("Predicted values")
ax.set_ylabel("Residuals")
ax.set_title("Residual Plot")
plt.show()

In [None]:
# residuals keeps the index labels of y_test / X_test, so rows can be looked
# up in X_test (and final_df) by label.
df_residuals = pd.DataFrame({'residuals': residuals})  # Create a DataFrame with residuals

# Positional copy of X_test. Its index is reset, so it is NOT aligned with
# df_residuals and must not be used for label-based lookups.
df_inputs = X_test.reset_index(drop=True)

# Find rows whose absolute residual is greater than 20000
outliers_dataframe = df_residuals[df_residuals['residuals'].abs() > 20000]

# Get the corresponding inputs for outliers. Selecting from X_test by label
# returns the feature rows; re-indexing the residual frame by its own index
# would only return the residuals again.
outliers_inputs = X_test.loc[outliers_dataframe.index]

In [None]:
# Display the rows selected for the large-residual predictions.
outliers_inputs

In [None]:
# Inspect the encoded row with index label 38903.
# NOTE(review): the label is hard-coded, presumably one of the large-residual listings; confirm.
final_df.loc[38903]

In [None]:
# Predict the price for the listing with index label 38903.
# .loc[[label]] returns a one-row DataFrame, so the model receives the same
# column names and dtypes it was fitted with. A list wrapping a Series loses
# the feature names and makes scikit-learn warn about them.
model.predict(final_df.loc[[38903]].drop(columns="price"))

In [None]:
# Show the same listing (index label 38903) in no_outliers, i.e. before the
# ordinal and one-hot encoding steps were applied.
no_outliers.loc[38903]