## Install the required modules

In [None]:
!pip3 install -r requirements.txt -q

## Import the data

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

directory_path = '/Users/brendanashton/dev/go/src/github.com/deasa/pinkbike_crawler/runs'

# Collect the CSV files in the directory. os.listdir() returns entries in
# arbitrary order, so the names are sorted: the row order feeds
# drop_duplicates(keep='first') and the seeded train/test split later on and
# has to be the same on every run.
csv_filenames = sorted(
    filename for filename in os.listdir(directory_path)
    if filename.endswith('.csv')
)
if not csv_filenames:
    # pd.concat() on an empty list only reports "No objects to concatenate"
    raise FileNotFoundError(f"No .csv files found in {directory_path!r}")

# Read every CSV file into its own DataFrame
dfs = [
    pd.read_csv(os.path.join(directory_path, filename))
    for filename in csv_filenames
]

# Concatenate all DataFrames into a single DataFrame
dataset = pd.concat(dfs, ignore_index=True)

dataset.head()
dataset.info()

## Data preprocessing

### Drop unnecessary columns

In [None]:
# Remove the columns this notebook does not use any further
dataset = dataset.drop(
    ['Title', 'Frame Size', 'Reason for Review', 'URL', 'Original Currency'],
    axis=1,
)
dataset.head()
dataset.info()

#### Clean the year - some are missing and some are strangely high

In [None]:
current_year = pd.Timestamp.now().year  # not referenced again in this cell
min_year = 2000  # lower bound of the accepted year range
max_year = pd.Timestamp.today().year  # upper bound: the current calendar year

# Mean of the years inside [min_year, max_year]. It is computed before the
# replacements below, so out-of-range values do not influence it; the result
# is a float and may be fractional.
mean_year = dataset[(dataset['Year'] >= min_year) & (dataset['Year'] <= max_year)]['Year'].mean()

# Years below min_year are set to NaN (they are not imputed)
dataset.loc[(dataset['Year'] < min_year), 'Year'] = np.nan

# Years above max_year are replaced with the in-range mean
dataset.loc[(dataset['Year'] > max_year), 'Year'] = mean_year

dataset.head()

#### Extract numbers from front and rear travel

In [None]:
import re

# Keep the first run of digits found in each value; values without any digit
# become NaN. The pattern is a raw string because '\d' inside a normal string
# literal is an invalid escape sequence that newer Python versions warn about.
dataset['Rear Travel'] = dataset['Rear Travel'].astype(str).str.extract(r'(\d+)', expand=False)

# Convert to numeric, setting failed conversions to NaN
dataset['Rear Travel'] = pd.to_numeric(dataset['Rear Travel'], errors='coerce')

dataset['Rear Travel'].head()

In [None]:
# Keep the first run of digits found in each value, then convert to numbers
# (values without digits end up as NaN). The pattern is a raw string because
# '\d' inside a normal string literal is an invalid escape sequence.
dataset['Front Travel'] = dataset['Front Travel'].astype(str).str.extract(r'(\d+)', expand=False)
dataset['Front Travel'] = pd.to_numeric(dataset['Front Travel'], errors='coerce')

dataset['Front Travel'].head()

In [None]:
# Preview the first rows of the frame at this point
dataset.head()

#### Replace NoManufacturer and NoModelFound with NaN

In [None]:
# Turn both placeholder strings into NaN in one in-place pass over the frame
dataset.replace(['NoModelFound', 'NoManufacturer'], np.nan, inplace=True)

dataset.head()

#### Remove any electric hits

In [None]:
# Drop the listings whose model name mentions "electric" (case-insensitive).
# Blanking only the Model value does not remove them: the Model column is
# dropped before the notebook's dropna() step runs, so those rows would stay
# in the data. Rows with a missing model name are kept, because the literal
# text 'nan' produced by astype(str) does not contain "electric".
is_electric = dataset['Model'].astype(str).str.contains('electric', case=False, regex=False)
dataset = dataset[~is_electric]

dataset.head()

### Build up manual predictions dataset

In [None]:
# Hand-written listings, one tuple per bike, in the column order given below
manual_predictions = pd.DataFrame(
    [
        (2019, 'Specialized', 'Stumpjumper', 2000, 'Good - Used, Mechanically Sound', '29', 150, 140, 'Carbon Fiber'),
        (2021, 'Canyon', 'Spectral', 2500, 'Good - Used, Mechanically Sound', '29', 160, 150, 'Carbon Fiber'),
        (2021, 'Ibis', 'Ripmo AF', 2000, 'Good - Used, Mechanically Sound', '29', 160, 147, 'Aluminum'),
        (2022, 'Specialized', 'Status 140', 2000, 'Good - Used, Mechanically Sound', '29', 140, 140, 'Aluminum'),
    ],
    columns=[
        'Year',
        'Manufacturer',
        'Model',
        'USD Price',
        'Condition',
        'Wheel Size',
        'Front Travel',
        'Rear Travel',
        'Material',
    ],
)

### Drop Model - it adds too much noise to the model

In [None]:
# Remove the Model column from the scraped listings
dataset = dataset.drop('Model', axis=1)
dataset.head()

In [None]:
# Remove the Model column from the manual listings as well
manual_predictions = manual_predictions.drop('Model', axis=1)
manual_predictions.head()

### De-duplicate

In [None]:
print(f"Original DataFrame shape: {dataset.shape}")

# Keep the first copy of every fully identical row and discard the repeats
dataset = dataset[~dataset.duplicated(keep='first')]

print(f"Deduplicated DataFrame shape: {dataset.shape}")

### Drop all NaN

In [None]:
print(f"Original DataFrame shape: {dataset.shape}")

# Discard every row that has a missing value in any column
dataset = dataset.dropna(axis=0, how='any')

# Report the shape that is left once the incomplete rows are gone
print(f"No NA DataFrame shape: {dataset.shape}")

In [None]:
# Preview the first rows that remain
dataset.head()

## Feature engineering

### Compute an age column, drop the year

In [None]:
from datetime import datetime

def convert_age(X, current_year=None):
    """Return a copy of ``X`` with an ``Age`` column computed from ``Year``.

    ``Age`` is ``current_year - Year``. The ``Year`` column is left in place
    and the frame passed in is not modified.

    Args:
        X: DataFrame that contains a ``Year`` column.
        current_year: Year the age is measured against. When omitted, the
            current calendar year is used; pass a fixed value to get the same
            ages no matter when the notebook is run.

    Returns:
        A new DataFrame with the additional ``Age`` column.

    Raises:
        KeyError: If ``X`` has no ``Year`` column.
    """
    if 'Year' not in X.columns:
        raise KeyError("The DataFrame does not contain a 'Year' column.")

    if current_year is None:
        current_year = datetime.now().year

    # Work on a copy so the caller's frame stays untouched
    X = X.copy()
    X['Age'] = current_year - X['Year']
    return X

# Add the Age column, then discard the Year column it was derived from
dataset = convert_age(dataset).drop(columns='Year')

dataset.head()

In [None]:
# Same Age-for-Year swap for the manual listings
manual_predictions = convert_age(manual_predictions).drop(columns='Year')
manual_predictions.head()

### Put them in categories based on their travel numbers

In [None]:
# Create a new column 'Category' that will categorize the listings by the rear travel.
def categorize_travel(X):
    """Return a copy of ``X`` with a ``Category`` column based on ``Rear Travel``.

    Categories by rear travel value:

    * exactly 0      -> 'Hardtail'
    * above 0, < 120 -> 'Short Travel'
    * 120 to 150     -> 'Mid Travel'
    * above 150, up to 210 -> 'Long Travel'

    Negative values, values above 210 and missing values get NaN.

    Args:
        X: DataFrame that contains a numeric ``Rear Travel`` column.

    Returns:
        A new DataFrame with the additional ``Category`` column; the frame
        passed in is not modified.

    Raises:
        KeyError: If ``X`` has no ``Rear Travel`` column.
    """
    if 'Rear Travel' not in X.columns:
        raise KeyError("The DataFrame does not contain a 'Rear Travel' column.")

    # Work on a copy so the caller's frame stays untouched
    X = X.copy()
    travel = X['Rear Travel']

    # Start from an all-NaN object column so the text labels can be written
    # into it without a dtype change; rows matching no rule keep NaN.
    category = pd.Series(np.nan, index=X.index, dtype=object)
    category.loc[travel == 0] = 'Hardtail'
    # Strictly above 0: a `>= 0` bound here would overwrite 'Hardtail'
    category.loc[(travel > 0) & (travel < 120)] = 'Short Travel'
    category.loc[(travel >= 120) & (travel <= 150)] = 'Mid Travel'
    category.loc[(travel > 150) & (travel <= 210)] = 'Long Travel'

    X['Category'] = category
    return X

print(f"Original DataFrame shape: {dataset.shape}")
# dropna() removes rows with any missing value, which includes the rows that
# were left without a Category
dataset = categorize_travel(dataset).dropna()
print(f"DataFrame shape after categorization: {dataset.shape}")

dataset.head()

In [None]:
# Add the Category column to the manual listings (no rows are dropped here)
manual_predictions = categorize_travel(manual_predictions)
manual_predictions.head()

### Drop now-irrelevant front travel and rear travel columns

In [None]:
# Remove both travel columns from the scraped listings
dataset = dataset.drop(['Rear Travel', 'Front Travel'], axis=1)
dataset.head()

In [None]:
# Remove both travel columns from the manual listings
manual_predictions = manual_predictions.drop(['Rear Travel', 'Front Travel'], axis=1)
manual_predictions.head()

### Reorder columns so the target variable is at the end

In [None]:
# Put every other column first and 'USD Price' last
dataset_feature_names = [name for name in dataset.columns if name != 'USD Price']
dataset = dataset[dataset_feature_names + ['USD Price']]
dataset.head()

In [None]:
# Put every other column first and 'USD Price' last
manual_feature_names = [name for name in manual_predictions.columns if name != 'USD Price']
manual_predictions = manual_predictions[manual_feature_names + ['USD Price']]
manual_predictions.head()

### Split the data into features and a target variable

In [None]:
# All columns but the last are features; the last column is the target
X, Y = dataset.iloc[:, :-1], dataset.iloc[:, -1]
X_manual, Y_manual = manual_predictions.iloc[:, :-1], manual_predictions.iloc[:, -1]

In [None]:
# Features of the scraped listings
print(X)

In [None]:
# Target column of the scraped listings
print(Y)

In [None]:
# Features of the manual listings
print(X_manual)

In [None]:
# Target column of the manual listings
print(Y_manual)

## Transform the data with one-hot encoding

In [None]:
def encode_data(X):
    """One-hot encode the categorical columns of ``X`` at their own positions.

    Each column named below is replaced, at the position it occupied, by the
    indicator columns ``pd.get_dummies`` produces for it. All other columns
    keep their relative order.

    Args:
        X: DataFrame containing the 'Manufacturer', 'Condition', 'Wheel Size',
            'Material' and 'Category' columns.

    Returns:
        A new DataFrame in which those five columns are expanded.
    """
    categorical_columns = ('Manufacturer', 'Condition', 'Wheel Size', 'Material', 'Category')
    for name in categorical_columns:
        position = X.columns.get_loc(name)
        before = X.iloc[:, :position]
        indicators = pd.get_dummies(X.loc[:, [name]])
        after = X.iloc[:, position + 1:]
        X = pd.concat([before, indicators, after], axis=1)
    return X

# Temporarily join X and X_manual so both are encoded with the same set of
# dummy columns
X = pd.concat([X, X_manual], ignore_index=True)

# Cast to float explicitly: the dummy columns are flags whose dtype depends on
# the pandas version, and converting a frame that mixes boolean and numeric
# columns without a dtype yields an object-dtype array.
X = encode_data(X).to_numpy(dtype=float)

# Split the encoded rows back into the scraped listings and the manual ones
X, X_manual = X[:len(dataset)], X[len(dataset):]

In [None]:
# Encoded rows that belong to the manual listings
print(X_manual)

### Split the dataset into a training set and a test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set (test_size=0.2); random_state=0 seeds
# the shuffle so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# Disabled manual positional 80/20 split, kept for reference.
# NOTE(review): it refers to X_transformed, which is not defined in the visible notebook.
# split_index = int(len(X_transformed) * 0.8)
# X_train = X_transformed[:split_index]
# X_test = X_transformed[split_index:]
# Y_train = Y[:split_index]
# Y_test = Y[split_index:]

In [None]:
# Training feature matrix
print(X_train)
# Disabled debug print; split_index only exists in the commented-out manual split
# print(dataset.iloc[split_index:, -2])

## Apply feature scaling
Feature scaling is not required for ordinary multiple linear regression: each coefficient is fitted in the units of its own feature, so rescaling a feature only rescales its coefficient and leaves the predictions unchanged.

In [None]:
# Test feature matrix (left unscaled)
print(X_test)

## Train the model

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator created with its default settings
regressor = LinearRegression()
# Fit on the training split
regressor.fit(X_train, Y_train)

## Predicting the test set results

In [None]:
# Print arrays with two decimals from here on
np.set_printoptions(precision=2)

# Predicted prices for the held-out rows
Y_pred = regressor.predict(X_test)
print(Y_pred)

In [None]:
# Turn both into column vectors and place them side by side:
# predicted price on the left, actual price on the right
Y_test_array = Y_test.to_numpy().reshape(-1, 1)
Y_pred_array = Y_pred.reshape(-1, 1)
comparisons = np.hstack((Y_pred_array, Y_test_array))
print(comparisons)

## Quantify the differences (MAE, MSE, RMSE)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mean absolute error (MAE) between actual and predicted test prices
mae = mean_absolute_error(Y_test_array, Y_pred_array)

# Mean squared error (MSE) between actual and predicted test prices
mse = mean_squared_error(Y_test_array, Y_pred_array)

# Root mean squared error (RMSE): square root of the MSE
rmse = np.sqrt(mse)

# Last expression of the cell: the three metrics as a tuple
mae, mse, rmse

### Generate predictions

In [None]:
# Predict the prices of the manual listings from their encoded feature rows
Y_manual_pred = regressor.predict(X_manual)

# Print the predicted prices, in the order the manual listings were defined
print('Y_manual_pred:', Y_manual_pred)