## Install the required modules

In [134]:
!pip3 install -r requirements.txt -q

## Import the data

In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the listings from a CSV file located relative to the working directory.
dataset = pd.read_csv('goodListingsCache0721TrailBikes.csv')
# Show the first rows as a quick check of the loaded columns.
dataset.head()

Unnamed: 0,Title,Year,Manufacturer,Model,USD Price,Original Currency,Condition,Frame Size,Wheel Size,Front Travel,Rear Travel,Material,Reason for Review,URL
0,2023 Yeti SB120 T-Series Custom (L),2023,Yeti,SB120,7900,USD,Excellent - Lightly Ridden,L,29,140 mm,120 mm,Carbon Fiber,,https://www.pinkbike.com/buysell/3894487/
1,2024 Santa Cruz Tallboy,2024,Santa Cruz,Tallboy,4099,USD,New - Unridden/With Tags,L,29,130 mm,120 mm,Carbon Fiber,,https://www.pinkbike.com/buysell/3843045/
2,2024 Santa Cruz Bronson,2024,Santa Cruz,Bronson,3999,USD,New - Unridden/With Tags,M,29,160 mm,150 mm,Carbon Fiber,,https://www.pinkbike.com/buysell/3843047/
3,2024 Santa Cruz Hightower,2024,Santa Cruz,Hightower,3849,USD,New - Unridden/With Tags,XL,29,150 mm,145 mm,Carbon Fiber,,https://www.pinkbike.com/buysell/3843048/
4,2024 Santa Cruz Heckler SL,2024,Santa Cruz,Heckler,6799,USD,New - Unridden/With Tags,M,29,160 mm,150 mm,Carbon Fiber,,https://www.pinkbike.com/buysell/3843052/


### Compute an age column, drop the year

In [136]:
from datetime import datetime

def convert_age(X, reference_year=None):
    """
    Return a copy of ``X`` with an 'Age' column computed from 'Year'.

    Args:
        X: DataFrame with a 'Year' column that supports subtraction from an
            integer (e.g. an integer model year).
        reference_year: Year that ages are measured from. Defaults to the
            current calendar year, which means results shift when the
            notebook is re-run in a later year; pass a fixed value for
            reproducible ages.

    Returns:
        A new DataFrame with 'Age' = ``reference_year - Year``. The input
        DataFrame is not modified.

    Raises:
        KeyError: If ``X`` has no 'Year' column.
    """
    if 'Year' not in X.columns:
        raise KeyError("The DataFrame does not contain a 'Year' column.")

    if reference_year is None:
        reference_year = datetime.now().year

    # assign() builds a new frame, so the caller's DataFrame is left untouched
    # and no SettingWithCopyWarning can be triggered.
    return X.assign(Age=reference_year - X['Year'])

# Rebind dataset to the frame that includes the derived 'Age' column.
dataset = convert_age(dataset)
dataset.head()

Unnamed: 0,Title,Year,Manufacturer,Model,USD Price,Original Currency,Condition,Frame Size,Wheel Size,Front Travel,Rear Travel,Material,Reason for Review,URL,Age
0,2023 Yeti SB120 T-Series Custom (L),2023,Yeti,SB120,7900,USD,Excellent - Lightly Ridden,L,29,140 mm,120 mm,Carbon Fiber,,https://www.pinkbike.com/buysell/3894487/,1
1,2024 Santa Cruz Tallboy,2024,Santa Cruz,Tallboy,4099,USD,New - Unridden/With Tags,L,29,130 mm,120 mm,Carbon Fiber,,https://www.pinkbike.com/buysell/3843045/,0
2,2024 Santa Cruz Bronson,2024,Santa Cruz,Bronson,3999,USD,New - Unridden/With Tags,M,29,160 mm,150 mm,Carbon Fiber,,https://www.pinkbike.com/buysell/3843047/,0
3,2024 Santa Cruz Hightower,2024,Santa Cruz,Hightower,3849,USD,New - Unridden/With Tags,XL,29,150 mm,145 mm,Carbon Fiber,,https://www.pinkbike.com/buysell/3843048/,0
4,2024 Santa Cruz Heckler SL,2024,Santa Cruz,Heckler,6799,USD,New - Unridden/With Tags,M,29,160 mm,150 mm,Carbon Fiber,,https://www.pinkbike.com/buysell/3843052/,0


### Exclude anything more than 7 years old

In [137]:
def exclude_old_bikes(df, max_age=7):
    """
    Return only the rows whose 'Age' does not exceed ``max_age``.

    Args:
        df: DataFrame containing an 'Age' column.
        max_age: Largest age (inclusive) to keep. Defaults to 7.

    Returns:
        A new DataFrame holding the retained rows with their original index
        labels. The input DataFrame is not modified.

    Raises:
        KeyError: If ``df`` has no 'Age' column.
    """
    if 'Age' not in df.columns:
        raise KeyError("The DataFrame does not contain an 'Age' column.")

    # Copy after filtering (not before): only the retained rows are duplicated,
    # and the result is independent of the input, so later assignments on it
    # cannot raise SettingWithCopyWarning.
    return df.loc[df['Age'] <= max_age].copy()

# Drop listings older than the cutoff applied by exclude_old_bikes.
dataset = exclude_old_bikes(dataset)

### Exclude prices less than 1000 or more than 7000 (currently disabled: the cell below is commented out)

In [138]:
# def exclude_cheap_and_pricey_bikes(df):
#     if 'USD Price' not in df.columns:
#         raise KeyError("The DataFrame does not contain a 'USD Price' column.")

#     # Make a copy of the DataFrame to avoid SettingWithCopyWarning
#     df = df.copy()

#     df = df[df['USD Price'] >= 1000]
#     return df

# dataset = exclude_cheap_and_pricey_bikes(dataset)

### Drop unnecessary columns

In [139]:
# Remove the columns that are not kept as model inputs; 'Year' is dropped
# because 'Age' was derived from it.
dataset = dataset.drop(columns=['Title', 'Frame Size', 'Wheel Size', 'Front Travel', 'Reason for Review', 'URL', 'Original Currency', 'Year'])
dataset.head()

Unnamed: 0,Manufacturer,Model,USD Price,Condition,Rear Travel,Material,Age
0,Yeti,SB120,7900,Excellent - Lightly Ridden,120 mm,Carbon Fiber,1
1,Santa Cruz,Tallboy,4099,New - Unridden/With Tags,120 mm,Carbon Fiber,0
2,Santa Cruz,Bronson,3999,New - Unridden/With Tags,150 mm,Carbon Fiber,0
3,Santa Cruz,Hightower,3849,New - Unridden/With Tags,145 mm,Carbon Fiber,0
4,Santa Cruz,Heckler,6799,New - Unridden/With Tags,150 mm,Carbon Fiber,0


### Reorder columns so the target variable is at the end

In [140]:
# Move the target column 'USD Price' to the last position while keeping the
# remaining columns in their existing order.
dataset = dataset[dataset.columns.drop('USD Price').tolist() + ['USD Price']]
dataset.head()

Unnamed: 0,Manufacturer,Model,Condition,Rear Travel,Material,Age,USD Price
0,Yeti,SB120,Excellent - Lightly Ridden,120 mm,Carbon Fiber,1,7900
1,Santa Cruz,Tallboy,New - Unridden/With Tags,120 mm,Carbon Fiber,0,4099
2,Santa Cruz,Bronson,New - Unridden/With Tags,150 mm,Carbon Fiber,0,3999
3,Santa Cruz,Hightower,New - Unridden/With Tags,145 mm,Carbon Fiber,0,3849
4,Santa Cruz,Heckler,New - Unridden/With Tags,150 mm,Carbon Fiber,0,6799


### Split the data into features and a target variable

In [141]:
# Features are every column except the last; the target is the last column
# ('USD Price' after the reordering step).
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

In [142]:
# Inspect the feature frame.
print(X)

     Manufacturer       Model                        Condition  \
0            Yeti       SB120       Excellent - Lightly Ridden   
1      Santa Cruz     Tallboy         New - Unridden/With Tags   
2      Santa Cruz     Bronson         New - Unridden/With Tags   
3      Santa Cruz   Hightower         New - Unridden/With Tags   
4      Santa Cruz     Heckler         New - Unridden/With Tags   
...           ...         ...                              ...   
4340        Scott      Genius       Excellent - Lightly Ridden   
4341         Kona       Honzo  Good - Used, Mechanically Sound   
4342        Giant      Fathom       Excellent - Lightly Ridden   
4343        Salsa  Blackthorn         New - Unridden/With Tags   
4344        Vitus    Mythique  Good - Used, Mechanically Sound   

          Rear Travel      Material  Age  
0              120 mm  Carbon Fiber    1  
1              120 mm  Carbon Fiber    0  
2              150 mm  Carbon Fiber    0  
3              145 mm  Carbon Fiber

In [143]:
# Inspect the target series.
print(Y)

0       7900
1       4099
2       3999
3       3849
4       6799
        ... 
4340    3500
4341    1900
4342    1239
4343    4500
4344    1500
Name: USD Price, Length: 4044, dtype: int64


## Transform the data with one-hot encoding

In [144]:
def encode_data(X):
    """
    One-hot encode the categorical feature columns of ``X``.

    Each of 'Manufacturer', 'Model', 'Condition', 'Rear Travel' and 'Material'
    is replaced, at its current position, by the indicator columns that
    ``pd.get_dummies`` produces for it. All other columns keep their place.

    Args:
        X: DataFrame containing the five categorical columns listed above.

    Returns:
        A new DataFrame with the categorical columns expanded into indicators.
    """
    categorical = ('Manufacturer', 'Model', 'Condition', 'Rear Travel', 'Material')
    for name in categorical:
        position = X.columns.get_loc(name)
        before = X.iloc[:, :position]
        indicators = pd.get_dummies(X[[name]])
        after = X.iloc[:, position + 1:]
        # Splice the indicator columns in where the original column stood.
        X = pd.concat([before, indicators, after], axis=1)
    return X

# Encode a copy of X and convert the result to a NumPy array; the array
# no longer carries the encoded column names.
X_transformed = np.array(encode_data(X.copy()))

def reverse_one_hot_encoding(X_transformed, original_columns, encoded_columns=None):
    """
    Reverses one-hot encoding performed by the 'encode_data' function.

    Indicator columns are recognised by the ``<original>_<category>`` naming
    that ``pd.get_dummies`` produces, so the encoded column names are needed
    to undo the encoding. A bare NumPy array does not carry them; supply them
    through ``encoded_columns`` (e.g. the columns of the encoded DataFrame).

    Args:
        X_transformed: The 2-D NumPy array containing the transformed data.
        original_columns: A list of the original column names before one-hot encoding.
        encoded_columns: Optional sequence with one name per column of
            ``X_transformed``. When omitted, generic names ``col_0``,
            ``col_1``, ... are used; these normally match none of the
            original names, so the data is returned still encoded.

    Returns:
        A pandas DataFrame in which each group of indicator columns found for
        an original column is replaced by a single column (appended at the
        end) holding the category whose indicator is largest in each row.

    Raises:
        ValueError: If ``encoded_columns`` does not have exactly one name per
            column of ``X_transformed``.
    """
    n_features = X_transformed.shape[1]

    if encoded_columns is None:
        all_columns = [f"col_{i}" for i in range(n_features)]
    else:
        all_columns = list(encoded_columns)
        if len(all_columns) != n_features:
            raise ValueError(
                f"encoded_columns has {len(all_columns)} names but X_transformed "
                f"has {n_features} columns."
            )

    X_reversed = pd.DataFrame(X_transformed, columns=all_columns)

    for column in original_columns:
        prefix = column + '_'
        one_hot_cols = [
            col for col in X_reversed.columns
            if isinstance(col, str) and col.startswith(prefix)
        ]

        if not one_hot_cols:
            # No indicators for this column: it was never encoded (or the
            # names are generic), so leave the data as it is.
            continue

        # Cast to float so idxmax works even when the array had object dtype
        # (mixed bool/int columns produce an object array).
        winners = X_reversed[one_hot_cols].astype(float).idxmax(axis=1)

        X_reversed = X_reversed.drop(columns=one_hot_cols)
        # Strip only the leading prefix; a category value may itself contain it.
        X_reversed[column] = winners.str[len(prefix):]

    return X_reversed

# NOTE(review): X_transformed is a plain array, so its columns get generic
# labels (col_0, col_1, ...) that match none of the original column names;
# this call therefore returns the data still one-hot encoded.
X_reversed = reverse_one_hot_encoding(X_transformed, list(X.columns))
X_reversed.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_291,col_292,col_293,col_294,col_295,col_296,col_297,col_298,col_299,col_300
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,1
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,0
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,0
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,0
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,0


### Split the dataset into a training set and a test set

In [114]:
from sklearn.model_selection import train_test_split
# Split the one-hot encoded matrix rather than the raw frame: X still holds
# string columns, which LinearRegression.fit cannot convert to numbers.
X_train, X_test, Y_train, Y_test = train_test_split(X_transformed, Y, test_size=0.2, random_state=0)
# split_index = int(len(X_transformed) * 0.8)
# X_train = X_transformed[:split_index]
# X_test = X_transformed[split_index:]
# Y_train = Y[:split_index]
# Y_test = Y[split_index:]

In [122]:
# Inspect the training features produced by the split.
print(X_train)
# print(dataset.iloc[split_index:, -2])

[[False False False ... False False 1]
 [False False False ... False False 0]
 [False False False ... False False 0]
 ...
 [False False False ... False False 4]
 [False False False ... False False 2]
 [False False False ... False False 2]]
3438    5
3439    6
3440    3
3441    3
3442    3
       ..
4340    3
4341    2
4342    3
4343    3
4344    3
Name: Age, Length: 809, dtype: int64


## Apply feature scaling
Feature scaling is not required for ordinary multiple linear regression: the coefficient fitted for each independent variable automatically compensates for the scale of that variable's raw values.

In [98]:
# Inspect the held-out test features.
print(X_test)

[[False False False ... False False 3]
 [False False False ... False False 7]
 [False False False ... False False 3]
 ...
 [False False False ... False False 3]
 [False False False ... False False 6]
 [False False False ... False False 2]]


## Train the model

In [99]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares linear model on the training split.
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

## Predicting the test set results

In [100]:
# Predict prices for the test features.
Y_pred = regressor.predict(X_test)
# Limit printed floats to two decimals (applies to all later NumPy printing too).
np.set_printoptions(precision=2)
print(Y_pred)

[ 2.50e+03  6.07e+02  2.86e+03  1.05e+03  2.82e+03  4.44e+03  4.20e+03
  2.91e+03  4.13e+03  3.55e+03  3.94e+03  3.96e+03  3.10e+03  1.58e+03
  2.82e+03  4.46e+03  2.75e+03  1.02e+03  3.37e+03  4.10e+03  3.43e+03
  3.94e+03  4.19e+03  4.26e+03  7.06e+02  3.05e+03  2.49e+03  4.22e+03
  1.43e+03  3.97e+03  3.51e+03  3.70e+03  1.11e+03  3.87e+03  6.77e+02
  4.48e+03  3.88e+03  3.82e+03  2.34e+03  4.59e+03  3.39e+03  3.88e+03
  1.90e+03  3.33e+03  2.26e+03  2.38e+03  2.47e+03  1.75e+03  2.89e+03
  3.05e+03  4.77e+03  2.02e+03  4.55e+03  3.81e+03  2.32e+03  2.66e+03
  2.03e+03  5.15e+03  3.55e+03  3.57e+03  2.65e+03  4.61e+03  2.04e+03
  3.33e+03  2.58e+03  1.17e+03  3.83e+03  5.20e+03  4.41e+03  2.88e+03
  9.24e+02  6.19e+02  2.28e+03  2.08e+03  4.97e+03  2.02e+03  4.04e+03
  2.60e+03  3.79e+03  4.40e+03  2.93e+03  3.79e+03  2.57e+03  2.52e+03
  1.37e+03  4.93e+03  2.77e+03  4.78e+03  3.79e+03  1.68e+03  1.39e+03
  2.02e+03  4.47e+03  4.69e+03  3.73e+03  2.92e+03  3.11e+03  2.79e+03
  1.45

In [101]:
# Shape both vectors as single columns and place them side by side:
# column 0 = predicted price, column 1 = actual price.
Y_test_array = Y_test.to_numpy().reshape(-1, 1)
Y_pred_array = Y_pred.reshape(-1, 1)
comparisons = np.hstack((Y_pred_array, Y_test_array))
print(comparisons)

[[2503.53 2100.  ]
 [ 607.34 1604.  ]
 [2856.19 2552.  ]
 ...
 [3805.84 4000.  ]
 [2658.34 2600.  ]
 [3952.31 4000.  ]]


## Visualize the differences

In [102]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Compute mean absolute error (MAE)
mae = mean_absolute_error(Y_test_array, Y_pred_array)

# Compute mean squared error (MSE)
mse = mean_squared_error(Y_test_array, Y_pred_array)

# Compute root mean squared error (RMSE)
rmse = np.sqrt(mse)

# NOTE(review): the recorded MAE (~8.4e11) is many orders of magnitude above
# any price in the data, so at least some predictions are wildly off.
# Possibly unstable coefficients from redundant/rare one-hot columns — TODO confirm.
mae, mse, rmse

(np.float64(843502731228.5333),
 np.float64(1.1582436292143387e+26),
 np.float64(10762172778832.064))

### Generate predictions