## Install the required modules

In [1]:
!pip3 install -r requirements.txt -q

## Import the data

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

directory_path = '/Users/brendanashton/dev/go/src/github.com/deasa/pinkbike_crawler/runs'

# Initialize an empty list to store DataFrames
dfs = []

for filename in os.listdir(directory_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(directory_path, filename)
        df = pd.read_csv(file_path)
        dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
dataset = pd.concat(dfs, ignore_index=True)

dataset.head()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42303 entries, 0 to 42302
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              42303 non-null  object 
 1   Year               41102 non-null  float64
 2   Manufacturer       42303 non-null  object 
 3   Model              42303 non-null  object 
 4   USD Price          42303 non-null  int64  
 5   Original Currency  42297 non-null  object 
 6   Condition          42270 non-null  object 
 7   Frame Size         42082 non-null  object 
 8   Wheel Size         41886 non-null  object 
 9   Front Travel       41426 non-null  object 
 10  Rear Travel        41131 non-null  object 
 11  Material           42056 non-null  object 
 12  Reason for Review  12607 non-null  object 
 13  URL                42303 non-null  object 
dtypes: float64(1), int64(1), object(12)
memory usage: 4.5+ MB


## Data preprocessing

### Drop unnecessary columns

In [3]:
dataset = dataset.drop(columns=['Title', 'Frame Size', 'Reason for Review', 'URL', 'Original Currency'])
dataset.head()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42303 entries, 0 to 42302
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Year          41102 non-null  float64
 1   Manufacturer  42303 non-null  object 
 2   Model         42303 non-null  object 
 3   USD Price     42303 non-null  int64  
 4   Condition     42270 non-null  object 
 5   Wheel Size    41886 non-null  object 
 6   Front Travel  41426 non-null  object 
 7   Rear Travel   41131 non-null  object 
 8   Material      42056 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 2.9+ MB


#### Clean the year - some are missing and some are strangely high

In [4]:
current_year = pd.Timestamp.now().year
min_year = 2000
max_year = pd.Timestamp.today().year 

# Calculate the mean (average) year within the bounds
mean_year = dataset[(dataset['Year'] >= min_year) & (dataset['Year'] <= max_year)]['Year'].mean()

# Replace out-of-bounds low years with NaN
dataset.loc[(dataset['Year'] < min_year), 'Year'] = np.nan

# Replace out-of-bounds years with the mean (Corrected line)
dataset.loc[(dataset['Year'] > max_year), 'Year'] = mean_year

dataset.head()

Unnamed: 0,Year,Manufacturer,Model,USD Price,Condition,Wheel Size,Front Travel,Rear Travel,Material
0,2021.0,NoManufacturer,NoModelFound,3500,Excellent - Lightly Ridden,29,160 mm,0 mm (Hardtail),Titanium
1,2022.0,Canyon,Torque,1340,Excellent - Lightly Ridden,29,,,Carbon Fiber
2,2023.0,Yeti,SB160,3700,Excellent - Lightly Ridden,29,,,Carbon Fiber
3,2023.0,Specialized,Turbo Levo Electric,2999,Excellent - Lightly Ridden,29,160 mm,150 mm,Aluminium
4,2018.0,NoManufacturer,NoModelFound,2208,"Good - Used, Mechanically Sound",29,170 mm,155 mm,Carbon Fiber


#### Extract numbers from front and rear travel

In [5]:
import re

# Extract numerical values using regular expressions
dataset['Rear Travel'] = dataset['Rear Travel'].astype(str).str.extract('(\d+)', expand=False)

# Convert to numeric, setting failed conversions to NaN
dataset['Rear Travel'] = pd.to_numeric(dataset['Rear Travel'], errors='coerce')

dataset['Rear Travel'].head()

0      0.0
1      NaN
2      NaN
3    150.0
4    155.0
Name: Rear Travel, dtype: float64

In [6]:
dataset['Front Travel'] = dataset['Front Travel'].astype(str).str.extract('(\d+)', expand=False)
dataset['Front Travel'] = pd.to_numeric(dataset['Front Travel'], errors='coerce')

dataset['Front Travel'].head()

0    160.0
1      NaN
2      NaN
3    160.0
4    170.0
Name: Front Travel, dtype: float64

In [7]:
dataset.head()

Unnamed: 0,Year,Manufacturer,Model,USD Price,Condition,Wheel Size,Front Travel,Rear Travel,Material
0,2021.0,NoManufacturer,NoModelFound,3500,Excellent - Lightly Ridden,29,160.0,0.0,Titanium
1,2022.0,Canyon,Torque,1340,Excellent - Lightly Ridden,29,,,Carbon Fiber
2,2023.0,Yeti,SB160,3700,Excellent - Lightly Ridden,29,,,Carbon Fiber
3,2023.0,Specialized,Turbo Levo Electric,2999,Excellent - Lightly Ridden,29,160.0,150.0,Aluminium
4,2018.0,NoManufacturer,NoModelFound,2208,"Good - Used, Mechanically Sound",29,170.0,155.0,Carbon Fiber


#### Replace NoManufacturer and NoModelFound with NaN

In [8]:
dataset.replace('NoModelFound', np.nan, inplace=True)
dataset.replace('NoManufacturer', np.nan, inplace=True)

dataset.head()

Unnamed: 0,Year,Manufacturer,Model,USD Price,Condition,Wheel Size,Front Travel,Rear Travel,Material
0,2021.0,,,3500,Excellent - Lightly Ridden,29,160.0,0.0,Titanium
1,2022.0,Canyon,Torque,1340,Excellent - Lightly Ridden,29,,,Carbon Fiber
2,2023.0,Yeti,SB160,3700,Excellent - Lightly Ridden,29,,,Carbon Fiber
3,2023.0,Specialized,Turbo Levo Electric,2999,Excellent - Lightly Ridden,29,160.0,150.0,Aluminium
4,2018.0,,,2208,"Good - Used, Mechanically Sound",29,170.0,155.0,Carbon Fiber


#### Remove any electric hits

In [9]:
dataset['Model'] = dataset['Model'].astype(str).str.replace(r'.*electric.*', "NaN", case=False, regex=True)
dataset.replace('nan', np.nan, inplace=True)
dataset.replace('NaN', np.nan, inplace=True)

dataset.head()

Unnamed: 0,Year,Manufacturer,Model,USD Price,Condition,Wheel Size,Front Travel,Rear Travel,Material
0,2021.0,,,3500,Excellent - Lightly Ridden,29,160.0,0.0,Titanium
1,2022.0,Canyon,Torque,1340,Excellent - Lightly Ridden,29,,,Carbon Fiber
2,2023.0,Yeti,SB160,3700,Excellent - Lightly Ridden,29,,,Carbon Fiber
3,2023.0,Specialized,,2999,Excellent - Lightly Ridden,29,160.0,150.0,Aluminium
4,2018.0,,,2208,"Good - Used, Mechanically Sound",29,170.0,155.0,Carbon Fiber


### Build up manual predictions dataset

In [10]:
# Create a new DataFrame for manual predictions
manual_predictions = pd.DataFrame({
    'Year': [2019, 2021, 2021, 2022],
    'Manufacturer': ['Specialized', 'Canyon', 'Ibis', 'Specialized'],
    'Model': ['Stumpjumper', 'Spectral', 'Ripmo AF', 'Status 140'],
    'USD Price': [2000, 2500, 2000, 2000],
    'Condition': ['Good - Used, Mechanically Sound', 'Good - Used, Mechanically Sound', 'Good - Used, Mechanically Sound', 'Good - Used, Mechanically Sound'],
    'Wheel Size': ['29', '29', '29', '29'],
    'Front Travel': [150, 160, 160, 140],
    'Rear Travel': [140, 150, 147, 140],
    'Material': ['Carbon Fiber', 'Carbon Fiber', 'Aluminum', 'Aluminum'],
})

### Drop Model - it adds too much noise to the model

In [11]:
dataset = dataset.drop(columns=['Model'])
dataset.head()

Unnamed: 0,Year,Manufacturer,USD Price,Condition,Wheel Size,Front Travel,Rear Travel,Material
0,2021.0,,3500,Excellent - Lightly Ridden,29,160.0,0.0,Titanium
1,2022.0,Canyon,1340,Excellent - Lightly Ridden,29,,,Carbon Fiber
2,2023.0,Yeti,3700,Excellent - Lightly Ridden,29,,,Carbon Fiber
3,2023.0,Specialized,2999,Excellent - Lightly Ridden,29,160.0,150.0,Aluminium
4,2018.0,,2208,"Good - Used, Mechanically Sound",29,170.0,155.0,Carbon Fiber


In [12]:
manual_predictions = manual_predictions.drop(columns=['Model'])
manual_predictions.head()

Unnamed: 0,Year,Manufacturer,USD Price,Condition,Wheel Size,Front Travel,Rear Travel,Material
0,2019,Specialized,2000,"Good - Used, Mechanically Sound",29,150,140,Carbon Fiber
1,2021,Canyon,2500,"Good - Used, Mechanically Sound",29,160,150,Carbon Fiber
2,2021,Ibis,2000,"Good - Used, Mechanically Sound",29,160,147,Aluminum
3,2022,Specialized,2000,"Good - Used, Mechanically Sound",29,140,140,Aluminum


### De-duplicate

In [13]:
print(f"Original DataFrame shape: {dataset.shape}")

# Drop duplicate rows, keeping the first occurrence
dataset = dataset.drop_duplicates(keep='first')

print(f"Deduplicated DataFrame shape: {dataset.shape}")

Original DataFrame shape: (42303, 8)
Deduplicated DataFrame shape: (22360, 8)


### Drop all NaN

In [14]:
print(f"Original DataFrame shape: {dataset.shape}")
dataset = dataset.dropna()

# Print the shape of the original and deduplicated DataFrames

print(f"No NA DataFrame shape: {dataset.shape}")

Original DataFrame shape: (22360, 8)
No NA DataFrame shape: (18797, 8)


In [15]:
dataset.head()

Unnamed: 0,Year,Manufacturer,USD Price,Condition,Wheel Size,Front Travel,Rear Travel,Material
3,2023.0,Specialized,2999,Excellent - Lightly Ridden,29,160.0,150.0,Aluminium
5,2019.0,Yeti,2834,Excellent - Lightly Ridden,27.5 / 650B,170.0,150.0,Carbon Fiber
11,2024.0,Specialized,7000,Excellent - Lightly Ridden,29,170.0,150.0,Carbon Fiber
13,2018.0,Yeti,2500,Excellent - Lightly Ridden,27.5 / 650B,170.0,160.0,Carbon Fiber
14,2022.0,Mondraker,8750,New - Unridden/With Tags,29,160.0,160.0,Carbon Fiber


## Feature engineering

### Compute an age column, drop the year

In [16]:
from datetime import datetime

def convert_age(X):
    if 'Year' not in X.columns:
        raise KeyError("The DataFrame does not contain a 'Year' column.")
    
    # Make a copy of the DataFrame to avoid SettingWithCopyWarning
    X = X.copy()
    
    current_year = datetime.now().year
    X.loc[:, 'Age'] = current_year - X['Year']
    return X

dataset = convert_age(dataset)
# Drop the 'Year' column
dataset = dataset.drop(columns='Year')

dataset.head()

Unnamed: 0,Manufacturer,USD Price,Condition,Wheel Size,Front Travel,Rear Travel,Material,Age
3,Specialized,2999,Excellent - Lightly Ridden,29,160.0,150.0,Aluminium,1.0
5,Yeti,2834,Excellent - Lightly Ridden,27.5 / 650B,170.0,150.0,Carbon Fiber,5.0
11,Specialized,7000,Excellent - Lightly Ridden,29,170.0,150.0,Carbon Fiber,0.0
13,Yeti,2500,Excellent - Lightly Ridden,27.5 / 650B,170.0,160.0,Carbon Fiber,6.0
14,Mondraker,8750,New - Unridden/With Tags,29,160.0,160.0,Carbon Fiber,2.0


In [17]:
manual_predictions = convert_age(manual_predictions)
manual_predictions = manual_predictions.drop(columns='Year')
manual_predictions.head()

Unnamed: 0,Manufacturer,USD Price,Condition,Wheel Size,Front Travel,Rear Travel,Material,Age
0,Specialized,2000,"Good - Used, Mechanically Sound",29,150,140,Carbon Fiber,5
1,Canyon,2500,"Good - Used, Mechanically Sound",29,160,150,Carbon Fiber,3
2,Ibis,2000,"Good - Used, Mechanically Sound",29,160,147,Aluminum,3
3,Specialized,2000,"Good - Used, Mechanically Sound",29,140,140,Aluminum,2


### Put them in categories based on their travel numbers

In [18]:
# Create a new column 'Category' that will categorize the listings by the rear travel.
def categorize_travel(X):
    if 'Rear Travel' not in X.columns:
        raise KeyError("The DataFrame does not contain a 'Rear Travel' column.")
    
    # Make a copy of the DataFrame to avoid SettingWithCopyWarning
    X = X.copy()
    
    # Create a new column 'Category' based on the 'Rear Travel' column
    X.loc[(X['Rear Travel'] < 0), 'Category'] = np.nan
    X.loc[(X['Rear Travel'] > 210), 'Category'] = np.nan
    X.loc[(X['Rear Travel'] == 0), 'Category'] = 'Hardtail'
    X.loc[(X['Rear Travel'] >= 0) & (X['Rear Travel'] < 120), 'Category'] = 'Short Travel'
    X.loc[(X['Rear Travel'] >= 120) & (X['Rear Travel'] <= 150), 'Category'] = 'Mid Travel'
    X.loc[(X['Rear Travel'] > 150) & (X['Rear Travel'] <= 210), 'Category'] = 'Long Travel'
    return X

print(f"Original DataFrame shape: {dataset.shape}")
dataset = categorize_travel(dataset)
dataset = dataset.dropna()
print(f"DataFrame shape after categorization: {dataset.shape}")

dataset.head()

Original DataFrame shape: (18797, 8)
DataFrame shape after categorization: (18769, 9)


  X.loc[(X['Rear Travel'] == 0), 'Category'] = 'Hardtail'


Unnamed: 0,Manufacturer,USD Price,Condition,Wheel Size,Front Travel,Rear Travel,Material,Age,Category
3,Specialized,2999,Excellent - Lightly Ridden,29,160.0,150.0,Aluminium,1.0,Mid Travel
5,Yeti,2834,Excellent - Lightly Ridden,27.5 / 650B,170.0,150.0,Carbon Fiber,5.0,Mid Travel
11,Specialized,7000,Excellent - Lightly Ridden,29,170.0,150.0,Carbon Fiber,0.0,Mid Travel
13,Yeti,2500,Excellent - Lightly Ridden,27.5 / 650B,170.0,160.0,Carbon Fiber,6.0,Long Travel
14,Mondraker,8750,New - Unridden/With Tags,29,160.0,160.0,Carbon Fiber,2.0,Long Travel


In [19]:
manual_predictions = categorize_travel(manual_predictions)
manual_predictions.head()

  X.loc[(X['Rear Travel'] == 0), 'Category'] = 'Hardtail'


Unnamed: 0,Manufacturer,USD Price,Condition,Wheel Size,Front Travel,Rear Travel,Material,Age,Category
0,Specialized,2000,"Good - Used, Mechanically Sound",29,150,140,Carbon Fiber,5,Mid Travel
1,Canyon,2500,"Good - Used, Mechanically Sound",29,160,150,Carbon Fiber,3,Mid Travel
2,Ibis,2000,"Good - Used, Mechanically Sound",29,160,147,Aluminum,3,Mid Travel
3,Specialized,2000,"Good - Used, Mechanically Sound",29,140,140,Aluminum,2,Mid Travel


### Drop now-irrelevant front travel and rear travel columns

In [20]:
dataset = dataset.drop(columns=['Rear Travel', 'Front Travel'])
dataset.head()

Unnamed: 0,Manufacturer,USD Price,Condition,Wheel Size,Material,Age,Category
3,Specialized,2999,Excellent - Lightly Ridden,29,Aluminium,1.0,Mid Travel
5,Yeti,2834,Excellent - Lightly Ridden,27.5 / 650B,Carbon Fiber,5.0,Mid Travel
11,Specialized,7000,Excellent - Lightly Ridden,29,Carbon Fiber,0.0,Mid Travel
13,Yeti,2500,Excellent - Lightly Ridden,27.5 / 650B,Carbon Fiber,6.0,Long Travel
14,Mondraker,8750,New - Unridden/With Tags,29,Carbon Fiber,2.0,Long Travel


In [21]:
manual_predictions = manual_predictions.drop(columns=['Rear Travel', 'Front Travel'])
manual_predictions.head()

Unnamed: 0,Manufacturer,USD Price,Condition,Wheel Size,Material,Age,Category
0,Specialized,2000,"Good - Used, Mechanically Sound",29,Carbon Fiber,5,Mid Travel
1,Canyon,2500,"Good - Used, Mechanically Sound",29,Carbon Fiber,3,Mid Travel
2,Ibis,2000,"Good - Used, Mechanically Sound",29,Aluminum,3,Mid Travel
3,Specialized,2000,"Good - Used, Mechanically Sound",29,Aluminum,2,Mid Travel


### Reorder columns so the target variable is at the end

In [22]:
dataset = dataset[[col for col in dataset.columns if col != 'USD Price'] + ['USD Price']]
dataset.head()

Unnamed: 0,Manufacturer,Condition,Wheel Size,Material,Age,Category,USD Price
3,Specialized,Excellent - Lightly Ridden,29,Aluminium,1.0,Mid Travel,2999
5,Yeti,Excellent - Lightly Ridden,27.5 / 650B,Carbon Fiber,5.0,Mid Travel,2834
11,Specialized,Excellent - Lightly Ridden,29,Carbon Fiber,0.0,Mid Travel,7000
13,Yeti,Excellent - Lightly Ridden,27.5 / 650B,Carbon Fiber,6.0,Long Travel,2500
14,Mondraker,New - Unridden/With Tags,29,Carbon Fiber,2.0,Long Travel,8750


In [23]:
manual_predictions = manual_predictions[[col for col in manual_predictions.columns if col != 'USD Price'] + ['USD Price']]
manual_predictions.head()

Unnamed: 0,Manufacturer,Condition,Wheel Size,Material,Age,Category,USD Price
0,Specialized,"Good - Used, Mechanically Sound",29,Carbon Fiber,5,Mid Travel,2000
1,Canyon,"Good - Used, Mechanically Sound",29,Carbon Fiber,3,Mid Travel,2500
2,Ibis,"Good - Used, Mechanically Sound",29,Aluminum,3,Mid Travel,2000
3,Specialized,"Good - Used, Mechanically Sound",29,Aluminum,2,Mid Travel,2000


### Split the data into features and a target variable

In [24]:
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]
X_manual = manual_predictions.iloc[:, :-1]
Y_manual = manual_predictions.iloc[:, -1]

In [25]:
print(X)

         Manufacturer                        Condition   Wheel Size  \
3         Specialized       Excellent - Lightly Ridden           29   
5                Yeti       Excellent - Lightly Ridden  27.5 / 650B   
11        Specialized       Excellent - Lightly Ridden           29   
13               Yeti       Excellent - Lightly Ridden  27.5 / 650B   
14          Mondraker         New - Unridden/With Tags           29   
...               ...                              ...          ...   
42298  Rocky Mountain       Excellent - Lightly Ridden  27.5 / 650B   
42299      Santa Cruz  Good - Used, Mechanically Sound           29   
42300     Diamondback       Excellent - Lightly Ridden           29   
42301            Kona  Good - Used, Mechanically Sound           29   
42302           Giant  Good - Used, Mechanically Sound           29   

           Material  Age      Category  
3         Aluminium  1.0    Mid Travel  
5      Carbon Fiber  5.0    Mid Travel  
11     Carbon Fiber  0.0

In [26]:
print(Y)

3        2999
5        2834
11       7000
13       2500
14       8750
         ... 
42298    2500
42299    3000
42300     800
42301    2850
42302    2700
Name: USD Price, Length: 18769, dtype: int64


In [27]:
print(X_manual)

  Manufacturer                        Condition Wheel Size      Material  Age  \
0  Specialized  Good - Used, Mechanically Sound         29  Carbon Fiber    5   
1       Canyon  Good - Used, Mechanically Sound         29  Carbon Fiber    3   
2         Ibis  Good - Used, Mechanically Sound         29      Aluminum    3   
3  Specialized  Good - Used, Mechanically Sound         29      Aluminum    2   

     Category  
0  Mid Travel  
1  Mid Travel  
2  Mid Travel  
3  Mid Travel  


In [28]:
print(Y_manual)

0    2000
1    2500
2    2000
3    2000
Name: USD Price, dtype: int64


## Transform the data with one-hot encoding

In [29]:
def encode_data(X):
    for column in ['Manufacturer', 'Condition', 'Wheel Size', 'Material', 'Category']:
        insert_loc = X.columns.get_loc(column)
        X = pd.concat([X.iloc[:,:insert_loc], pd.get_dummies(X.loc[:, [column]]), X.iloc[:,insert_loc+1:]], axis=1)
    return X

# temporarily join the X and X_manual DataFrames to encode the categorical variables
X = pd.concat([X, X_manual], ignore_index=True)
X = np.array(encode_data(X.copy()))

# Split the encoded data back into the original X and X_manual DataFrames
X, X_manual = X[:len(dataset)], X[len(dataset):]

In [30]:
print(X_manual)

[[False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False True False False False
  False False False False False False False False False True False False
  False False False False True False False False False True False False
  False 5.0 False True False]
 [False False False False False False True False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False False True False False
  False False 

### Split the dataset into a training set and a test set

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# split_index = int(len(X_transformed) * 0.8)
# X_train = X_transformed[:split_index]
# X_test = X_transformed[split_index:]
# Y_train = Y[:split_index]
# Y_test = Y[split_index:]

In [32]:
print(X_train)
# print(dataset.iloc[split_index:, -2])

[[False False False ... True False False]
 [False False False ... False False True]
 [False False False ... False True False]
 ...
 [False False False ... False True False]
 [False False False ... True False False]
 [False False False ... False True False]]


## Apply feature scaling
We don't actually have to apply feature scaling for multiple linear regression because the coefficient for each independent variable will auto adjust according to its significance and its raw values.

In [33]:
print(X_test)

[[False False False ... False True False]
 [False False False ... True False False]
 [False False False ... True False False]
 ...
 [False False False ... False False True]
 [False False False ... False False True]
 [False False False ... False True False]]


## Train the model

In [34]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

## Predicting the test set results

In [35]:
Y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(Y_pred)

[3970.   2883.5  3703.25 ... 1011.75 2082.5  2775.5 ]


In [36]:
Y_test_array = Y_test.to_numpy().reshape(len(Y_test),1)
Y_pred_array = Y_pred.reshape(len(Y_pred),1)
comparisons = np.concatenate((Y_pred_array, Y_test_array),1)
print(comparisons)

[[3970.   3533.  ]
 [2883.5  1800.  ]
 [3703.25 3600.  ]
 ...
 [1011.75  960.  ]
 [2082.5  4100.  ]
 [2775.5  3099.  ]]


## Visualize the differences

In [37]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Compute mean absolute error (MAE)
mae = mean_absolute_error(Y_test_array, Y_pred_array)

# Compute mean squared error (MSE)
mse = mean_squared_error(Y_test_array, Y_pred_array)

# Compute root mean squared error (RMSE)
rmse = np.sqrt(mse)

mae, mse, rmse

(np.float64(734.3230554075652),
 np.float64(1102302.848278503),
 np.float64(1049.9061140304418))

### Generate predictions

In [38]:
# Predict the prices of the manual predictions
Y_manual_pred = regressor.predict(X_manual)

print('Y_manual_pred:', Y_manual_pred)

Y_manual_pred: [ 3.04e+03  2.96e+03 -1.77e+13 -1.77e+13]
