In [1]:
# Initial imports
from pathlib import Path
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz
from sklearn import metrics
import pydot

In [2]:
# Reading in the data
sneakers_df = pd.read_csv('../data/StockX-Data-Contest-2019-3.csv', parse_dates = True)
df = sneakers_df.copy()
df.head()

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,9/24/16,11.0,California
1,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,11/23/16,11.0,California
2,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,11/23/16,11.0,California
3,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,11/23/16,11.5,Kentucky
4,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,2/11/17,11.0,Rhode Island


In [3]:
# Renaming columns to get rid of spaces 
df = df.rename(columns={
    "Order Date":"order_date",
    "Brand":"brand",
    "Sneaker Name":"sneaker_name",
    "Sale Price":"sale_price",
    "Retail Price":"retail_price",
    "Release Date":"release_date",
    "Shoe Size":"shoe_size",
    "Buyer Region":"buyer_region"
    })

In [4]:
# Converting dates into numericals
df['order_date'] = pd.to_datetime(df['order_date'])
df['order_date']=df['order_date'].map(dt.datetime.toordinal)

df['release_date'] = pd.to_datetime(df['release_date'])
df['release_date']=df['release_date'].map(dt.datetime.toordinal)
df.head()

Unnamed: 0,order_date,brand,sneaker_name,sale_price,retail_price,release_date,shoe_size,buyer_region
0,736573,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,736231,11.0,California
1,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,736291,11.0,California
2,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,736291,11.0,California
3,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,736291,11.5,Kentucky
4,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,736371,11.0,Rhode Island


In [5]:
# Define features set
X = df.copy()
X.drop("sale_price", axis=1, inplace=True)

# Define target vector
y = df.sale_price


In [6]:
# Encoding variables
X = pd.get_dummies(X, columns=["brand", "sneaker_name", "buyer_region"])
X.head()


Unnamed: 0,order_date,retail_price,release_date,shoe_size,brand_ Yeezy,brand_Off-White,sneaker_name_Adidas-Yeezy-Boost-350-Low-Moonrock,sneaker_name_Adidas-Yeezy-Boost-350-Low-Oxford-Tan,sneaker_name_Adidas-Yeezy-Boost-350-Low-Pirate-Black-2015,sneaker_name_Adidas-Yeezy-Boost-350-Low-Pirate-Black-2016,...,buyer_region_South Dakota,buyer_region_Tennessee,buyer_region_Texas,buyer_region_Utah,buyer_region_Vermont,buyer_region_Virginia,buyer_region_Washington,buyer_region_West Virginia,buyer_region_Wisconsin,buyer_region_Wyoming
0,736573,220,736231,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,736573,220,736291,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,736573,220,736291,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,736573,220,736291,11.5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,736573,220,736371,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 27)


In [8]:
# 
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (79964, 107)
Training Labels Shape: (79964,)
Testing Features Shape: (19992, 107)
Testing Labels Shape: (19992,)


In [9]:
# Create the random forest classifier instance
rf_model = RandomForestRegressor(n_estimators=500,random_state=27)


In [10]:
# Fit the model
rf_model = rf_model.fit(X_train, y_train)


In [11]:
# Use the forest's predict method on the test data
predictions = rf_model.predict(X_test)

# Calculate the absolute errors
errors = abs(predictions - y_test)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))


Mean Absolute Error: 14.49


In [12]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 97.14 %.


In [13]:
# Get the feature importance array
importances = rf_model.feature_importances_

# List the top 10 most important features
importances = pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
importances.rename(columns={0: 'Feature Importances', 1: "Feature"}, inplace=True)
importances.set_index(importances["Feature"], inplace=True)
importances.drop(columns="Feature", inplace=True)
importances.head(10)


Unnamed: 0_level_0,Feature Importances
Feature,Unnamed: 1_level_1
release_date,0.196415
brand_ Yeezy,0.157862
brand_Off-White,0.141091
sneaker_name_Air-Jordan-1-Retro-High-Off-White-Chicago,0.136636
order_date,0.124832
sneaker_name_Air-Jordan-1-Retro-High-Off-White-White,0.068482
retail_price,0.033939
shoe_size,0.030887
sneaker_name_Nike-Air-Presto-Off-White,0.020999
sneaker_name_adidas-Yeezy-Boost-350-V2-Static-Reflective,0.014841


In [14]:
# Predict outcomes for test data set
predictions = rf_model.predict(X_test)
comparison = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
comparison.head(10)

Unnamed: 0,Prediction,Actual
30843,479.008,470.0
74873,248.102,240.0
99408,565.713481,569.0
89424,260.734748,261.0
76890,256.354843,263.0
46464,260.237033,265.0
53929,614.234,600.0
81530,279.756423,280.0
88146,261.315333,264.0
55375,235.412,234.0


In [15]:
# Runing metrics to evaluate model performance 

y_true = y_test
y_pred = predictions

print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_true, y_pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_true, y_pred))
print('Root Mean Squared Error (RMSE):', metrics.mean_squared_error(y_true, y_pred, squared=False))
print('Explained Variance Score:', metrics.explained_variance_score(y_true, y_pred))
print('Max Error:', metrics.max_error(y_true, y_pred))
print('Mean Squared Log Error:', metrics.mean_squared_log_error(y_true, y_pred))
print('Median Absolute Error:', metrics.median_absolute_error(y_true, y_pred))
print('R^2:', metrics.r2_score(y_true, y_pred))
print('Mean Poisson Deviance:', metrics.mean_poisson_deviance(y_true, y_pred))
print('Mean Gamma Deviance:', metrics.mean_gamma_deviance(y_true, y_pred))

Mean Absolute Error (MAE): 14.494080195165298
Mean Squared Error (MSE): 1023.0665423938327
Root Mean Squared Error (RMSE): 31.985411399477616
Explained Variance Score: 0.9845974940567757
Max Error: 1288.652
Mean Squared Log Error: 0.0018692007419397322
Median Absolute Error: 7.098833333333346
R^2: 0.9845955018880792
Mean Poisson Deviance: 1.1548260902463139
Mean Gamma Deviance: 0.0018861474271944462


In [16]:
from sklearn import metrics
from sklearn.metrics import  r2_score,mean_absolute_error,mean_squared_log_error,median_absolute_error,explained_variance_score

y_pred2 = predictions

print('R²: %.2f' % r2_score(y_test, y_pred2))
# print("mean_squared_log_error: %.2f" % mean_squared_log_error(y_test, y_pred2))
print("mean_absolute_error: %.2f" % mean_absolute_error(y_test, y_pred2))
print("median_absolute_error: %.2f" % median_absolute_error(y_test, y_pred2))
print("explained_variance_score: %.2f" % explained_variance_score(y_test, y_pred2))

R²: 0.98
mean_absolute_error: 14.49
median_absolute_error: 7.10
explained_variance_score: 0.98
