In [1]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
import datetime as dt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
# Load the StockX sales export. `sneakers_df` keeps the frame as read from disk;
# `df` is the working copy that the following cells transform.
# NOTE(review): per the pandas docs parse_dates=True targets the index only;
# the date columns are converted explicitly in a later cell.
sneakers_df = pd.read_csv(
    Path('../data/StockX-Data-Contest-2019-3.csv'),
    parse_dates=True,
)
df = sneakers_df.copy()
df.head(10)

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,9/24/16,11.0,California
1,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,11/23/16,11.0,California
2,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,11/23/16,11.0,California
3,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,11/23/16,11.5,Kentucky
4,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,2/11/17,11.0,Rhode Island
5,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,798.0,220,2/11/17,8.5,Michigan
6,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-White,784.0,220,12/17/16,11.0,California
7,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Cream-White,460.0,220,4/29/17,10.0,New York
8,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Cream-White,465.0,220,4/29/17,11.0,Kansas
9,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Cream-White,465.0,220,4/29/17,11.0,Florida


In [3]:
# Map the original spaced headers to snake_case names, which lets later cells
# reach columns with attribute access (e.g. df.sale_price).
column_renames = {
    "Order Date": "order_date",
    "Brand": "brand",
    "Sneaker Name": "sneaker_name",
    "Sale Price": "sale_price",
    "Retail Price": "retail_price",
    "Release Date": "release_date",
    "Shoe Size": "shoe_size",
    "Buyer Region": "buyer_region",
}
df = df.rename(columns=column_renames)

In [4]:
# Replace each date column with its proleptic Gregorian ordinal (an integer
# day count) so the model receives numeric inputs.
# NOTE(review): the columns are overwritten in place, so this cell is meant to
# run once per freshly loaded `df`.
for date_col in ("order_date", "release_date"):
    parsed_dates = pd.to_datetime(df[date_col])
    df[date_col] = parsed_dates.map(dt.datetime.toordinal)
df.head()

Unnamed: 0,order_date,brand,sneaker_name,sale_price,retail_price,release_date,shoe_size,buyer_region
0,736573,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,736231,11.0,California
1,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,736291,11.0,California
2,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,736291,11.0,California
3,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,736291,11.5,Kentucky
4,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,736371,11.0,Rhode Island


In [5]:
# Feature matrix: every column except the sale price being predicted
X = df.drop(columns="sale_price")

# Target vector: the sale price itself
y = df["sale_price"]

In [6]:
# One-hot encode the categorical columns; the remaining columns are left as-is.
# NOTE(review): re-running this cell fails because the listed columns no longer
# exist in X after the first run.
categorical_cols = ["brand", "sneaker_name", "buyer_region"]
X = pd.get_dummies(X, columns=categorical_cols)
X.head()


Unnamed: 0,order_date,retail_price,release_date,shoe_size,brand_ Yeezy,brand_Off-White,sneaker_name_Adidas-Yeezy-Boost-350-Low-Moonrock,sneaker_name_Adidas-Yeezy-Boost-350-Low-Oxford-Tan,sneaker_name_Adidas-Yeezy-Boost-350-Low-Pirate-Black-2015,sneaker_name_Adidas-Yeezy-Boost-350-Low-Pirate-Black-2016,...,buyer_region_South Dakota,buyer_region_Tennessee,buyer_region_Texas,buyer_region_Utah,buyer_region_Vermont,buyer_region_Virginia,buyer_region_Washington,buyer_region_West Virginia,buyer_region_Wisconsin,buyer_region_Wyoming
0,736573,220,736231,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,736573,220,736291,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,736573,220,736291,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,736573,220,736291,11.5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,736573,220,736371,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=27,
)


In [10]:
# Create the random forest regressor instance (500 trees, fixed seed so runs are repeatable)
rf_model = RandomForestRegressor(n_estimators=500,random_state=27)


In [None]:
# Train the forest on the training split (fit returns the estimator itself)
rf_model = rf_model.fit(X=X_train, y=y_train)


In [None]:
# Pair every importance score with its feature name and rank them from most to
# least important (ties fall back to the feature name, also descending).
ranked_features = sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)

# Build a one-column table indexed by feature name and show the top 10
importances = pd.DataFrame(
    ranked_features,
    columns=["Feature Importances", "Feature"],
).set_index("Feature")
importances.head(10)


In [None]:
# Score the held-out rows with the fitted forest
predictions = rf_model.predict(X_test)

# Side-by-side table of predicted vs. actual sale price, keyed by the test rows' index
comparison = pd.DataFrame(
    {"Prediction": predictions},
    index=y_test.index,
).assign(Actual=y_test)
comparison.head(10)

In [None]:
# Running metrics to evaluate model performance.
# The scorers are reached through the `metrics` module imported at the top of
# the notebook; the bare function names are never imported in this file.
print('R²: %.2f' % metrics.r2_score(y_test, predictions))
print("Mean Squared Log Error: %.2f" % metrics.mean_squared_log_error(y_test, predictions))
print("Mean Absolute Error: %.2f" % metrics.mean_absolute_error(y_test, predictions))
print("Median Absolute Error: %.2f" % metrics.median_absolute_error(y_test, predictions))
print("Explained Variance Score: %.2f" % metrics.explained_variance_score(y_test, predictions))

# Absolute error per test row, computed here so this cell runs top-to-bottom on
# a fresh kernel instead of relying on a variable defined in a later cell.
errors = abs(predictions - y_test)

# Per-row absolute percentage error; its mean is the MAPE
mape = 100 * (errors / y_test)

# Report 100 - MAPE as an "accuracy" figure
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# Absolute difference between each prediction and the true sale price
errors = np.abs(predictions - y_test)

# Average those differences and print the mean absolute error (MAE)
mae = errors.mean()
print('Mean Absolute Error:', round(mae, 2))
