In [1]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
import datetime as dt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [2]:
# Reading in the data
sneakers_df = pd.read_csv('../Project_3/data/StockX-Data-Contest-2019-3.csv', parse_dates = True)
df = sneakers_df.copy()
df.head(10)

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,9/24/16,11.0,California
1,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,11/23/16,11.0,California
2,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,11/23/16,11.0,California
3,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,11/23/16,11.5,Kentucky
4,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,2/11/17,11.0,Rhode Island
5,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,798.0,220,2/11/17,8.5,Michigan
6,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-White,784.0,220,12/17/16,11.0,California
7,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Cream-White,460.0,220,4/29/17,10.0,New York
8,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Cream-White,465.0,220,4/29/17,11.0,Kansas
9,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Cream-White,465.0,220,4/29/17,11.0,Florida


In [3]:
# Renaming columns to get rid of spaces 
df = df.rename(columns={
    "Order Date":"order_date",
    "Brand":"brand",
    "Sneaker Name":"sneaker_name",
    "Sale Price":"sale_price",
    "Retail Price":"retail_price",
    "Release Date":"release_date",
    "Shoe Size":"shoe_size",
    "Buyer Region":"buyer_region"
    })

In [4]:
# Converting dates into numericals
df['order_date'] = pd.to_datetime(df['order_date'])
df['order_date']=df['order_date'].map(dt.datetime.toordinal)

df['release_date'] = pd.to_datetime(df['release_date'])
df['release_date']=df['release_date'].map(dt.datetime.toordinal)
df.head()

Unnamed: 0,order_date,brand,sneaker_name,sale_price,retail_price,release_date,shoe_size,buyer_region
0,736573,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097.0,220,736231,11.0,California
1,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685.0,220,736291,11.0,California
2,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690.0,220,736291,11.0,California
3,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075.0,220,736291,11.5,Kentucky
4,736573,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828.0,220,736371,11.0,Rhode Island


In [5]:
# Define features set
X = df.drop(['sale_price'], axis=1)

# Define target vector
y = df.sale_price

In [6]:
# Encoding variables
X = pd.get_dummies(X, columns=["brand", "sneaker_name", "buyer_region"])
X.head()


Unnamed: 0,order_date,retail_price,release_date,shoe_size,brand_ Yeezy,brand_Off-White,sneaker_name_Adidas-Yeezy-Boost-350-Low-Moonrock,sneaker_name_Adidas-Yeezy-Boost-350-Low-Oxford-Tan,sneaker_name_Adidas-Yeezy-Boost-350-Low-Pirate-Black-2015,sneaker_name_Adidas-Yeezy-Boost-350-Low-Pirate-Black-2016,...,buyer_region_South Dakota,buyer_region_Tennessee,buyer_region_Texas,buyer_region_Utah,buyer_region_Vermont,buyer_region_Virginia,buyer_region_Washington,buyer_region_West Virginia,buyer_region_Wisconsin,buyer_region_Wyoming
0,736573,220,736231,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,736573,220,736291,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,736573,220,736291,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,736573,220,736291,11.5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,736573,220,736371,11.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 27)


In [8]:
# Create the random forest classifier instance
rf_model = RandomForestRegressor(max_depth=90, min_samples_split=5, n_estimators=2000,random_state=27)


In [9]:
# Fit the model
rf_model = rf_model.fit(X_train, y_train)


In [10]:
# Get the feature importance array
importances = rf_model.feature_importances_

# List the top 10 most important features
importances = pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
importances.rename(columns={0: 'Feature Importances', 1: "Feature"}, inplace=True)
importances.set_index(importances["Feature"], inplace=True)
importances.drop(columns="Feature", inplace=True)
importances.head(10)


Unnamed: 0_level_0,Feature Importances
Feature,Unnamed: 1_level_1
release_date,0.19726
brand_ Yeezy,0.158988
brand_Off-White,0.140624
sneaker_name_Air-Jordan-1-Retro-High-Off-White-Chicago,0.137063
order_date,0.124214
sneaker_name_Air-Jordan-1-Retro-High-Off-White-White,0.068528
retail_price,0.034133
shoe_size,0.030425
sneaker_name_Nike-Air-Presto-Off-White,0.021026
sneaker_name_adidas-Yeezy-Boost-350-V2-Static-Reflective,0.014894


In [11]:
# Predict outcomes for test data set
predictions = rf_model.predict(X_test)
comparison = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
comparison.head(10)

Unnamed: 0,Prediction,Actual
30843,478.888672,470.0
74873,248.697832,240.0
99408,565.705754,569.0
89424,260.351595,261.0
76890,256.2399,263.0
46464,259.711904,265.0
53929,614.630082,600.0
81530,279.632753,280.0
88146,259.781036,264.0
55375,236.885113,234.0


In [12]:
# Runing metrics to evaluate model performance 

y_true = y_test
y_pred = predictions

print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_true, y_pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_true, y_pred))
print('Root Mean Squared Error (RMSE):', metrics.mean_squared_error(y_true, y_pred, squared=False))
print('Explained Variance Score:', metrics.explained_variance_score(y_true, y_pred))
print('Max Error:', metrics.max_error(y_true, y_pred))
print('Mean Squared Log Error:', metrics.mean_squared_log_error(y_true, y_pred))
print('Median Absolute Error:', metrics.median_absolute_error(y_true, y_pred))
print('R^2:', metrics.r2_score(y_true, y_pred))
print('Mean Poisson Deviance:', metrics.mean_poisson_deviance(y_true, y_pred))
print('Mean Gamma Deviance:', metrics.mean_gamma_deviance(y_true, y_pred))

Mean Absolute Error (MAE): 14.40909614433485
Mean Squared Error (MSE): 997.3707750011122
Root Mean Squared Error (RMSE): 31.581177542978224
Explained Variance Score: 0.9849839872704997
Max Error: 1303.9525117965322
Mean Squared Log Error: 0.0018254855218263419
Median Absolute Error: 7.045427817321922
R^2: 0.9849824077088475
Mean Poisson Deviance: 1.1281453182920351
Mean Gamma Deviance: 0.0018441793922129113
