### Import Libraries

In [1]:
# conda install -c conda-forge xgboost

In [2]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

import xgboost as xgb
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.linear_model import Lasso

### Preparing Data

In [3]:
# read data
data = pd.read_csv("../data/StockX-Data-Consolidated.csv")
data = data.rename(columns={"Unnamed: 0": "index"})

In [4]:
# train-test split
x = data.drop(["Pct_change",'Sale Price','index','California','New York','Oregon','Florida','Texas','Other States','Colorful'], axis=1)
y = data["Pct_change"]
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.25, random_state = 42)

### Model Selection

##### Model 1: XGBoost

In [5]:
%%capture
random.seed(1)

# fit XGBoost with 5-fold cross validation
params = {'colsample_bytree': [i/10. for i in range(8,11)],
          'subsample': [i/10. for i in range(8,11)],
          'eta': [.3, .4, .5],
          'max_depth': list(range(3,6)),
          'min_child_weight': list(range(4,7)),
          'eval_metric': ['rmse'],
          'objective': ['reg:squarederror']}
xg_reg = xgb.XGBRegressor()
r2 = make_scorer(r2_score, greater_is_better=True)
clf = GridSearchCV(xg_reg, 
                   params, 
                   cv=5, 
                   scoring=r2)
clf.fit(x_train, y_train)

In [6]:
# cross validation score
boost_best_params = clf.best_params_
boost_best_estimator = clf.best_estimator_
boost_score = clf.best_score_
print("Score: "+ str(boost_score))

Score: 0.9670915649887809


In [7]:
boost_best_estimator

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1.0, eta=0.3,
             eval_metric='rmse', gamma=0, importance_type='gain',
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=6, missing=None, n_estimators=100, n_jobs=1,
             nthread=None, objective='reg:squarederror', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.9, verbosity=1)

##### Model 2: SVM

In [8]:
%%capture
random.seed(1)

# fit SVM with 5-fold cross validation
x.train = preprocessing.scale(x_train)
linearsvr = LinearSVR(tol=0.01)
parameters = {'C': [0.01, 0.1, 1, 10, 100]} 
r2 = make_scorer(r2_score, greater_is_better=True)
clf = GridSearchCV(linearsvr, 
                   parameters, 
                   cv=5, 
                   scoring=r2)
clf.fit(x.train, y_train)

In [9]:
# cross validation score
svm_best_params = clf.best_params_
svm_best_estimator = clf.best_estimator_
svm_score = clf.best_score_
print("Score: "+ str(svm_score))

Score: 0.7508273991238662


##### Model 3: Lasso

In [5]:
%%capture
random.seed(1)

# fit LASSO with 5-fold cross validation
lasso = Lasso()
parameters = {'alpha': [1e-5,1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3,1e4,1e5]}
r2 = make_scorer(r2_score, greater_is_better=True)
clf = GridSearchCV(lasso, 
                   parameters, 
                   cv=5,
                   scoring=r2)
clf.fit(x_train, y_train)

In [6]:
# cross validation score
lasso_best_params = clf.best_params_
lasso_best_estimator = clf.best_estimator_
lasso_score = clf.best_score_
print("Score: "+ str(lasso_score))

Score: 0.7748125173830994


### The LASSO Model

In [8]:
# retrain lasso model on full traning data
lasso = Lasso(alpha=lasso_best_params['alpha'])
final_model = lasso.fit(x_train, y_train)

  positive)


In [9]:
# print regression results
print("intercept:", final_model.intercept_)
variables = x_train.columns
coefficients = final_model.coef_
for i,name in enumerate(variables):
    print(name, coefficients[i])

intercept: 2.2025292261233873
Days Since Release 0.0008978287163881856
yeezy -1.9844677781298181
airjordan 3.3886860401171064
airforce -0.8283270506592032
airmax90 -0.18793637693688447
airmax97 -0.1291268647057722
presto 0.8490061917589913
vapormax -1.3883299760430514
blazer 0.7345836499150389
zoom -2.010269302994049
react -1.5492207380081882
size_freq 0.24079974075585087
Black 0.4902977114573121
White 0.7087144242744148
Grey 0.237810714811406
Red 1.7844464484371139
Green 0.6109272956462212
Neo 0.19063791380407155
Orange 0.7521804412370577
Tan/Brown 1.8236966528597405
Pink 0.6988586678554456
Blue -2.5335176464531606
Number of Sales -3.632284645879694e-05


In [10]:
# test set performance
y_pred = final_model.predict(x_test)
print('R²: %.2f' % r2_score(y_test, y_pred))
print("mean_squared_error: %.2f" % mean_squared_error(y_test, y_pred))

R²: 0.77
mean_squared_error: 0.53


### Undervalued Sneakers

In [13]:
# Jordan 1 Retro High Travis Scott --> Released 05/11/2019
# https://stockx.com/air-jordan-1-retro-high-travis-scott?utm_source=google&utm_medium=cpc&utm_campaign=US-JordanGeneralSneakers&utm_campaignid=413888648&content=191928849084&keyword=jordan%201%20retro%20high%20travis%20scott&gclid=CjwKCAjw3-bzBRBhEiwAgnnLCsujrgiqwccfPrfXmgB4yc0Gp_hy7sSxyNEdaB7hFEjdZWrpasJsohoCZtQQAvD_BwE
undervalued_sneakers = {"Days Since Release": 318,
                        "yeezy": 0,
                        "airjordan": 1,
                        "airforce": 0,
                        "airmax90": 0,
                        "airmax97": 0,
                        "presto": 0,
                        "vapormax": 0,
                        "blazer": 0,
                        "zoom": 0,
                        "react": 0,
                        "size_freq": 0.198857,
                        "Black": 1,
                        "White": 1,
                        "Grey": 0,
                        "Red": 0,
                        "Green": 0,
                        "Neo": 0,
                        "Orange": 0,
                        "Tan/Brown": 1,
                        "Pink": 0, 
                        "Blue": 0,
                        "Number of Sales": 25712}

undervalued_sneaker = pd.DataFrame([undervalued_sneakers])
pred = final_model.predict(undervalued_sneaker)
pred # average premium 562.9% --> undervalued

array([8.01338527])

In [12]:
# Nike Blazer Mid 77 Vintage Slam Jam --> Released 01/08/2019
# https://stockx.com/nike-blazer-mid-77-vintage-slam-jam-special-slam-jam-box
undervalued_sneakers = {"Days Since Release": 441,
                        "yeezy": 0,
                        "airjordan": 0,
                        "airforce": 0,
                        "airmax90": 0,
                        "airmax97": 0,
                        "presto": 0,
                        "vapormax": 0,
                        "blazer": 1,
                        "zoom": 0,
                        "react": 0,
                        "size_freq": 0.106677,
                        "Black": 1,
                        "White": 1,
                        "Grey": 0,
                        "Red": 0,
                        "Green": 0,
                        "Neo": 0,
                        "Orange": 0,
                        "Tan/Brown": 0,
                        "Pink": 0, 
                        "Blue": 0,
                        "Number of Sales": 78}

undervalued_sneaker = pd.DataFrame([undervalued_sneakers])
pred = final_model.predict(undervalued_sneaker)
pred # average premium 381.0%% --> undervalued

array([4.55492209])