### Import Libraries

In [1]:
# conda install -c conda-forge xgboost

In [2]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

import xgboost as xgb
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.linear_model import Lasso

### Preparing Data

In [3]:
# read data
data = pd.read_csv("../data/StockX-Data-Consolidated.csv")
data = data.rename(columns={"Unnamed: 0": "index"})

In [4]:
# train-test split
x = data.drop(["Pct_change",'Sale Price','index'], axis=1)
y = data["Pct_change"]
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.25, random_state = 42)

### Model Selection

##### Model 1: XGBoost

In [5]:
%%capture
random.seed(1)

# fit XGBoost with 5-fold cross validation
params = {'colsample_bytree': [i/10. for i in range(8,11)],
          'subsample': [i/10. for i in range(8,11)],
          'eta': [.3, .4, .5],
          'max_depth': list(range(3,6)),
          'min_child_weight': list(range(4,7)),
          'eval_metric': ['rmse'],
          'objective': ['reg:squarederror']}
xg_reg = xgb.XGBRegressor()
r2 = make_scorer(r2_score, greater_is_better=True)
clf = GridSearchCV(xg_reg, 
                   params, 
                   cv=5, 
                   scoring=r2)
clf.fit(x_train, y_train)

In [6]:
# cross validation score
boost_best_params = clf.best_params_
boost_best_estimator = clf.best_estimator_
boost_score = clf.best_score_
print("Score: "+ str(boost_score))

Score: 0.9670915649887809


In [7]:
boost_best_estimator

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1.0, eta=0.3,
             eval_metric='rmse', gamma=0, importance_type='gain',
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=6, missing=None, n_estimators=100, n_jobs=1,
             nthread=None, objective='reg:squarederror', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.9, verbosity=1)

##### Model 2: SVM

In [8]:
%%capture
random.seed(1)

# fit SVM with 5-fold cross validation
x.train = preprocessing.scale(x_train)
linearsvr = LinearSVR(tol=0.01)
parameters = {'C': [0.01, 0.1, 1, 10, 100]} 
r2 = make_scorer(r2_score, greater_is_better=True)
clf = GridSearchCV(linearsvr, 
                   parameters, 
                   cv=5, 
                   scoring=r2)
clf.fit(x.train, y_train)

In [9]:
# cross validation score
svm_best_params = clf.best_params_
svm_best_estimator = clf.best_estimator_
svm_score = clf.best_score_
print("Score: "+ str(svm_score))

Score: 0.7508273991238662


##### Model 3: Lasso

In [10]:
%%capture
random.seed(1)

# fit LASSO with 5-fold cross validation
lasso = Lasso()
parameters = {'alpha': [1e-5,1e-4,1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3,1e4,1e5]}
r2 = make_scorer(r2_score, greater_is_better=True)
clf = GridSearchCV(lasso, 
                   parameters, 
                   cv=5,
                   scoring=r2)
clf.fit(x_train, y_train)

In [11]:
# cross validation score
lasso_best_params = clf.best_params_
lasso_best_estimator = clf.best_estimator_
lasso_score = clf.best_score_
print("Score: "+ str(lasso_score))

Score: 0.781084670299074


### The LASSO Model

In [12]:
# retrain lasso model on full traning data
lasso = Lasso(alpha=lasso_best_params['alpha'])
final_model = lasso.fit(x_train, y_train)

  positive)


In [13]:
# print regression results
print("intercept:", final_model.intercept_)
variables = x_train.columns
coefficients = final_model.coef_
for i,name in enumerate(variables):
    print(name, coefficients[i])

intercept: 1.6732384180296382
Days Since Release 0.0007138990535844604
yeezy -2.0916984282060604
airjordan 3.726383617537441
airforce -1.1311906914110652
airmax90 -0.2863663582799625
airmax97 -0.17339025871110295
presto 0.7194145142057492
vapormax -1.5121187586969662
blazer 0.8980769012727688
zoom -2.0724055809736206
react -1.5752395055123405
California 0.05940400947520284
New York -0.0031062061753604764
Oregon -2.0567435839875397e-15
Florida 0.035030048298578434
Texas 0.0014123068369209498
Other States -0.004690767417312767
size_freq 0.2408919206700723
Black 1.0600814890088015
White 1.2694755584590267
Grey 0.8735368219093556
Red 1.9732263898871052
Green 0.7588889609727959
Neo 0.776859235885755
Orange 1.4237948982368727
Tan/Brown 2.375958012578484
Pink 1.2457783818054227
Blue -2.302586969712432
Colorful -0.6431478571563565
Number of Sales -3.0255769439499356e-05


In [14]:
# test set performance
y_pred = final_model.predict(x_test)
print('R²: %.2f' % r2_score(y_test, y_pred))
print("mean_squared_error: %.2f" % mean_squared_error(y_test, y_pred))

R²: 0.78
mean_squared_error: 0.51
