### Import Libraries

In [1]:
# conda install -c conda-forge xgboost

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.metrics import  r2_score,mean_absolute_error,mean_squared_log_error,median_absolute_error,explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_squared_error, make_scorer

import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.svm import SVR
from sklearn.svm import LinearSVR

from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso

### Preparing Data

In [5]:
# Load the consolidated StockX table and give the column pandas read in as
# "Unnamed: 0" an explicit name ("index") so it can be referenced later.
data = pd.read_csv("../data/StockX-Data-Consolidated.csv").rename(
    columns={"Unnamed: 0": "index"}
)

In [17]:
# Separate features from the target, then hold out 25% of the rows for testing.
# "Pct_change" is the target; "Sale Price" and "index" are dropped from the
# feature matrix along with it.
y = data["Pct_change"]
x = data.drop(columns=["Pct_change", 'Sale Price', 'index'])
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,  # fixed seed so the split is reproducible
)

### Model Selection

In [5]:
# define scoring functions
def MSE(y_true,y_pred):
    """Compute the mean squared error, print it, and return it.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predicted target values.

    Returns
    -------
    The value returned by ``mean_squared_error(y_true, y_pred)``.
    """
    error = mean_squared_error(y_true, y_pred)
    # Printed to three decimals so each evaluation leaves a trace in the output.
    print('MSE: %2.3f' % error)
    return error

def R2(y_true,y_pred):
    """Compute the R² score, print it, and return it.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predicted target values.

    Returns
    -------
    The value returned by ``r2_score(y_true, y_pred)``.
    """
    score = r2_score(y_true, y_pred)
    # Printed to three decimals so each evaluation leaves a trace in the output.
    print('R2: %2.3f' % score)
    return score

def two_score(y_true,y_pred):
    """Report both MSE and R², returning only R².

    ``MSE`` is called purely for its printed output; the R² value is what
    the caller receives.
    """
    MSE(y_true, y_pred)  # logged only; return value intentionally discarded
    return R2(y_true, y_pred)

def two_scorer():
    """Build a scikit-learn scorer around ``two_score``.

    ``greater_is_better=True`` because ``two_score`` returns R², which model
    selection should maximise.
    """
    scorer = make_scorer(two_score, greater_is_better=True)
    return scorer

##### Model 1: XGBoost

In [4]:
%%capture
# fit XGBoost with 5-fold cross validation
params = {
    'max_depth':8,
    'min_child_weight': 5,
    'eta':.45,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective':'reg:squarederror',
}
xg_reg = xgb.XGBRegressor()
clf = GridSearchCV(xg_reg, params, cv=5, scoring=two_scorer())
clf.fit(x_train, y_train)

NameError: name 'two_scorer' is not defined

In [18]:
# Keep the XGBoost search results under their own names, since `clf` is
# rebound by the later model cells.
boost_best_estimator = clf.best_estimator_
boost_best_params = clf.best_params_
boost_score = clf.best_score_  # best mean cross-validated score from the scorer (R²)
print("R^2: " + str(boost_score))

R^2: 0.9282953911510902


##### Model 2: SVM

In [14]:
%%capture
# fit SVM with 5-fold cross validation
x_train = preprocessing.scale(x_train)
linearsvr = LinearSVR()
clf = GridSearchCV(LinearSVR(tol=0.01), cv=5, param_grid={"C": [0.01,0.1,1e0,1e1]}, scoring=two_scorer())
clf.fit(x_train, y_train)

In [16]:
# best model score
# Keep the SVM search results under their own names, since `clf` is rebound
# by the other model cells.
svm_best_estimator = clf.best_estimator_
svm_best_params = clf.best_params_
svm_score = clf.best_score_  # best mean cross-validated score from the scorer (R²)
print("R^2: " + str(svm_score))

R^2: 0.750519702300101


##### Model 3: Lasso

In [6]:
%%capture
# fit LASSO with 5-fold cross validation
lasso = Lasso(random_state=0, max_iter=10000)
parameters = [{'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}]
clf = GridSearchCV(lasso, parameters, cv=5, scoring=two_scorer())
clf.fit(x_train, y_train)

In [7]:
# Keep the Lasso search results under their own names, since `clf` is rebound
# by the other model cells.
lasso_best_estimator = clf.best_estimator_
lasso_best_params = clf.best_params_
lasso_score = clf.best_score_  # best mean cross-validated score from the scorer (R²)
print("R^2: " + str(lasso_score))

R^2: 0.7789707559931465


### The LASSO Model

In [26]:
# Lasso Regression Results
print("intercept", lasso_best_estimator.intercept_)
# Take the feature names from `x`, the feature DataFrame the train/test split
# was made from. x_train cannot be relied on here: the SVM cell may have
# replaced it with a NumPy array, which has no `.columns`.
variables = x.columns
coefficients = lasso_best_estimator.coef_
# Pair each feature name with its fitted coefficient, in column order.
for name, coefficient in zip(variables, coefficients):
    print(name, coefficient)

intercept 1.7385674152228654
Days Since Release 0.0008867562473585179
yeezy -1.7780129326512497
airjordan 3.655533658402066
airforce -0.7026203348785043
airmax90 0.0
airmax97 0.02299535464981576
presto 0.9944213286292918
vapormax -1.1903759080761882
blazer 1.0695299680357326
zoom -1.7556322609646324
react -1.1259204071386169
California 0.05663083895476324
New York -0.0
Oregon -0.0
Florida 0.01890581020691105
Texas 0.0
Other States -0.002617679953202299
size_freq 0.0
Black 0.7680365288406558
White 0.9754043889726517
Grey 0.5672851506037508
Red 1.8557681413743325
Green 0.05688665753675684
Neo 0.4725738773015827
Orange 1.0271467102604108
Tan/Brown 1.9354964584429748
Pink 0.8892537208755394
Blue -2.2053032269024433
Colorful -0.3731773483606293
Number of Sales -3.440560039526971e-05


In [21]:
# Test Set Performance
# Predict with the saved Lasso estimator rather than `clf`: `clf` is rebound by
# every model cell, so which model it holds depends on execution order.
# lasso_best_estimator is the refit best Lasso model from the grid search.
y_pred = lasso_best_estimator.predict(x_test)
print('R²: %.2f' % r2_score(y_test, y_pred))
print("mean_squared_error: %.2f" % mean_squared_error(y_test, y_pred))

R²: 0.77
mean_squared_error: 0.52
