In [None]:
# This is the main Jupyter notebook; it walks through the entire process of how I arrived at my code.
# I will not run these cells, so they won't interfere with the notebooks I already set up.

In [None]:
#Importing Packages
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor
import datetime as dt
import pandas as pd
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.metrics import r2_score
from numpy import loadtxt
from xgboost import XGBClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
%matplotlib inline
# Display configuration: cap the number of printed columns at 50, let pandas pick the
# output width automatically, and render floats with six decimal places.
# (An earlier max_columns=None setting was overridden straight away, so only 50 is kept.)
pd.set_option('display.width', None)
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.6f}'.format)



In [None]:
# Load the raw StockX data-contest CSV into the working DataFrame `sneakerData`.
sneakerData = pd.read_csv('data/StockX-Data-Contest-2019-3.csv')

In [None]:
# Data cleaning cell

# Lower-case every sneaker name so the same shoe is always spelled the same way.
sneakerData['Sneaker Name'] = sneakerData['Sneaker Name'].str.lower()

# Parse the date columns and derive the time between release and order (a timedelta).
sneakerData['Order Date'] = sneakerData['Order Date'].astype('datetime64[ns]')
sneakerData['Release Date'] = sneakerData['Release Date'].astype('datetime64[ns]')
sneakerData['Turnover Days'] = sneakerData['Order Date'] - sneakerData['Release Date']

# Strip the "$" and "," formatting from both price columns and convert them to integers.
# regex=False is required: "$" is a regex end-of-string anchor, so depending on the pandas
# version's default the dollar sign could be left in place and the int conversion would fail.
for price_column in ['Sale Price', 'Retail Price']:
    sneakerData[price_column] = (
        sneakerData[price_column]
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(int)
    )

In [None]:
# Profit is resale price minus retail price; Profit Ratio expresses that profit as a
# fraction of the retail price, rounded to two decimals. Both are kept so later features
# can be compared against profitability.
retail_price = sneakerData['Retail Price']
sneakerData['Profit'] = sneakerData['Sale Price'].sub(retail_price)
sneakerData['Profit Ratio'] = sneakerData['Profit'].div(retail_price).round(2)

In [None]:
# Derive a "Model" column so sales can be compared per sneaker model.
# Each entry maps a token of the dash-separated sneaker name to the model it identifies.
# Order matters: the first keyword found in the name wins.
MODEL_KEYWORDS = [
    ('yeezy', 'yeezy-boost-350'),
    ('jordan', 'air-jordan-1-retro-high'),
    ('force', 'air-force-1'),
    ('90', 'air-max-90'),
    ('97', 'air-max-97'),
    ('presto', 'air-presto'),
    ('vapormax', 'air-vapormax'),
    ('blazer', 'blazer-mid'),
    ('hyperdunk', 'react-hyperdunk-2017-flyknit'),
    ('zoom', 'zoom-fly'),
]


def _model_from_name(sneaker_name):
    """Return the model for a dash-separated sneaker name, or NaN when no keyword matches."""
    tokens = sneaker_name.split('-')
    for keyword, model in MODEL_KEYWORDS:
        if keyword in tokens:
            return model
    return np.nan


sneakerData["Model"] = sneakerData['Sneaker Name'].apply(_model_from_name)

In [None]:
# Check that every sneaker name was assigned a model: select the rows whose Model is
# still missing. An empty result means every name was categorised.
# (The previous version indexed an empty DataFrame and discarded the result, so it always
# printed an empty frame regardless of the data.)
uncategorised_model = sneakerData[sneakerData['Model'].isnull()]
print(uncategorised_model)

In [None]:
# Load the supplemental colour information (gathered from the StockX website) and
# lower-case its Style key so it matches the lower-cased sneaker names.
colourway = pd.read_excel('data/supplemental_data_colorway.xlsx')
colourway['Style'] = colourway['Style'].str.lower()

# Left-join the colour columns onto the sales rows; sneakers without a matching Style
# keep their row and get NaN in the colour columns.
sneakerData = sneakerData.merge(
    colourway,
    how='left',
    left_on='Sneaker Name',
    right_on='Style',
)

In [None]:
# Fill missing colour flags with 0 so the colour columns behave like a hand-made
# one-hot encoding. 'Red' is included here because it is aggregated together with the
# other nine colour flags later in the notebook; it was previously left out of this step.
color_columns = ['Red', 'Black', 'White', 'Green', 'Neo', 'Orange',
                 'Tan/Brown', 'Pink', 'Blue', 'Colorful']
sneakerData[color_columns] = sneakerData[color_columns].fillna(0)

In [None]:
# Drop Number of Sales, Website, Product Line and Buyer Region so the analysis focuses
# on shoe-specific attributes that could become model features.
parsed_data = sneakerData.drop(columns=['Number of Sales', 'Website', 'Product Line', 'Buyer Region'])

# Express turnover as whole days and as rounded weeks.
parsed_data['Turnover Days'] = parsed_data['Turnover Days'].dt.days
parsed_data['Turnover Weeks'] = parsed_data['Turnover Days'].div(7).round(0)

# Discard orders placed before release (negative weeks) or more than 52 weeks after it.
turnover_weeks = parsed_data['Turnover Weeks']
out_of_range = (turnover_weeks < 0) | (turnover_weeks > 52)
parsed_data = parsed_data.drop(index=parsed_data.index[out_of_range])


# Consolidate the colour flags into one column.
def get_col(row):
    """Return the name of the first column in positions 10-19 whose value is 1, else None."""
    # NOTE(review): the slice is positional; presumably these ten columns are the colour
    # flags — confirm against the actual column order.
    return next((name for name in parsed_data.columns[10:20] if row[name] == 1), None)


parsed_data['color'] = parsed_data.apply(get_col, axis=1)

# NOTE(review): this turns every 0 in the frame into NaN, not only the colour flags
# (e.g. a Profit or Turnover value of exactly 0) — confirm that is intended.
parsed_data = parsed_data.replace(0, np.nan)


In [None]:
# Save the cleaned data. The file goes into data/ because the visualisation-preparation
# step reads it back from 'data/parsed_data.csv'.
parsed_data.to_csv('data/parsed_data.csv')

In [None]:
# DATA VISUALIZATION PREPARATION: this series of steps prepares the data so that I get better results when
# displaying it to find patterns and possible future features.

In [None]:
# Load the parsed dataset produced by the original cleaning step; the first CSV column
# holds the saved index.
sneakerData = pd.read_csv('data/parsed_data.csv', index_col = 0)

In [None]:
# Inspect the column data types and non-null counts of the dataset.
sneakerData.info()

In [None]:
# Based on feedback from mentors: to make time usable by a model, convert the order and
# release dates into seasons (plus the year for the release), e.g. "Fall 2019".

# Parse the date strings read back from CSV into datetimes.
sneakerData['Order Date'] = pd.to_datetime(sneakerData['Order Date'])
sneakerData['Release Date'] = pd.to_datetime(sneakerData['Release Date'])

# Compute the years up front: the release-season label below needs the release year,
# which previously was read before it had been created (KeyError).
release_year = sneakerData['Release Date'].dt.year
order_year = sneakerData['Order Date'].dt.year

# Map month -> season number: Dec-Feb = 1, Mar-May = 2, Jun-Aug = 3, Sep-Nov = 4.
season_name = {1:'Winter', 2:'Spring', 3:"Summer",4:"Fall"}
release_season = ((sneakerData['Release Date'].dt.month % 12 + 3) // 3).map(season_name)
order_season = ((sneakerData['Order Date'].dt.month % 12 + 3) // 3).map(season_name)

# The release season carries its year; the order season is the bare season name.
# NOTE(review): only release_season gets the year appended — confirm order_season
# should stay without it.
sneakerData['release_season'] = release_season + ' ' + release_year.astype(str)
sneakerData['order_season'] = order_season

# Keep the bare years as their own columns as well.
sneakerData['release_year'] = release_year
sneakerData['order_year'] = order_year



In [None]:
# Rename "Sale Price" to "Resell Price", because the resale price is what is being measured.
sneakerData = sneakerData.rename(columns={"Sale Price":"Resell Price"})

In [None]:
# Remove columns that are not helpful to visualise. DataFrame.drop returns a new frame,
# so the result must be assigned back; otherwise the columns are still present when the
# data is saved.
sneakerData = sneakerData.drop(columns=['Profit Ratio', 'color', 'Style', 'Turnover Days'])

In [None]:
# Save the prepared DataFrame for the visualisation and modelling steps.
sneakerData.to_csv('data/RegularShoes.csv')

In [None]:
# This is the data visualization piece of the process, used to spot possible features and find interesting correlations.

In [None]:
# Helper that separates numerical variables from categorical variables.
def grab_columns(RegularShoes, cat_th=10, car_th = 20):
    """Split the columns of a DataFrame into categorical, numerical and cardinal groups.

    Parameters
    ----------
    RegularShoes : DataFrame whose columns are classified.
    cat_th : non-object columns with fewer than this many distinct values are
        treated as categorical.
    car_th : object columns with more than this many distinct values are treated
        as cardinal (reported separately instead of as categorical).

    Returns
    -------
    (cat_col, num_col, cat_but_car) : three lists of column names. The categorical list
    holds the object columns first, followed by the low-cardinality numeric ones.
    """
    object_cats, num_but_cat, num_col, cat_but_car = [], [], [], []

    for name in RegularShoes.columns:
        distinct = RegularShoes[name].nunique()
        if RegularShoes[name].dtypes == "O":
            # Object column: cardinal when it has too many distinct values.
            (cat_but_car if distinct > car_th else object_cats).append(name)
        elif distinct < cat_th:
            num_but_cat.append(name)
        else:
            num_col.append(name)

    cat_col = object_cats + num_but_cat

    print("----- Categorical Columns -----")
    print(cat_col)
    print("----- Numerical Columns -----")
    print(num_col)
    print("----- Cardinal Columns -----")
    print(cat_but_car)
    return cat_col,num_col,cat_but_car

In [None]:
# Load the prepared visualisation dataset. `RegularShoes` was never defined in this
# notebook (only the CSV of that name was written), so on a fresh kernel every cell in
# this section failed with a NameError.
RegularShoes = pd.read_csv('data/RegularShoes.csv', index_col=0)

# Display the numerical, categorical and cardinal variables.
cat_col, num_col, cat_but_car = grab_columns(RegularShoes)

In [None]:
# Numerical analysis helper (adapted from Stack Overflow, with adjusted variables).

def numerical_analysis(RegularShoes,col,plot=False):
    """Print summary statistics for one numerical column and optionally plot it.

    Parameters
    ----------
    RegularShoes : DataFrame containing the column.
    col : name of the column to analyse.
    plot : when True, also draw a histogram with a KDE overlay.
    """
    quan = [0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.99]
    values = RegularShoes[col]

    print("Analysis For {}".format(col))
    print(values.describe(quan))

    if plot:
        plt.figure(figsize=(10, 10))
        sns.histplot(values, kde=True)
        plt.title(col)
        plt.xticks(rotation=45)
        plt.show(block=True)

    print("--------------------------------")

In [None]:
# Run the numerical analysis (summary statistics plus histogram) for every numerical column.
for col in num_col:
    numerical_analysis(RegularShoes,col,plot=True)

In [None]:
# Categorical analysis helper (adapted from Stack Overflow, with adjusted variables).
def categorical_analysis(RegularShoes,col,plot=False):
    """Print value counts and percentage share for one categorical column; optionally plot.

    Parameters
    ----------
    RegularShoes : DataFrame containing the column.
    col : name of the column to analyse.
    plot : when True, also draw a count plot of the column.
    """
    counts = RegularShoes[col].value_counts()
    share = 100 * counts / len(RegularShoes)

    print("Analysis For {}".format(col))
    print(pd.DataFrame({col: counts, "Ratio": share}))

    if plot:
        plt.figure(figsize=(10, 10))
        sns.countplot(x=RegularShoes[col], data=RegularShoes)
        plt.title(col)
        plt.xticks(rotation=45)
        plt.show(block=True)

    print("-----------------------------------")

In [None]:
# Run the categorical analysis (counts, ratios and count plot) for every categorical column.
for col in cat_col:
    categorical_analysis(RegularShoes,col,plot=True)

In [None]:
# To help with feature finding, look at a correlation matrix to see which possible
# features correlate with each other. Only numeric columns are used: calling .corr() on
# a frame that still contains text columns raises in recent pandas versions.
correlations = RegularShoes.select_dtypes(include=[np.number]).corr()
sns.heatmap(correlations)

In [None]:
# Save the DataFrame back to data/RegularShoes.csv (overwrites the existing file).
RegularShoes.to_csv('data/RegularShoes.csv')

In [None]:
# Now that I have found some interesting features, I want to
# do one more round of data cleaning to prepare the data to go through the model.

In [None]:
# Load the prepared dataset; the first CSV column holds the saved index.
sneakerData = pd.read_csv('data/RegularShoes.csv', index_col = 0)

In [None]:
# Group the data by sneaker name and shoe size so each row the model sees is one
# (sneaker, size) combination: minimum retail price, mean resale price, and the first
# observed value for the dates, season/year features and colour flags.
# The price column is 'Resell Price' here (it was renamed from 'Sale Price' before
# RegularShoes.csv was saved), and the season/year columns are carried along because the
# modelling cells drop and one-hot encode them.
GroupedShoes = sneakerData.groupby(['Sneaker Name', 'Shoe Size'], as_index=False).agg({
    'Retail Price': 'min',
    'Resell Price': 'mean',
    'Release Date': 'first',
    'Order Date': 'first',
    'release_season': 'first',
    'order_season': 'first',
    'release_year': 'first',
    'order_year': 'first',
    'Red': 'first',
    'Black': 'first',
    'White': 'first',
    'Green': 'first',
    'Neo': 'first',
    'Orange': 'first',
    'Tan/Brown': 'first',
    'Pink': 'first',
    'Blue': 'first',
    'Colorful': 'first',
})

In [None]:
# Save the grouped data for the modelling step.

GroupedShoes.to_csv('data/GroupedShoes.csv')

In [None]:
# Now on to modeling: first run the data through a series of simple models as a test, then create
# a pipeline in order to choose the best model for the data, and save that model at the end.

In [None]:
# Load the grouped data; the first CSV column holds the saved index.
GroupedShoes = pd.read_csv('data/GroupedShoes.csv', index_col = 0)

In [None]:
# Choose X and y. Resell Price is the target because it is what the model should predict;
# the raw dates and bare years are dropped from the features.
X = GroupedShoes.drop(columns=['Resell Price', 'release_year', 'order_year', 'Release Date', 'Order Date'])
y = GroupedShoes['Resell Price']

# Fix the seed so the 70/30 split (and every score computed from it) is reproducible;
# 27 matches the random_state used for the models in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27)

In [None]:
# Convert the categorical columns to numerical ones with one-hot encoding.
object_cols = ['Sneaker Name','release_season','order_season']

# Fit the encoder on the training split only; categories that appear only in the test
# split are encoded as all zeros (handle_unknown='ignore').
# The encoder is left at its default sparse output and densified with .toarray(), which
# works across scikit-learn versions (the `sparse=` argument was renamed and later removed).
OH_encoder = OneHotEncoder(handle_unknown='ignore')
encoded_train = OH_encoder.fit_transform(X_train[object_cols]).toarray()
encoded_test = OH_encoder.transform(X_test[object_cols]).toarray()

# get_feature_names was replaced by get_feature_names_out in newer scikit-learn;
# use whichever this installation provides.
_feature_names = getattr(OH_encoder, 'get_feature_names_out', None) or OH_encoder.get_feature_names
OH_columns = _feature_names(object_cols)

# Rebuild DataFrames with the original row index and readable column names.
OH_cols_train = pd.DataFrame(encoded_train, index=X_train.index, columns=OH_columns)
OH_cols_test = pd.DataFrame(encoded_test, index=X_test.index, columns=OH_columns)

# Remove the categorical columns (they are replaced by their one-hot encoding).
num_X_train = X_train.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

# Add the one-hot encoded columns to the numerical features.
X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

In [None]:
# MODEL TESTING
# 1. Linear Regression
# 2. OLS Regression
# The first two models were only for simple testing, to see which features may impact the model the most and
# whether the data ran into any overfitting or other errors.


# Best-model pipeline. I chose these models to test because I felt they could give me the best results.
# 3. RandomForestRegressor
# 4. DecisionTreeRegressor
# 5. XGBoost

In [None]:
# Linear Regression

# Fit an ordinary linear regression on the training split
# (LinearRegression is imported in the notebook's import cell).
lm = LinearRegression().fit(X_train, y_train)

# Inspect the fitted intercept.
print(lm.intercept_)

In [None]:
# Store the test-set predictions and report error metrics. MAE, MSE and RMSE were chosen
# because they show how far off the predicted resell prices are, which matters since
# resell prices are volatile in a real situation.
predictions = lm.predict(X_test)

mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)

print("MAE:", mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
# OLS Regression
def build_model(X,y):
    """Fit a statsmodels OLS of y on X (with an added constant) and print its summary.

    Returns the design matrix that was fitted, i.e. X with the constant column added.
    """
    design = sm.add_constant(X)
    fitted = sm.OLS(y, design).fit()
    print(fitted.summary())
    return design
    
def checkVIF(X):
    """Return a DataFrame of variance inflation factors for the columns of X.

    The result has one row per column of X ('Features', 'VIF'), with the VIF rounded to
    two decimals and rows sorted from highest to lowest VIF.
    """
    vif_values = [variance_inflation_factor(X.values, position)
                  for position in range(X.shape[1])]
    vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
    vif['VIF'] = vif['VIF'].round(2)
    return vif.sort_values(by="VIF", ascending=False)

In [None]:
# Run the OLS model on the training split. It was previously fitted on the test split
# while being stored as X_train_new; a diagnostic model used to judge features belongs
# on the training data.
X_train_new = build_model(X_train, y_train)

In [None]:
# SETTING UP MODEL PIPELINE
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Setting up pipelines: one single-step pipeline per candidate regressor, all with the
# same random_state so the comparison is repeatable.

from sklearn.pipeline import Pipeline

# Decision Tree Regression pipeline
pipeline_dtr = Pipeline([('dtr', DecisionTreeRegressor(random_state=27))])

# Random Forest pipeline
pipeline_randomforest = Pipeline([('rf_regressor', RandomForestRegressor(random_state=27))])

# XGBoost pipeline. "reg:squarederror" is the current name of the squared-error objective;
# the old alias "reg:linear" is deprecated and rejected by newer XGBoost releases.
pipeline_xgb = Pipeline([('xgb_regressor', xgb.XGBRegressor(objective="reg:squarederror", random_state=27))])

In [None]:
# List of the pipelines to loop through, and their display names by position.
pipelines = [pipeline_dtr, pipeline_xgb, pipeline_randomforest]
pipe_dict = {0: 'DTR', 1: 'XGBoost', 2: 'RandomForest'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Score every model once on the test split. For regressors .score() is R^2, which can be
# negative, so the best model is found with max() rather than by comparing against 0.0
# (that reported 'DTR' whenever no model scored above zero).
test_scores = [pipe.score(X_test, y_test) for pipe in pipelines]
for i, score in enumerate(test_scores):
    print("{} Test Accuracy: {}".format(pipe_dict[i], score))

# Finding the best model (the first one wins on ties)
best_regressor = max(range(len(pipelines)), key=test_scores.__getitem__)
best_accuracy = test_scores[best_regressor]
best_pipeline = pipelines[best_regressor]
print('Model with best accuracy: {}'.format(pipe_dict[best_regressor]))

In [None]:
# Hyperparameters
# Random Forest Regression
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

In [None]:
# Search space for RandomizedSearchCV over the random forest hyperparameters.

# Number of trees in the random forest
n_estimators = [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
# Number of features to consider at every split. 1.0 means "all features", which is what
# 'auto' meant for a regressor; the string 'auto' is no longer accepted by newer
# scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree (None = grow until the other limits stop it)
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 25, 50, 75, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

pprint(random_grid)

In [None]:

# Base random forest, and a randomized search that samples 10 parameter combinations
# from random_grid, each evaluated with 3-fold cross-validation on all available cores.
rf = RandomForestRegressor(random_state=27)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    verbose=2,
    random_state=27,
    n_jobs=-1,
)

# Run the search on the training split.
rf_random.fit(X_train, y_train)

In [None]:
# Evaluation of Random Search
def evaluate(model, X_test, y_test):
    """Print and return the root-mean-squared error of `model` on the given data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X_test : features passed to ``model.predict``.
    y_test : true target values, same length as the predictions.

    Returns
    -------
    The RMSE (square root of the mean squared difference between y_test and the
    predictions).
    """
    predictions = model.predict(X_test)
    # Computed with numpy so the function does not depend on an import made in a later
    # cell. The value is the square root of the MSE, so it is labelled RMSE.
    residuals = np.asarray(y_test) - np.asarray(predictions)
    errors = np.sqrt(np.mean(residuals ** 2))
    print('Model Performance')
    print('RMSE of: ', errors)

    return errors

In [None]:
# Compare the untuned random forest with the best model found by the randomized search.
from sklearn.metrics import mean_squared_error  # kept in scope for the evaluate() helper

base_model = rf
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

# best_estimator_ has already been refitted on the full training split by the search
# (default refit=True), so it is evaluated directly instead of being trained again.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

print('\n')
print('Base Model Error: ', base_accuracy)
print('\n')
print('Improved Model Error: ', random_accuracy)
# Improvement = relative reduction in error, as a percentage. The previous formula
# printed a raw fraction with a "%" sign and was negative when the error went down.
print('Improvement of {:0.2f}%.'.format(100 * (base_accuracy - random_accuracy) / base_accuracy))

print('\n')
print('RF_Randomized_Search_CV is complete.')
print('\n')

In [None]:
# Print the best estimator found by the randomized search (with its parameters) and the
# search's best_score_ for it.
print('The best model is',rf_random.best_estimator_)
print( rf_random.best_score_)

In [None]:
# Cross-validate the tuned model and report the mean absolute error per fold.
from sklearn.model_selection import cross_val_score

# NOTE(review): the 5-fold cross-validation runs on the test split (X_test, y_test),
# not the training split — confirm this is intended.
negative_mae = cross_val_score(
    best_random,
    X_test,
    y_test,
    cv=5,
    scoring='neg_mean_absolute_error',
)
# sklearn reports *negative* MAE for this scorer, so flip the sign.
scores = -negative_mae

print("MAE scores:\n", scores)
print("Average MAE Resell Price (across experiments):\n")
print(scores.mean())

In [None]:
# Saving model to disk
import pickle

# Context managers make sure both file handles are closed (and the written data flushed)
# before the file is read back.
with open('data/Resellmodel.pkl', 'wb') as model_file:
    pickle.dump(best_random, model_file)

# Loading model to compare the results.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('data/Resellmodel.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

