In [5]:
import pandas as pd
import numpy as np
import matplotlib

In [6]:
# Load the raw sales records from the "Raw Data" sheet of the contest workbook
workbook_path = "StockX-Data-Contest-2019-3.xlsx"
data = pd.read_excel(workbook_path, sheet_name="Raw Data")

In [7]:
# Peek at the last five rows of the loaded sheet
data.tail(n=5)

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
99951,2019-02-13,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,565.0,220,2018-12-26,8.0,Oregon
99952,2019-02-13,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,598.0,220,2018-12-26,8.5,California
99953,2019-02-13,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,605.0,220,2018-12-26,5.5,New York
99954,2019-02-13,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,650.0,220,2018-12-26,11.0,California
99955,2019-02-13,Yeezy,adidas-Yeezy-Boost-350-V2-Static-Reflective,640.0,220,2018-12-26,11.5,Texas


In [8]:
#Basic Data Cleaning
# Parse both date columns so the .dt accessor and date arithmetic work on them
for date_col in ("Order Date", "Release Date"):
    data[date_col] = pd.to_datetime(data[date_col])

order_date = data["Order Date"]
release_date = data["Release Date"]

# Calendar month of release and of the order
data["Release Month"] = release_date.dt.month
data["Order Month"] = order_date.dt.month

# Days elapsed between release and order, as a float number of days
data["Sold After"] = (order_date - release_date) / np.timedelta64(1, 'D')

# Mark-up over retail, expressed as a percentage of the retail price
retail_price = data["Retail Price"]
data["Profit Percent"] = ((data["Sale Price"] - retail_price) / retail_price) * 100

# The raw dates and the brand are not used as model inputs from here on
data.drop(columns=["Order Date", "Brand", "Release Date"], inplace=True)

# Snapshot of the column names after cleaning
colname = data.columns.tolist()

In [9]:
#Extracting Features From Shoe Name
# Plain Python list of every sneaker name, one entry per row
shoelist = data["Sneaker Name"].tolist()
features=[]
def featurelist(x):
    """Collect the distinct lower-cased, hyphen-separated tokens of the given names.

    Tokens are appended to the module-level ``features`` list in order of
    first appearance; tokens already present in ``features`` are skipped.

    Parameters
    ----------
    x : iterable of str
        Names whose parts are separated by "-".

    Returns
    -------
    None
        The result is accumulated in ``features``.
    """
    # Set mirror of ``features`` so each membership test is O(1) instead of
    # a scan over the growing list.
    seen = set(features)
    for name in x:
        for part in name.split("-"):
            token = part.lower()
            if token not in seen:
                seen.add(token)
                features.append(token)
featurelist(shoelist)

#Adding the features to the dataset
# Tokenise every sneaker name exactly once (lower-cased parts split on "-");
# a set per row makes each "is this token in the name?" check O(1).
name_tokens = [set(name.lower().split("-")) for name in data["Sneaker Name"]]

# One 0/1 indicator column per token, in the same order as `features`.
name_indicators = pd.DataFrame(
    {token: [int(token in tokens) for tokens in name_tokens] for token in features},
    index=data.index,
)

# Original columns followed by the indicator columns; the sneaker name is
# looked up by label rather than by position, and dtypes/index are kept.
df = pd.concat([data, name_indicators], axis=1)
colname = list(df.columns)
data = df.drop(columns=["Sneaker Name"])

In [10]:
#Creating Dummy variables for the state
dummy_state = pd.get_dummies(data["Buyer Region"])

# Replace the categorical region column with its one-hot columns,
# then discard any row that still contains a missing value
data = pd.concat(
    [data.drop(columns=["Buyer Region"]), dummy_state],
    axis=1,
).dropna()
#Data is cleaned and ready for Modeling

In [11]:
#Getting X(independent) and y(Target) variables
# Both targets are removed from the inputs so neither can leak into the features
target_columns = ["Sale Price", "Profit Percent"]
x = data.drop(columns=target_columns)
ysp = data["Sale Price"] #Predicting Sale Price
yperc = data["Profit Percent"] #Predicting Profit Percentage

In [12]:
#Scaling the data
from sklearn.preprocessing import MinMaxScaler

# feature_range is documented as a (min, max) tuple; pass it by keyword as a
# tuple rather than a positional list so parameter validation accepts it.
scaler = MinMaxScaler(feature_range=(-1, 1))

# NOTE(review): the scaler is fitted on every row before the train/test split
# below, so test-set minima/maxima influence the scaling - consider fitting on
# the training rows only.
X = scaler.fit_transform(x)

In [13]:
#We will compare two regression models:
#1. Linear Regression
#2. Random Forest

In [14]:
#Let's start with trying to predict actual resale value
#splitting the data in training and testing
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
sale_price_split = train_test_split(X, ysp, random_state=42, test_size=0.33)
X_train, X_test, y_train, y_test = sale_price_split
X_train.shape

(66970, 134)

In [15]:
#Linear Regression

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

# Fit on the training split, then score on the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# The mean squared error
print('Mean squared error: %.2f' % test_mse)
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % test_r2)

Mean squared error: 11469.03
Coefficient of determination: 0.82
Mean squared error: 11620.34
Coefficient of determination: 0.82


In [22]:
#Random Forest

from sklearn.ensemble import RandomForestRegressor

# Depth-limited forest; the fixed seed makes the fitted trees repeatable
forest_params = {"max_depth": 9, "random_state": 0}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

# The mean squared error
print('Mean squared error: %.2f' % test_mse)
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % test_r2)

Mean squared error: 1711.06
Coefficient of determination: 0.92
Mean squared error: 1847.57
Coefficient of determination: 0.92


In [17]:
pd.set_option('display.precision',10)

# feature_name -> feature_importance, taken from whichever `model` was fitted last
# NOTE(review): the 'Gini-importance' label is kept as-is; for a regressor the
# importances are presumably impurity-based rather than Gini - confirm naming.
feats = dict(zip(x.columns, model.feature_importances_))

# One row per feature, ordered from least to most important
importances = (
    pd.DataFrame.from_dict(feats, orient='index')
    .rename(columns={0: 'Gini-importance'})
    .sort_values(by='Gini-importance')
)
importances

Unnamed: 0,Gini-importance
2pt0,0.0000000000
1,0.0000000000
Montana,0.0000000000
Mississippi,0.0000000000
orange,0.0000000000
...,...
Release Month,0.0604142551
core,0.0629176561
Sold After,0.0884430255
chicago,0.1341343542


In [18]:
#Predicting the profit percent
#splitting the data in training and testing
from sklearn.model_selection import train_test_split

# Same 33% hold-out and seed as before, now with profit percent as the target
profit_split = train_test_split(X, yperc, random_state=42, test_size=0.33)
X_train, X_test, y_train, y_test = profit_split

In [19]:
#Linear Regression

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

# Fit on the training split, then score on the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

profit_mse = mean_squared_error(y_test, y_pred)
profit_r2 = r2_score(y_test, y_pred)

# The mean squared error
print('Mean squared error: %.2f' % profit_mse)
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % profit_r2)

Mean squared error: 3381.11
Coefficient of determination: 0.85


In [20]:
#Random Forest

from sklearn.ensemble import RandomForestRegressor

# Depth-limited forest; the fixed seed makes the fitted trees repeatable
forest_params = {"max_depth": 9, "random_state": 0}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

profit_mse = mean_squared_error(y_test, y_pred)
profit_r2 = r2_score(y_test, y_pred)

# The mean squared error
print('Mean squared error: %.2f' % profit_mse)
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % profit_r2)

Mean squared error: 1847.57
Coefficient of determination: 0.92


In [21]:
pd.set_option('display.precision',10)

# feature_name -> feature_importance, taken from whichever `model` was fitted last
feats = dict(zip(x.columns, model.feature_importances_))

# One row per feature, ordered from least to most important
importances = (
    pd.DataFrame.from_dict(feats, orient='index')
    .rename(columns={0: 'Gini-importance'})
    .sort_values(by='Gini-importance')
)
importances

Unnamed: 0,Gini-importance
Wyoming,0.0000000000
high,0.0000000000
Montana,0.0000000000
abloh,0.0000000000
Mississippi,0.0000000000
...,...
black,0.0418341770
Sold After,0.0735479923
Release Month,0.0752501312
chicago,0.1022616196
