# SNEAKER WEBSITE SCRAPE

PURPOSE: This program conducts analytics across various sneaker-oriented websites.

AUTHOR: dxjester

DATE UPDATED: 28-MAY-21

## PHASE 1: ENVIRONMENT SETUP

Import the necessary modules

In [1]:

import requests
import time as t
import pandas as pd
import datetime
import pprint
from datetime import date 

from bs4 import BeautifulSoup # to parse web page data
import glob # to read in files

# plotting modules
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as po

# supervised learning modules
import statistics as stats
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# external python files
import plot_functions as pf
import text_functions as tf
# from collections import Counter
# from string import punctuation

ModuleNotFoundError: No module named 'requests'

Start time to calculate program duration

In [None]:
program_start = t.time()

Create a class in order to extract and create website objects

In [None]:
# 1.1 Class Declaration ------------------------------------------------------#
class sneaker_site:
    '''
    DESCRIPTION: Purpose of this class is to store website data scraped from a
        sneaker website and count how often each brand keyword appears in it.
        The counts are collected in a dataframe (one row per keyword), which is
        later exported as its own individual CSV for follow-on analytics.
    '''

    # initialize the class
    def __init__(self, name, url, timeout = 30):
        '''
        DESCRIPTION: initialize class with default class arguments

        name: label of the website, stored in the 'website' column
        url: address that site_calculate() downloads
        timeout: seconds to wait for the web server before giving up
        '''
        self.website_name = name # set the name

        self.url = url # save the url
        self.timeout = timeout # passed to requests.get so a dead server cannot hang the run
        self.site_text = '' # value to save the site text for each object
        self.converted_site_text = '' # converting the extracted value to lower case, via the 'text_functions' file
        self.lines = ''

        # create a dataframe to store extracted values for each object
        self.site_df = pd.DataFrame(columns = ['website','dtg', 'date','year', 'month', 'day', 'shoe_company', 'brand', 'count'])
        self.site_df['website'] = self.website_name # assign the website name to the entire class dataframe

        # creating Beautiful Soup variables to store individual values
        self.soup = '' # variable to store the complete values
        self.hyperlink_list = '' # variable to store the hyperlinks tags
        self.paragraph_list = '' # variable to store paragraph value tags
        self.bold_list = '' # variable to store bold value tags

        # create the site variables to aggregate total counts for each object
        self.nike_site_count = 0
        self.adidas_site_count = 0
        self.reebok_site_count = 0
        self.new_balance_site_count = 0
        self.puma_site_count = 0
        self.vans_site_count = 0

        # NOTE(review): keywords are counted in the normalized text; if
        # tf.normalize_string lower-cases the text (as the comment on
        # converted_site_text suggests), capitalised keywords such as 'NB',
        # 'Puma' and 'Vans' can never match -- confirm against text_functions.

        # default Nike list with different Nike shoe companies
        self.nike_master = ['nike', 'jordan', 'converse']
        # ['Nike', 'Air', 'Max', 'Jordan', 'Zoom', 'React', 'Shox', 'ACG', 'Max Plus', 'Joyride', 'Tinker', 'Force', 'Westbrook', 'Kyrie','Lebron', 'Durant', 'SB', 'Air Max 90', 'Air Max 97', 'Air Max 1', 'Kyrie', 'Air Max 270', 'Travis Scott' ]

        # default Adidas list with different Adidas shoe companies
        self.adidas_master = ['adidas', 'reebok', 'adidas', 'kanye', 'yeezy']
        # ['Adidas', 'ADIDAS', 'adidas', 'Yeezy', 'Kanye', 'Ultraboost', 'EQT', 'NMD', 'Ultra Boost', 'FYW', 'Harden']

        # default New Balance list
        self.new_balance_master = ['NB', 'new balance']
        # ['New Balance', 'NB', 'Balance', '997', '801']

        # default Puma list
        self.puma_master = ['Puma', 'puma']
        #['Puma', 'Cell Venom', 'Thunder Spectre', 'Clyde Court']

        # default Vans list
        self.vans_master = ['Vans','vans']

        # concatenate the individual sneaker lists into one master list;
        # dict.fromkeys drops repeated keywords (keeping first-seen order) so a
        # keyword listed twice is not counted twice
        combined = self.nike_master + self.adidas_master + self.new_balance_master + self.puma_master + self.vans_master
        self.sneaker_list = list(dict.fromkeys(combined))
        self.length = len(self.sneaker_list)
        print("{} website object created".format(self.website_name))

    def _company_lookup(self):
        '''
        DESCRIPTION: map each keyword to (shoe company label, name of the counter attribute)
        '''
        groups = [
            ('Nike', 'nike_site_count', self.nike_master),
            ('Adidas', 'adidas_site_count', self.adidas_master),
            ('New Balance', 'new_balance_site_count', self.new_balance_master),
            ('Puma', 'puma_site_count', self.puma_master),
            ('Vans', 'vans_site_count', self.vans_master),
        ]
        lookup = {}
        for label, counter_name, keywords in groups:
            for keyword in keywords:
                # setdefault keeps the first company that claims a keyword
                lookup.setdefault(keyword, (label, counter_name))
        return lookup

    # class function to calculate the counts of each sneaker value in the master 'sneaker_list' data structure
    def site_calculate(self):
        '''
        DESCRIPTION: download the website, count every keyword of 'sneaker_list'
            in its text and store one row per keyword in the object's dataframe.
            Company totals are reset first, so the method can be called again
            without adding on top of an earlier run.
        '''

        # to calculate the time needed to process the function from start to finish
        start_time = t.time()
        print("\nRetrieving {} text and data ...".format(self.website_name))

        # start every run from zero
        self.nike_site_count = 0
        self.adidas_site_count = 0
        self.reebok_site_count = 0
        self.new_balance_site_count = 0
        self.puma_site_count = 0
        self.vans_site_count = 0

        # establish connection to the website
        r = requests.get(self.url, timeout = self.timeout)
        self.soup = BeautifulSoup(r.content, "html.parser")

        # find and categorize all hyperlink (a), paragraph (p), and bold (b) html tags
        print("\nConsolidating all hyperlinks and paragraphs for", self.website_name)
        self.hyperlink_list = self.soup.find_all('a')
        self.paragraph_list = self.soup.find_all('p')
        self.bold_list = self.soup.find_all('b')

        # convert individual Soup categories to text
        self.site_text = self.soup.get_text()
        self.converted_site_text = tf.normalize_string(self.site_text)
        print("\nConverting ", self.website_name, " to text file ... ")

        # lower-cased text, one entry per line (previously one full copy of the
        # text was stored for every single character of the page)
        self.lines = self.site_text.lower().splitlines()
        print("\nCalculating individual counts: " )

        # one timestamp for the whole run so every row of a scrape carries the same date
        dtg = datetime.datetime.now()
        today = dtg.date()
        lookup = self._company_lookup()

        # iterate over each keyword and count the amount of times it is
        # depicted in the extraction
        for index_num, brand in enumerate(self.sneaker_list):
            count = self.converted_site_text.count(brand) # count text items
            shoe_company, counter_name = lookup.get(brand, ('', None))

            # aggregate the count under the owning shoe company
            if counter_name is not None:
                setattr(self, counter_name, getattr(self, counter_name) + count)

            # append each new row to the class dataframe
            self.site_df.loc[index_num] = [self.website_name, dtg, today, dtg.year, dtg.month, dtg.day, shoe_company, brand, count]
            print(brand + ': ', count)

        elapsed_time = t.time() - start_time
        print("\n{} data ingest completed, total elapsed time: {} seconds\n".format(self.website_name, round(elapsed_time,2)))

    def display_info(self):
        '''
        DESCRIPTION: display the total mentions per shoe company
        '''
        print("\nCalculating total counts by shoe company...")
        print("Total Nike mentions: ", self.nike_site_count)
        print("Total Adidas mentions: ", self.adidas_site_count)
        print("Total New Balance mentions: ", self.new_balance_site_count)
        print("Total Puma mentions: ", self.puma_site_count)
        print("Total Vans mentions: ", self.vans_site_count)
        # print(self.site_df)

    def return_df(self):
        '''
        DESCRIPTION: return class dataframe
        '''
        return self.site_df

    def display_soup(self):
        '''
        DESCRIPTION: display the complete parsed page (soup) for the object
        '''
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(self.soup)

    def display_links(self):
        '''
        DESCRIPTION: display hyperlinks for the object
        '''
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(self.hyperlink_list)

    def display_paragraphs(self):
        '''
        DESCRIPTION: display paragraphs for the object
        '''
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(self.paragraph_list)

    def display_bold(self):
        '''
        DESCRIPTION: display bold tags for the object
        '''
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(self.bold_list)

Create a cumulative sum detection function

In [None]:
def cusum(df, time_col, val_col, running_avg_count, confidence_interval):
    '''
    Purpose: A timeseries function aimed to support change point detection.
    It returns the time and value columns together with the running
    (cumulative) sum of the values.

    df: source dataframe; it is not modified
    time_col: name of the time column to keep
    val_col: name of the numeric column to accumulate
    running_avg_count: currently unused; kept so existing calls keep working
    confidence_interval: currently unused; kept so existing calls keep working

    Returns a new dataframe with the columns [time_col, val_col, 'cusum'].
    '''

    # copy so that adding a column never writes into a slice of the caller's frame
    mod_df = df[[time_col, val_col]].copy()
    # pandas names the cumulative sum 'cumsum'; a Series has no 'cusum' method
    mod_df['cusum'] = mod_df[val_col].cumsum()
    return mod_df

## PHASE 2: DATA ETL

### Sneakernews.com Exploration

In [None]:
## 2.1: SNEAKERNEWS.com ingest and analysis -----------------------------------#

start_time = t.time() # calculate elapsed time

sneaker_news = sneaker_site('sneakernews.com', 'https://sneakernews.com/')
sneaker_news.site_calculate()
sneaker_news.display_info()

elapsed_time = round(t.time() - start_time, 2)
print(" Total elapsed time in seconds: ", elapsed_time)

Display the extracted site information

In [None]:
sneaker_news.display_soup()

Display the corresponding sneakernews.com links

In [None]:
sneaker_news.display_links()

Display sneakernews.com paragraphs

In [None]:
sneaker_news.display_paragraphs()

In [None]:
sneaker_news.display_bold()

Create a sneakernews.com dataframe from the extracted Beautiful Soup information

In [None]:
# retrieve master sneakernews.com dataframe
sneaker_news_df = sneaker_news.return_df()
sneaker_news_df.head(10)

Plot total shoe company counts for sneakernews.com 

In [None]:
# website plotting
#pf.bar_chart(sneaker_news_df,'shoe_company', 'count', 'Sneakernews.com Count Summary')

# aggregate to one total per shoe company
sneaker_news_raw = sneaker_news_df[['shoe_company', 'count']]
sneaker_news_sum = sneaker_news_raw.groupby(['shoe_company']).sum().reset_index()

# plot the aggregated totals (one bar per company), as the solecollector.com
# and hypebeast.com sections do, rather than the raw per-keyword rows
fig = px.bar(sneaker_news_sum, x='shoe_company', y='count', color = 'shoe_company')
fig.show()

Display the percentage breakdown

In [None]:
# pie chart

fig = px.pie(sneaker_news_df, values='count', names='shoe_company')
fig.update_layout(
    title="sneakernews.com Shoe Company Mentions ",
    font=dict(
        family="Helvetica",
        size=12,
        color="#7f7f7f"
    )
)
fig.show()

## Solecollector.com Exploration

Create the solecollector.com object

In [None]:
## 2.2: SOLECOLLECTOR.com ingest and analysis ---------------------------------#

start_time = t.time() # calculate elapsed time

sole_collector = sneaker_site('Solecollector.com', 'https://solecollector.com/')
sole_collector.site_calculate()
sole_collector.display_info()

elapsed_time = round(t.time() - start_time, 2)
print(" Total elapsed time in seconds: ", elapsed_time)

Display the extracted site information

In [None]:
sole_collector.display_soup()

Display the solecollector.com links

In [None]:
sole_collector.display_links()

Display solecollector.com paragraphs

In [None]:
sole_collector.display_paragraphs()

In [None]:
sole_collector.display_bold()

Create a solecollector.com dataframe for the extracted object

In [None]:
# retrieve master solecollector.com dataframe
sole_collector_df = sole_collector.return_df()
sole_collector_df.head(10)

Create a solecollector.com bar graph of shoe company mentions

In [None]:
# website plotting
#pf.bar_chart(sole_collector_df,'shoe_company', 'count', 'Solecollector.com')

sole_collector_raw = sole_collector_df[['shoe_company', 'count']]
sole_collector_final = sole_collector_raw.groupby(['shoe_company']).sum().reset_index()

fig = px.bar(sole_collector_final, x='shoe_company', y='count', color = 'shoe_company')
fig.show()

Display the percentage breakdown

In [None]:
# pf.pie_chart(sole_collector_df,'shoe_company', 'count', 'Solecollector.com')

fig = px.pie(sole_collector_df, values='count', names='shoe_company')
fig.update_layout(
    title="solecollector.com Shoe Company Mentions ",
    font=dict(
        family="Helvetica",
        size=12,
        color="#7f7f7f"
    )
)
fig.show()

## Hypebeast.com Exploration

Create the hypebeast.com object

In [None]:
# 2.3: HYPEBEAST.com ingest and analysis -------------------------------------#

start_time = t.time() # calculate start time

hypebeast = sneaker_site('hypebeast.com', 'https://hypebeast.com/')
hypebeast.site_calculate()
hypebeast.display_info()

elapsed_time = round(t.time() - start_time, 2)
print(" Total elapsed time in seconds: ", elapsed_time)

Display all extracted raw hypebeast.com information

In [None]:
hypebeast.display_soup()

Display the hypebeast.com links

In [None]:
hypebeast.display_links()

Display hypebeast.com paragraphs

In [None]:
hypebeast.display_paragraphs()

In [None]:
hypebeast.display_bold()

Create the hypebeast.com dataframe from the raw data set

In [None]:
# retrieve master hypebeast.com dataframe
hypebeast_df = hypebeast.return_df()
hypebeast_df.head(10)

Plot the shoe company mentions for hypebeast.com

In [None]:
# website plotting
#pf.bar_chart(hypebeast_df,'shoe_company', 'count', 'Hypebeast.com')

hypebeast_raw = hypebeast_df[['shoe_company', 'count']]
hypebeast_final = hypebeast_raw.groupby(['shoe_company']).sum().reset_index()

fig = px.bar(hypebeast_final, x='shoe_company', y='count', color = 'shoe_company')
fig.show()

Display the percentage breakdown for hypebeast.com

In [None]:
#pf.pie_chart(hypebeast_df,'shoe_company', 'count', 'Hypebeast.com')


df = px.data.tips()
fig = px.pie(hypebeast_df, values='count', names='shoe_company')
fig.update_layout(
    title="Hypebeast.com Shoe Company Mentions ",
    font=dict(
        family="Helvetica",
        size=12,
        color="#7f7f7f"
    )
)
fig.show()
print("\n End of Phase 2 ...\n")

## PHASE 3: MACRO LEVEL ANALYSIS

The purpose of this phase is to conduct a top level analysis of all cumulative data for the day executed.  The program concatenates three (3) separate dataframes into one master dataframe, day_master.

In [None]:
sneaker_news_df

Concat the three individual class objects (sneaker_news_df, sole_collector_df, hypebeast_df) as a master dataframe, day_master

In [None]:
print("\n Starting Phase 3 ...\n")

# concat the three dataframes into a single, unified dataframe;
# ignore_index gives every row of the combined frame its own row label
# instead of repeating each site's 0..n-1 labels three times
frames = [sneaker_news_df, sole_collector_df, hypebeast_df]
day_master = pd.concat(frames, ignore_index=True)

# the site frames are filled row by row, so 'dtg' may be held as generic
# objects; pd.to_datetime guarantees the .dt accessor is available
day_master['short_date'] = pd.to_datetime(day_master['dtg']).dt.date

day_master

Plot the consolidated shoe company mention count for all three sites

In [None]:
#pf.bar_chart(day_master,'shoe_company', 'count', 'Consolidated Bar Chart Report')

day_raw = day_master[['shoe_company', 'count']]
day_df = day_raw.groupby(['shoe_company']).sum().reset_index()

fig = px.bar(day_df, x='shoe_company', y='count', color = 'shoe_company')
fig.show()

Plot the percentage breakdown for the day

In [None]:
#pf.pie_chart(day_master,'shoe_company', 'count', 'Consolidated Pie Report')

df = px.data.tips()
fig = px.pie(day_master, values='count', names='shoe_company')
fig.update_layout(
    title="Shoe Company Mentions (Cummulative) ",
    font=dict(
        family="Helvetica",
        size=12,
        color="#7f7f7f"
    )
)
fig.show()

Export the daily file to the root storage folder

In [None]:
import os

# export into the same relative 'df_exports/' folder that Phase 4 reads back
# with glob, instead of an absolute path that only exists on one machine
path = 'df_exports/'
os.makedirs(path, exist_ok=True) # create the folder on the first run

# format today's date as YYYY-MM-DD for the file name
temp_date = datetime.datetime.today()

file_date = temp_date.strftime('%Y-%m-%d')

In [None]:
# create the full file path
full_path = path + "v3_" +  file_date + ".csv"

# export the file to the /df_exports/ directory
day_master.to_csv(full_path)

print("\nFile successfully exported!")

Summarize daily counts

In [None]:
day_master

## PHASE 4: CSV IMPORT AND EXPLORATION

The purpose of this phase is to conduct historical level analysis of all cumulative data extracted since project inception.  The program invokes the 'glob' module in order to import all standalone csv files from previous daily extracts.

Due to continuous improvements in the master branch, historical data is formatted in three (3) separate versions. The glob function imports all stored files of these three versions, and the data is then conditioned so that unified analysis can be conducted on one master dataframe.

In [None]:
## 4.1: Determine all version 1.0 files located in the 'df_exports' directory--#
import glob # to read in multiple csv files


print("\nRetrieving version 1.0 csv files ...")

csv_list = [] # store values in the list


In [None]:
# import version 1.0 files
for csv_file_v1 in glob.glob('df_exports/v1_*.csv'): # only retrieve "v1_" csv files
    csv_list.append(csv_file_v1)
    print (csv_file_v1)
    
print("\nTotal amount of v1.0 files: {}".format(len(csv_list)))

In [None]:
# import version 2.0 files
files_before_v2 = len(csv_list) # csv_list already holds the files collected earlier
for csv_file_v2 in glob.glob('df_exports/v2_*.csv'): # only retrieve "v2_" csv files
    csv_list.append(csv_file_v2)
    print (csv_file_v2)

# report only the files added by this scan, not the running v1 + v2 total
print("\nTotal amount of v2.0 files: {}".format(len(csv_list) - files_before_v2))

Create a dataframe of version 1.0 and 2.0 files in order to change 'category_name' and 'item' column headers to 'shoe_company' and 'brand'

In [None]:
# 4.2: Read in each csv file into the master dataframe -----------------------#
# 4.2.1: read in the local files and aggregate as a single dataframe -#
legacy_columns = ['date', 'category_name', 'item', 'count']

# extract the four columns from each v1/v2 csv file, then concatenate once
# (growing a dataframe inside the loop re-copies all earlier rows every time)
legacy_frames = [pd.read_csv(csv_file)[legacy_columns] for csv_file in csv_list]
if legacy_frames:
    old_df = pd.concat(legacy_frames)
else:
    # no legacy files found: keep an empty frame with the expected columns
    old_df = pd.DataFrame(columns=legacy_columns)

old_df['count'] = old_df['count'].astype(int)
old_df['date'] = old_df['date'].astype('datetime64[ns]')
# align the legacy column names with the version 3.0 layout
old_df.rename(columns = {'category_name':'shoe_company', 'item':'brand'}, inplace = True)
old_df.head(25)

In [None]:
# import version 3.0 files

csv_list2 = [] # list to store csv version 3 files
for csv_file_v3 in glob.glob('df_exports/v3_*.csv'): # only retrieve "v3_" csv files
    csv_list2.append(csv_file_v3)
    print (csv_file_v3)

print("\nTotal amount of files: {}".format(len(csv_list2)))

In [None]:
# 4.2: Read in each csv file into the master dataframe -----------------------#
# 4.2.1: read in the local files and aggregate as a single dataframe -#
v3_columns = ['date', 'shoe_company', 'brand', 'count']

# extract the four columns from each v3 csv file, then concatenate once
# (growing a dataframe inside the loop re-copies all earlier rows every time)
v3_frames = [pd.read_csv(csv_file)[v3_columns] for csv_file in csv_list2]
if v3_frames:
    master_df = pd.concat(v3_frames)
else:
    # no v3 files found: keep an empty frame with the expected columns
    master_df = pd.DataFrame(columns=v3_columns)

master_df['count'] = master_df['count'].astype(int)
master_df['date'] = master_df['date'].astype('datetime64[ns]')
master_df.head(10)

In [None]:
# stack the legacy (v1/v2) rows on top of the v3 rows; pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0
final_df = pd.concat([old_df, master_df])
final_df.tail(10)

Display the histogram count

In [2]:
# group by sum the master_df dataframe for follow-on analysis
master_sum_df = final_df.groupby(['date','shoe_company', 'brand']).sum().reset_index()
summarized_df = master_sum_df[master_sum_df['count'] != 0]
summarized_df.head(5)

sns.pairplot(summarized_df)

NameError: name 'final_df' is not defined

In [None]:
master_sum_df

Drop the 'brand' column from the master_sum_df and name new df as 'category_df'

In [None]:
# 4.2.2: Unstack and pairplot the master dataframe for category_name df - #
category_df = master_sum_df[['date','shoe_company','count']]
category_df = category_df.groupby(['date','shoe_company']).sum().reset_index()
category_df.head(10)

Unstack the 'shoe_company' values as stand alone columns in order to conduct multivariate analysis

In [None]:
unstack_category_df = category_df.pivot_table(index = ['date'], 
                                   columns = 'shoe_company',
                                   values = 'count',
                                   aggfunc='first').reset_index().rename_axis(None, axis=1)

unstack_category_df.tail(10)

Pairplot the 'unstack_category_df' dataframe

In [None]:
sns.pairplot(unstack_category_df) # pairplot the category dataframe

Create a new dataframe with 'brand' and 'date' values

In [None]:
item_temp_df = master_df[['date','brand','count']]
item2_df = item_temp_df.groupby(['date', 'brand']).sum().reset_index()

Remove rows where the count is equal to 0

In [None]:
# remove rows where count is equal to '0'
item_df = item2_df[item2_df['count'] != 0]
item_df.head(20)

Unstack the 'item_df' dataframe in order to conduct follow-on multivariate regression analysis

In [None]:
unstack_item_df = item_df.pivot_table(index = ['date'],
                                      columns = 'brand',
                                      values = 'count',
                                      aggfunc='first').reset_index().rename_axis(None, axis=1)

unstack_item_df.tail(50)

Pairplot the 'unstack_item_df' dataframe

Create a count for each date

In [3]:
# 4.2.4: Date and Count dataframe
date_count_temp = final_df[['date','count']]
date_count_df = date_count_temp.groupby('date').sum().reset_index()
date_count_df.tail()

NameError: name 'final_df' is not defined

Create a timeseries plot of all counts by day

In [None]:
# master counts, company agnostic

fig = px.line(date_count_df, x='date', y='count')
fig.update_layout(
    title="Shoe Company Daily Count Summary",
    xaxis_title="Date",
    yaxis_title="Total Daily Counts",
    font=dict(
        family="Helvetica",
        size=12,
        color="#7f7f7f"
    )
)
fig.show()

Create timeseries analysis, categorized by shoe company

In [None]:
# master counts, by day and shoe company

# 4.2.4: Date and Count dataframe
date_shoe_temp = final_df[['date','shoe_company', 'count']]
date_shoe_df = date_shoe_temp.groupby(['date', 'shoe_company']).sum().reset_index()

fig = px.line(date_shoe_df, x='date', y='count', color='shoe_company')
fig.update_layout(
    title="Shoe Company Daily Count Summary",
    xaxis_title="Date",
    yaxis_title="Total Daily Counts",
    font=dict(
        family="Helvetica",
        size=12,
        color="#7f7f7f"
    )
)
fig.show()

Display the area chart

In [4]:
fig = px.area(date_shoe_df, x='date', y='count', color = 'shoe_company')
fig.show()

NameError: name 'px' is not defined

Create a sankey chart in order to depict feeder flow from shoe company to brand

In [None]:
sum_df = master_df.groupby(['date', 'shoe_company', 'brand']).sum().reset_index()

fig = pf.genSankey(sum_df,cat_cols=['shoe_company', 'brand'],value_cols='count',title='Sneaker Sankey Analysis')
po.offline.plot(fig, validate=False)

### Multi-variate Regression Analysis

Slice dataframe in order to begin multivariate regression.

Set the shoe company column values as the predictor variables, with 'total_counts' serving as the response variable.

In [None]:
unstack_df = unstack_category_df.copy()
unstack_df.fillna(0, inplace = True)
unstack_df.head(10)

Create a new column of total counts, categorized by day

In [None]:
unstack_df['total_counts'] = unstack_df['Adidas']  + unstack_df['New Balance'] + unstack_df['Nike'] + unstack_df['Puma'] + unstack_df['Vans']

unstack_df.head(5)

Create a dataframe with the predictor variables

In [None]:
X = unstack_df[['Adidas', 'New Balance','Nike','Puma','Vans']]
X

Create the response variable values

In [5]:
y = unstack_df[['total_counts']]
y

NameError: name 'unstack_df' is not defined

### Build the linear regression model

Create the model object

In [None]:
regr = linear_model.LinearRegression()
regr.fit(X, y)

In [None]:
print('Intercept: \n', regr.intercept_)

In [None]:
print('Coefficients: \n', regr.coef_)

Fit the model and predict values for each of the given records

In [None]:
# with statsmodels
X = sm.add_constant(X) # adding a constant
 
model = sm.OLS(y, X).fit()
predictions = model.predict(X) 

Print the model summary

In [None]:
print_model = model.summary()
print(print_model)

Build a three-dimensional plot of Nike vs. Adidas values

Slice out Nike and Adidas counts

In [None]:
tri_dim = unstack_df[['Adidas','Nike','total_counts']]
tri_dim

Find the 95% confidence interval for total_counts column

In [6]:
import numpy as np, scipy.stats as st

count_list = list(tri_dim['total_counts'])
count_array = np.array(count_list)

lower, higher = st.t.interval(0.95, len(count_array)-1, loc=np.mean(count_array), scale=st.sem(count_array))


ModuleNotFoundError: No module named 'numpy'

In [None]:
lower

In [None]:
higher

In [None]:
count_mask = (tri_dim['total_counts'] > lower) & (tri_dim['total_counts'] < higher)
tri_mask = tri_dim.loc[count_mask]

3D plot Nike vs. Adidas

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

# add_subplot(..., projection='3d') creates the 3D axes; passing 'projection'
# to gca() is no longer accepted by current Matplotlib releases
threedee = plt.figure(figsize = (12,10)).add_subplot(111, projection='3d')
# a colormap only takes effect when 'c' supplies the values to colour by
threedee.scatter(tri_mask['Adidas'], tri_mask['Nike'], tri_mask['total_counts'],
                 c=tri_mask['total_counts'], cmap=cm.rainbow)
plt.title('Adidas vs Nike counts')
threedee.set_xlabel('Adidas')
threedee.set_ylabel('Nike')
threedee.set_zlabel('Total Counts')
plt.show()



## Change Point Detection

Retrieve data from the unstacked dataframe, previously aggregated

In [None]:
unstack_df.head(5)

Slice on the 'date' and 'Nike' columns

In [None]:
# take an explicit copy: later cells add columns to this frame, and writing
# into a plain slice of unstack_df triggers pandas' SettingWithCopyWarning
cusum_nike_raw = unstack_df[['date', 'Nike']].copy()
cusum_nike_raw.head(5)

Invoke the cumulative sum for the Nike column

In [None]:
cusum_nike_raw['cusum'] = cusum_nike_raw['Nike'].cumsum()

Display the Nike data

In [None]:
cusum_nike_raw

Calculate running 3 day average

In [None]:
# 3-day simple moving average of the 'Nike' counts, rounded to one decimal;
# the first two rows have no complete 3-day window and stay NaN
cusum_nike_raw['Nike SMA_3'] = cusum_nike_raw['Nike'].rolling(3).mean().round(1)

Display the top 15 values

In [7]:
cusum_nike_raw.head(15)

NameError: name 'cusum_nike_raw' is not defined

Calculate rolling standard deviation

In [None]:
cusum_nike_raw['sigma_3day'] = cusum_nike_raw['Nike'].rolling(3).std()
cusum_nike_raw['sigma_7day'] = cusum_nike_raw['Nike'].rolling(7).std()
cusum_nike_raw.head(15)

Convert the 'Nike' count column to a list

In [None]:
nike_count_list = list(cusum_nike_raw['Nike'])
nike_count_list[:5]

Display the five number summary

In [None]:
print("Minimum value: ", np.min(nike_count_list))
print("Maximum value: ", np.max(nike_count_list))
print("Standard Deviation: ", np.std(nike_count_list))
print("Mean: ", np.mean(nike_count_list))
print("Median: ", np.median(nike_count_list))

Boxplot the findings

In [None]:
plt.boxplot(nike_count_list)
plt.show()

Create five number summary of the running 3 day moving average column

In [None]:
import math
ma_count_list = list(cusum_nike_raw['Nike SMA_3'])
cleaned_ma3_list = [0.0 if math.isnan(x) else x for x in ma_count_list]
cleaned_ma3_list[:5]

In [None]:
print("Minimum value: ", np.min(cleaned_ma3_list))
print("Maximum value: ", np.max(cleaned_ma3_list))
print("Standard Deviation: ", np.std(cleaned_ma3_list))
print("Mean: ", np.mean(cleaned_ma3_list))
print("Median: ", np.median(cleaned_ma3_list))

In [None]:
plt.boxplot(cleaned_ma3_list)
plt.show()

Create five number summary of the running 7 day standard deviation column

In [None]:
ma_count_list = list(cusum_nike_raw['sigma_7day'])
cleaned_ma7_list = [0.0 if math.isnan(x) else x for x in ma_count_list]
cleaned_ma7_list[:5]

In [None]:
print("Minimum value: ", np.min(cleaned_ma7_list))
print("Maximum value: ", np.max(cleaned_ma7_list))
print("Standard Deviation: ", np.std(cleaned_ma7_list))
print("Mean: ", np.mean(cleaned_ma7_list))
print("Median: ", np.median(cleaned_ma7_list))

In [None]:
plt.boxplot(cleaned_ma7_list)
plt.show()

## AR Model Forecasting

Conduct a timeseries forecasting analysis using the ARIMA model, using the Nike dataframe and counts as a base testing

In [8]:
from statsmodels.tsa.arima_model import ARIMA
from random import random

ModuleNotFoundError: No module named 'statsmodels'

Slice a new dataframe with the Nike values

In [None]:
nike_forecast = cusum_nike_raw[['date','Nike']]
nike_forecast.head(5)

Verify the datatypes of the new dataframe

In [None]:
nike_forecast.dtypes

In [None]:
from random import randrange
series = [i+randrange(10) for i in range(1,100)]
series[:10]

Slice out the date and Nike count columns for forecasting

In [None]:
nike_forecast = cusum_nike_raw[['date','Nike']]
nike_forecast

Set the 'date' column as the index for the dataframe

In [None]:
nike_forecast.set_index('date', inplace=True)

Plot the sliced dataframe

In [None]:
nike_forecast.plot()

Plot the auto correlation

In [None]:
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(nike_forecast)

Calculate the differences between individual dates

In [None]:
nike_diff = nike_forecast.diff(periods = 1)
nike_diff.reset_index()

In [None]:
nike_diff.dtypes

In [None]:
nike_diff.plot()

Plot the autocorrelation for the nike difference dataframe

In [9]:
plot_acf(nike_diff)

NameError: name 'plot_acf' is not defined

Convert Nike values to a numpy array for calculation

In [None]:
# NOTE(review): newer statsmodels releases appear to replace the AR class with
# AutoReg -- confirm against the installed statsmodels version
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error

# the double brackets select a one-column dataframe, so X is a 2-D array with
# one column; this rebinds X, which earlier held the regression predictors
X = nike_forecast[['Nike']].to_numpy()
X[:5]

Create the train set

In [None]:
train = X[:80]
train.size

Create the test set

In [None]:
# start at row 80 so the test set begins exactly where the training slice
# X[:80] ends; X[81:] silently dropped row 80 from both sets
test = X[80:]
test.size

Create a predictions list to store the prediction values

In [None]:
predictions = []

Build the AR model

In [None]:
model_ar = AR(train)
model_ar_fit = model_ar.fit()

Predict the values from index 80 to 108

In [None]:
predictions = model_ar_fit.predict(start = 80, end= 108)

In [None]:
plt.figure(figsize = (10,6))
# pass the on/off flag positionally: the 'b' keyword was renamed to 'visible'
# in newer Matplotlib releases, the positional form works in both
plt.grid(True, which='major', color='#666666', linestyle='-')
plt.plot(test)
plt.plot(predictions, color ='red')

In [None]:
plt.figure(figsize = (10,6))

# pass the on/off flag positionally: the 'b' keyword was renamed to 'visible'
# in newer Matplotlib releases, the positional form works in both
plt.grid(True, which='major', color='#666666', linestyle='-')
plt.plot(nike_forecast)

## ARIMA Model Forecast

In [None]:
from statsmodels.tsa.arima_model import ARIMA

Invoke the ARIMA model function in order to build the forecast model

In [None]:
# parameters: p , d, q
# p = periods taken for autoregessive model
# d = order of integrated, number of times differences is executed
# q = periods in moving average model

model_arima = ARIMA(train, order = (10,0,1) )
model_arima_fit = model_arima.fit()
print(model_arima_fit.aic)

Forecast the next 25 values with the fitted model

In [None]:
arima_predictions = model_arima_fit.forecast(steps = 25)[0] # predict 25 values
arima_predictions

Plot the forecasted data against the test data

In [None]:
plt.figure(figsize = (12,8))
# pass the on/off flag positionally: the 'b' keyword was renamed to 'visible'
# in newer Matplotlib releases, the positional form works in both
plt.grid(True, which='major', color='#666666', linestyle='-')
plt.title("Arima Forecasting Analysis")
plt.plot(test)
plt.plot(arima_predictions, color ='red')

In [None]:
print(model_arima_fit.summary())

### AIC Value Calculation

Calculate the AIC value in order to identify the optimized pdq set for the optimized model.  Lowest value is taken and inputted back in the ARIMA model

In [10]:
import itertools
p=d=q=range(0,5)
pdq = list(itertools.product(p,d,q))
pdq[:10]

[(0, 0, 0),
 (0, 0, 1),
 (0, 0, 2),
 (0, 0, 3),
 (0, 0, 4),
 (0, 1, 0),
 (0, 1, 1),
 (0, 1, 2),
 (0, 1, 3),
 (0, 1, 4)]

In [11]:
# fit every (p, d, q) combination, print its AIC and remember the best one
best_aic = None
best_order = None
best_model = None
best_fit = None

for parameter in pdq:
    try:
        candidate = ARIMA(train, order = parameter )
        candidate_fit = candidate.fit()
        candidate_aic = candidate_fit.aic
    except Exception:
        # many combinations are invalid or fail to converge; skip those, but
        # (unlike a bare 'except') still let Ctrl-C interrupt the search
        continue

    print(parameter, candidate_aic)

    # 'candidate_aic == candidate_aic' is False only for NaN, which cannot be ranked
    if candidate_aic == candidate_aic and (best_aic is None or candidate_aic < best_aic):
        best_aic = candidate_aic
        best_order = parameter
        best_model = candidate
        best_fit = candidate_fit

# hand the lowest-AIC model to the following cells; previously they received
# whichever combination happened to be fitted last
if best_fit is not None:
    model_arima = best_model
    model_arima_fit = best_fit
    print("Lowest AIC: {} for order {}".format(best_aic, best_order))

## Forecast

In [12]:
nike_count = cusum_nike_raw[['Nike']]
nike_count

NameError: name 'cusum_nike_raw' is not defined

Create the training and test sets

In [None]:
train_length = int(len(nike_count.Nike)*.8)
test_length = len(nike_count) - train_length

print("Train length {}".format(train_length))
print("Test length {}".format(test_length))

In [None]:
train = nike_count.Nike[:train_length]
test = nike_count.Nike[train_length:]

In [None]:
test_length = len(test)
test_length

Forecast the values

In [None]:
fc, se, conf = model_arima_fit.forecast(test_length, alpha=0.05)  # 95% conf
fc

Make a pandas series

In [None]:
fc_series = pd.Series(fc, index=test.index)
lower_series = pd.Series(conf[:, 0], index=test.index)
upper_series = pd.Series(conf[:, 1], index=test.index)

Plot the final findings 

In [None]:
plt.figure(figsize=(12,5), dpi=100)
plt.plot(train, label='training')
plt.plot(test, label='actual')
plt.plot(fc_series, label='forecast')
# pass the on/off flag positionally: the 'b' keyword was renamed to 'visible'
# in newer Matplotlib releases, the positional form works in both
plt.grid(True, which='major', color='#666666', linestyle='-')
# shade the band between the lower and upper confidence series
plt.fill_between(lower_series.index, lower_series, upper_series,
                 color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend(loc='upper left', fontsize=8)
plt.show()

# Program Completed

In [None]:
program_end = t.time() - program_start
print("Total time for program execution: ", round(program_end, 2), " seconds")