In [9]:
## setup
import numpy as np
import pandas as pd
import plotly.offline as pyo
import plotly.graph_objs as go
# for pretty printing dataframe that is not the last line
from IPython.display import display, HTML

pyo.init_notebook_mode(connected=True)

# Load the attendance workbook; header=0 takes the column names from the first row.
df = pd.read_excel(
    "NHL Attendance.xlsx",
    sheet_name="Sheet1",
    header=0,
)
# Optional one-off CSV export: df.to_csv('nhl_attendance.csv', index=False)
df.head()

Unnamed: 0,SEASON,RANK,TEAM,HOME GAMES,HOME ATTENDANCE,ROAD GAMES,ROAD ATTENDANCE,TOTAL GAMES,TOTAL ATTENDANCE
0,2017-18,1,Chicago,41,887794,41,723773,82,1611567
1,2017-18,2,Montreal,41,873283,41,733736,82,1607019
2,2017-18,3,Philadelphia,41,800214,41,702781,82,1502995
3,2017-18,4,Detroit,41,800115,41,717295,82,1517410
4,2017-18,5,Toronto,41,786677,41,751940,82,1538617


In [8]:
# General info: shape first, then the column / dtype / null-count summary
display(df.shape)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after the summary.
df.info()


(511, 9)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 9 columns):
SEASON              511 non-null object
RANK                511 non-null int64
TEAM                511 non-null object
HOME GAMES          511 non-null int64
HOME ATTENDANCE     511 non-null int64
ROAD GAMES          511 non-null int64
ROAD ATTENDANCE     511 non-null int64
TOTAL GAMES         511 non-null int64
TOTAL ATTENDANCE    511 non-null int64
dtypes: int64(7), object(2)
memory usage: 36.1+ KB


None

In [11]:
# How many distinct seasons does the dataset contain?
seasons = df.loc[:, "SEASON"]
seasons.nunique()

17

In [26]:
# Season range: take the part of each season label before the dash
# (e.g. "2017-18" -> 2017) and report the earliest and latest start years.
season_parts = seasons.str.split('-', expand=True)
season_starts = pd.to_numeric(season_parts[0])

for label, start_year in (
    ("earliest", season_starts.min()),
    ("latest", season_starts.max()),
):
    print(f"start year of {label} season:")
    display(start_year)


start year of earliest season:


2000

start year of latest season:


2017

In [32]:
# Team data: number of rows (season appearances) per team, then the count of distinct teams
team = df["TEAM"]
appearances = team.value_counts()
display(appearances)

team_count = team.nunique()
print(f"number of teams = {team_count}")
# 30 teams appear in all 17 seasons; 1 team (Vegas) appears only once, joining in the 2017-18 season

Edmonton        17
Los Angeles     17
Minnesota       17
NY Rangers      17
Detroit         17
Arizona         17
Vancouver       17
Pittsburgh      17
Anaheim         17
Boston          17
St. Louis       17
Calgary         17
Winnipeg        17
Dallas          17
Carolina        17
Ottawa          17
Columbus        17
Colorado        17
Philadelphia    17
Washington      17
San Jose        17
Montreal        17
Tampa Bay       17
Nashville       17
Toronto         17
New Jersey      17
NY Islanders    17
Buffalo         17
Florida         17
Chicago         17
Vegas            1
Name: TEAM, dtype: int64

number of teams = 31


In [36]:
# Add per-game average attendance (total attendance / total games),
# rounded to the nearest whole number and stored as an integer column.
per_game = df["TOTAL ATTENDANCE"].div(df["TOTAL GAMES"])
df['average_attendance'] = per_game.round().astype(int)
df.head()

Unnamed: 0,SEASON,RANK,TEAM,HOME GAMES,HOME ATTENDANCE,ROAD GAMES,ROAD ATTENDANCE,TOTAL GAMES,TOTAL ATTENDANCE,average_attendance
0,2017-18,1,Chicago,41,887794,41,723773,82,1611567,19653
1,2017-18,2,Montreal,41,873283,41,733736,82,1607019,19598
2,2017-18,3,Philadelphia,41,800214,41,702781,82,1502995,18329
3,2017-18,4,Detroit,41,800115,41,717295,82,1517410,18505
4,2017-18,5,Toronto,41,786677,41,751940,82,1538617,18764


In [45]:
# Checking which season the original visualization shows by listing
# each team's average attendance for one candidate season.
# NOTE(review): the frame is named s2012_df but is filtered to "2010-11";
# the name is kept as-is because other cells may reference it.
year_filter = df['SEASON'].eq("2010-11")
s2012_df = df[year_filter].sort_values(by="TEAM")
display(s2012_df.loc[:, ["TEAM", "average_attendance"]])
# TODO: plot this as a bar chart (TEAM on x, average_attendance on y) with a
# title naming the season; an earlier go.Bar / go.Layout draft was sketched here.

Unnamed: 0,TEAM,average_attendance
236,Anaheim,15914
239,Arizona,14454
226,Boston,17706
219,Buffalo,17744
216,Calgary,17907
230,Carolina,16602
211,Chicago,19649
234,Colorado,15971
237,Columbus,15165
233,Dallas,15959


# Notes
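The dataset spans the 2000-01 to 2017-18 seasons and contains 17 distinct seasons, so one season within that range has no data. The original visualization appears to show attendance data from the 2010-11 season (TODO: confirm against the original chart).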
  
There are 30 teams that participated in all 17 seasons and 1 team (Vegas) that only joined in 2017-18 and hence should be excluded from the analysis.  
Attendance and games played are divided into home and road. As the number of games played is not the same across teams, average attendance per game should be used instead of total attendance.

# Problems with original visualization
1. Lack of a title that states the season
2. A double y-axis with 2 graphs lumped together makes it difficult to identify which y-axis belongs to which data
3. The left y-axis is not zero-based, which makes it hard to compare data accurately between teams
4. Line graphs usually imply a connection between points and are not a good choice for categorical data. Using a line graph for year-on-year change is potentially confusing for audiences
5. Similar color choices and shades for 2 overlapping graphs make the graphs difficult to differentiate
# Changes
1. Add title with applicable season/s
2. Separate the 2 datasets
3. Show average attendance sorted by attendance instead of teams
4. Show Year on Year increase as a slope chart beside the attendance chart instead of overlapping
5. Improve color scheme to add focus
# Original Visualization
![NHL attendance](original_visualization.png "Original viz")