# Plotly tutorial - 120 years of Olympic games

In [1]:
import os

import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
#from bubbly.bubbly import bubbleplot 
#from __future__ import division
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots
init_notebook_mode(connected=True)

# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

  shapely_geos_version, geos_capi_version_string


/kaggle/input/120-years-of-olympic-history-athletes-and-results/noc_regions.csv
/kaggle/input/120-years-of-olympic-history-athletes-and-results/athlete_events.csv


# 1. Read in kaggle "120 years of Olympic games" data and analyse.

In [2]:
# Load the athlete/event table and preview its first five rows.
athlete_events_df = pd.read_csv(
    '../input/120-years-of-olympic-history-athletes-and-results/athlete_events.csv'
)
athlete_events_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [3]:
# Load the NOC-code -> region lookup table and preview its first five rows.
noc_regions_df = pd.read_csv(
    '../input/120-years-of-olympic-history-athletes-and-results/noc_regions.csv'
)
noc_regions_df.head()

Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,AHO,Curacao,Netherlands Antilles
2,ALB,Albania,
3,ALG,Algeria,
4,AND,Andorra,


In [4]:
# (rows, columns) of the athlete/event table.
athlete_events_df.shape

(271116, 15)

In [5]:
# (rows, columns) of the NOC/region lookup table.
noc_regions_df.shape

(230, 3)

In [6]:
# Missing values per column of the athlete/event table, largest count first.
athlete_events_df.isna().sum().sort_values(ascending=False)

Medal     231333
Weight     62875
Height     60171
Age         9474
ID             0
Name           0
Sex            0
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
dtype: int64

In [7]:
# Missing values per column of the NOC/region table, largest count first.
noc_regions_df.isna().sum().sort_values(ascending=False)

notes     209
region      3
NOC         0
dtype: int64

In [8]:
# Count the rows of athlete_events_df for every (Year, City, Season) combination.
# value_counts() counts rows, so 'Athlets' is a row count, not a count of distinct athletes.
tmp = athlete_events_df.groupby(['Year', 'City'])['Season'].value_counts()
# Flatten the (Year, City, Season) index into columns; the counts become 'Athlets'.
df = tmp.reset_index(name='Athlets')

In [9]:
# Preview the per-Games counts built in the previous cell.
df.head()

Unnamed: 0,Year,City,Season,Athlets
0,1896,Athina,Summer,380
1,1900,Paris,Summer,1936
2,1904,St. Louis,Summer,1301
3,1906,Athina,Summer,1733
4,1908,London,Summer,3101


# 2. Plot `Year` vs. `Athlets` in a scatter graph; use markers or markers+lines.

In [10]:
# Single marker-only scatter trace: one point per row of df (Year vs. Athlets).
trace = go.Scatter(
    x=df['Year'],
    y=df['Athlets'],
    mode="markers",
    name="Athlets per Olympic game",
    marker={'size': 14, 'color': 'rgba(152, 0, 0, .8)'},
)
data = [trace]

# Axis titles and hover behaviour; 'closest' shows the label of the nearest point.
layout = {
    'title': 'Athlets per Olympic game',
    'xaxis': {'title': 'Year', 'showticklabels': True},
    'yaxis': {'title': 'Number of athlets'},
    'hovermode': 'closest',
}
fig = {'data': data, 'layout': layout}
iplot(fig, filename='events-athlets1')

# 3. Plot `Year` vs. `Athlets` in a scatter plot, now showing the Summer and Winter Games in different colors, on the same plot, as two separate traces.

In [11]:
# Split the per-Games counts by season so each season gets its own trace.
dfS = df.loc[df['Season'] == 'Summer']
dfW = df.loc[df['Season'] == 'Winter']

traceS = go.Scatter(
    x=dfS['Year'],
    y=dfS['Athlets'],
    mode="markers+lines",
    name="Summer Games",
    marker={'color': "Red"},
)
traceW = go.Scatter(
    x=dfW['Year'],
    y=dfW['Athlets'],
    mode="markers+lines",
    name="Winter Games",
    marker={'color': "rgba(0, 0, 112, 1)"},
)

# Both traces share one set of axes.
data = [traceS, traceW]
layout = {
    'title': 'Athlets per Olympic game',
    'xaxis': {'title': 'Year', 'showticklabels': True},
    'yaxis': {'title': 'Number of athlets'},
    'hovermode': 'closest',
}
fig = {'data': data, 'layout': layout}
iplot(fig, filename='events-athlets2')

# 4. Use subplots to display the previous variables side by side, in two columns.

In [12]:
# One scatter trace per season; the host city is attached as hover text.
# dfS / dfW are the Summer / Winter slices created in the previous cell.
traceS = go.Scatter(
    x = dfS['Year'],y = dfS['Athlets'],
    name="Summer Games",
    marker=dict(color="Red"),
    mode = "markers+lines",
    text=dfS['City'],
)
traceW = go.Scatter(
    x = dfW['Year'],y = dfW['Athlets'],
    name="Winter Games",
    marker=dict(color="Blue"),
    mode = "markers+lines",
    text=dfW['City']
)

# plotly.tools.make_subplots is deprecated; plotly.subplots.make_subplots is its
# replacement (imported in the first cell).
fig = make_subplots(rows=1, cols=2, subplot_titles=('Number athlets: Summer Games', 'Number athlets: Winter Games'))
# add_trace(row=, col=) places each trace in its own grid cell.
fig.add_trace(traceS, row=1, col=1)
fig.add_trace(traceW, row=1, col=2)

iplot(fig, filename='events-athlets2')


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



# 5. Display the number of athletes per Olympic Games using a bar plot.

In [13]:
# Row count per (Year, City); rows are counted, not distinct athletes.
tmp = athlete_events_df.groupby('Year')['City'].value_counts()
df2 = pd.DataFrame(data={'Athlets': tmp.values}, index=tmp.index).reset_index()
# Attach the Season of every Games from df (built earlier from Year/City/Season).
# Join on the identifying columns only: the previous implicit merge also used the
# 'Athlets' count as a key, which silently drops rows whenever the counts differ.
# validate= raises if a (Year, City) pair is not unique on either side.
df2 = df2.merge(
    df[['Year', 'City', 'Season']],
    on=['Year', 'City'],
    validate='one_to_one',
)

In [14]:
# Split the per-Games counts by season.
dfS = df2[df2['Season']=='Summer']; dfW = df2[df2['Season']=='Winter']

traceS = go.Bar(
    x = dfS['Year'],y = dfS['Athlets'],
    name="Summer Games",
    marker=dict(color="rgba(0, 83, 156, 1.00)"),
    text=dfS['City']
)
traceW = go.Bar(
    x = dfW['Year'],y = dfW['Athlets'],
    name="Winter Games",
    marker=dict(color="rgba(238, 164, 127, 1.00)"),
    # Hover text must come from the Winter rows; it previously used dfS['City'],
    # which labelled Winter bars with Summer host cities.
    text=dfW['City']
)

data = [traceS, traceW]
layout = dict(title = 'Athlets per Olympic game',
          xaxis = dict(title = 'Year', showticklabels=True), 
          yaxis = dict(title = 'Number of athlets'),
          hovermode = 'closest'
         )
fig = dict(data=data, layout=layout)
iplot(fig, filename='events-athlets3')

In [15]:
# Same bars as the previous cell, restyled (outlined, semi-transparent) and stacked.
traceS = go.Bar(
    x = dfS['Year'],y = dfS['Athlets'],
    name="Summer Games",
    marker=dict(
                color='rgb(238,23,11)',
                line=dict(
                    color='black',
                    width=0.75),
                opacity=0.7,
            ),
    text=dfS['City'],
)
traceW = go.Bar(
    x = dfW['Year'],y = dfW['Athlets'],
    name="Winter Games",
    marker=dict(
                color='rgb(11,23,245)',
                line=dict(
                    color='black',
                    width=0.75),
                opacity=0.7,
            ),
    # Hover text must come from the Winter rows (was dfS['City'], a copy-paste slip).
    text=dfW['City']
)

data = [traceS, traceW]
layout = dict(title = 'Athlets per Olympic game',
          xaxis = dict(title = 'Year', showticklabels=True), 
          yaxis = dict(title = 'Number of athlets'),
          hovermode = 'closest',
          # Stack bars that share the same x value instead of placing them side by side.
          barmode='stack'
         )
fig = dict(data=data, layout=layout)
iplot(fig, filename='events-athlets4')

# 6. Show the distribution of the number of athletes across Olympic Games editions, grouped by Season, in a box plot.

In [16]:
# Horizontal box plot of the per-Games counts, one box per season.
# dfS / dfW are the Summer / Winter slices of df2 created in an earlier cell.
traceS = go.Box(
    x = dfS['Athlets'],
    name="Summer Games",
    marker=dict(
                color='rgba(238,23,11,0.5)',
                line=dict(
                    color='red',
                    width=1.2),
            ),
    text=dfS['City'],
    orientation='h',
)
traceW = go.Box(
    x = dfW['Athlets'],
    name="Winter Games",
    marker=dict(
                color='rgba(11,23,245,0.5)',
                line=dict(
                    color='black',
                    width=1.2),
            ),
    # Hover text must come from the Winter rows (was dfS['City'], a copy-paste slip).
    text=dfW['City'],
    orientation='h',
)

data = [traceS, traceW]
layout = dict(title = 'Athlets per Olympic game',
          xaxis = dict(title = 'Number of athlets',showticklabels=True),
          yaxis = dict(title = 'Season', showticklabels=True, tickangle=-90), 
          hovermode = 'closest',
         )
fig = dict(data=data, layout=layout)
iplot(fig, filename='events-athlets5')

# 7. Plot the number of athletes per sport for each year. For each sport, each year, a point should be plotted.

In [17]:
# Number of distinct sports for every (Year, City, Season) combination.
tmp = athlete_events_df.groupby(['Year', 'City', 'Season'])['Sport'].nunique()
# Flatten the group keys into columns; the distinct-sport count becomes 'Sports'.
# Note: this rebinds df, replacing the per-Games athlete counts built earlier.
df = tmp.reset_index(name='Sports')

In [18]:
# Preview the first three rows of the sports-per-Games table.
df.head(3)

Unnamed: 0,Year,City,Season,Sports
0,1896,Athina,Summer,9
1,1900,Paris,Summer,20
2,1904,St. Louis,Summer,18


In [19]:
# Split the sports-per-Games table by season.
dfS = df[df['Season']=='Summer']; dfW = df[df['Season']=='Winter']

traceS = go.Bar(
    x = dfS['Year'],y = dfS['Sports'],
    name="Summer Games",
    marker=dict(
                color='rgb(238,23,11)',
                line=dict(
                    color='red',
                    width=1),
                opacity=0.5,
            ),
    text= dfS['City'],
)
traceW = go.Bar(
    x = dfW['Year'],y = dfW['Sports'],
    name="Winter Games",
    marker=dict(
                color='rgb(11,23,245)',
                line=dict(
                    color='blue',
                    width=1),
                opacity=0.5,
            ),
    # Hover text must come from the Winter rows (was dfS['City'], a copy-paste slip).
    text=dfW['City']
)

data = [traceS, traceW]
layout = dict(title = 'Sports per Olympic edition',
          xaxis = dict(title = 'Year', showticklabels=True), 
          yaxis = dict(title = 'Number of sports'),
          hovermode = 'closest',
          # Stack bars that share the same x value.
          barmode='stack'
         )
fig = dict(data=data, layout=layout)
iplot(fig, filename='events-sports1')

In [20]:
# Row count per sport within every (Year, City, Season) combination.
tmp = athlete_events_df.groupby(['Year', 'City', 'Season'])['Sport'].value_counts()
# Flatten the group keys and the sport into columns; the counts become 'Athlets'.
df = tmp.reset_index(name='Athlets')
df.head()

Unnamed: 0,Year,City,Season,Sport,Athlets
0,1896,Athina,Summer,Athletics,106
1,1896,Athina,Summer,Gymnastics,97
2,1896,Athina,Summer,Shooting,65
3,1896,Athina,Summer,Cycling,41
4,1896,Athina,Summer,Tennis,23


In [21]:
# Split the per-sport counts by season; every row becomes one marker.
dfS = df.loc[df['Season'] == 'Summer']
dfW = df.loc[df['Season'] == 'Winter']

traceS = go.Scatter(
    x=dfS['Year'],
    y=dfS['Athlets'],
    mode="markers",
    name="Summer Games",
    # Hover label combines the host city and the sport of each point.
    text="City:" + dfS['City'] + " Sport:" + dfS['Sport'],
    marker={
        'color': 'rgb(238,23,11)',
        'line': {'color': 'red', 'width': 1},
        'opacity': 0.5,
    },
)
traceW = go.Scatter(
    x=dfW['Year'],
    y=dfW['Athlets'],
    mode="markers",
    name="Winter Games",
    text="City:" + dfW['City'] + " Sport:" + dfW['Sport'],
    marker={
        'color': 'rgb(11,23,245)',
        'line': {'color': 'blue', 'width': 1},
        'opacity': 0.5,
    },
)

data = [traceS, traceW]
layout = {
    'title': 'Number of athlets per sport for each Olympic edition',
    'xaxis': {'title': 'Year', 'showticklabels': True},
    'yaxis': {'title': 'Number of athlets per sport'},
    'hovermode': 'closest',
}
fig = {'data': data, 'layout': layout}
iplot(fig, filename='events-sports1')

# 8. Plot the distribution of the number of athletes per sport. Group by Year and Season and count the athletes for each sport.

The hover text shows the sport and city, as well as the number of athletes per sport per edition.

Let's also show the distribution of the number of athletes per sport. For this we group by `Year`, `City` and `Season` and count the athletes for each sport.

In [22]:
# Row count per sport within every (Year, City, Season) combination.
tmp = athlete_events_df.groupby(['Year', 'City', 'Season'])['Sport'].value_counts()
# Flatten the group keys and the sport into columns; the counts become 'Athlets'.
df = tmp.reset_index(name='Athlets')
df.head(3)

Unnamed: 0,Year,City,Season,Sport,Athlets
0,1896,Athina,Summer,Athletics,106
1,1896,Athina,Summer,Gymnastics,97
2,1896,Athina,Summer,Shooting,65


Let's define a list with all the Sports.

In [23]:
# Alphabetically sorted index (named 'Sport') of every distinct sport in the data.
sports = pd.Index(sorted(athlete_events_df['Sport'].dropna().unique()), name='Sport')

We will create one function that builds a single `trace` and one function that displays the whole set of traces.

We will also filter the Games by season (Summer and Winter).

In [24]:
def draw_trace(dataset, sport):
    """Build a horizontal box trace of the 'Athlets' values of one sport.

    Parameters
    ----------
    dataset : DataFrame with 'Sport', 'Athlets' and 'City' columns.
    sport : value matched against the 'Sport' column; also used as the trace name.

    Returns
    -------
    plotly.graph_objs.Box whose x values are the matching 'Athlets' entries and
    whose hover text is the matching 'City' entries.
    """
    sport_rows = dataset.loc[dataset['Sport'] == sport]
    outline = {'line': {'color': 'black', 'width': 0.8}}
    return go.Box(
        x=sport_rows['Athlets'],
        text=sport_rows['City'],
        name=sport,
        orientation='h',
        marker=outline,
    )


def draw_group(dataset, title, height=800, sport_names=None):
    """Display one horizontal box plot per sport in a single figure.

    Parameters
    ----------
    dataset : DataFrame with a 'Sport' column plus the columns read by draw_trace.
    title : figure title.
    height : figure height in pixels (the width is fixed at 800).
    sport_names : optional iterable of sports to draw, in drawing order. When
        omitted, the sports actually present in ``dataset`` are used, sorted
        alphabetically, so no empty trace is created for a sport that never
        occurs in the data passed in.
    """
    if sport_names is None:
        # Derive the sports from the data itself rather than from a notebook global.
        sport_names = sorted(dataset['Sport'].dropna().unique())

    data = [draw_trace(dataset, sport) for sport in sport_names]

    layout = dict(title = title,
              xaxis = dict(title = 'Number of athlets',showticklabels=True),
              yaxis = dict(title = 'Sport', showticklabels=True, tickfont=dict(
                family='Old Standard TT, serif',
                size=8,
                color='black'),), 
              hovermode = 'closest',
              # Each box is already labelled on the y axis, so the legend is hidden.
              showlegend=False,
                  width=800,
                  height=height,
             )
    fig = dict(data=data, layout=layout)
    iplot(fig, filename='events-sports1')

# Keep only the Summer Olympics rows ...
df_S = df.loc[df['Season'] == 'Summer']
# ... and draw one box per sport for them (default figure height).
draw_group(df_S, "Athlets per Sport (Summer Olympics)")

Let's now use the same function defined above to plot the sports in Winter Olympics.

In [25]:
# select only Winter Olympics
df_W = df[df['Season']=='Winter']
# draw the boxplots for the Winter Olympics (third argument is the figure height)
draw_group(df_W, "Athlets per Sport (Winter Olympics)",600)

# 9. Use a Heatmap to show the number of athletes per Games edition and per Sport.

In [26]:
# Year x Sport matrix of counts for the Summer Games (df_S still holds the
# Year/City/Season/Sport rows selected for the Summer box plots).
# aggfunc="sum": the values are counts, so rows sharing a (Year, Sport) pair, such as
# a year with more than one host city, must be added up. The default aggfunc averages them.
piv = pd.pivot_table(
    df_S,
    values="Athlets",
    index=["Year"],
    columns=["Sport"],
    aggfunc="sum",
    fill_value=0,  # a sport absent in a given year counts as 0
)
m = piv.values

We prepare the `Heatmap`.

The attributes we use are:
* z - the matrix with values to be displayed;
* x - the columns names;
* y - the rows names;  
* colorscale - the color scale to be used for display; 

In [27]:
# Heatmap of the pivot: rows are years, columns are sports, cell value is the count.
trace = go.Heatmap(
    z=m,
    x=list(piv.columns),
    y=list(piv.index),
    colorscale='Reds',
    reversescale=False,
)
data = [trace]

# Tick fonts are shared by both axes; sport names are tilted so they fit.
tick_font = {'size': 10, 'color': 'black'}
layout = {
    'title': "Number of athlets per year and sport (Summer Olympics)",
    'xaxis': {
        'title': 'Sport',
        'showticklabels': True,
        'tickangle': 45,
        'tickfont': dict(tick_font),
    },
    'yaxis': {
        'title': 'Year',
        'showticklabels': True,
        'tickfont': dict(tick_font),
    },
    'hovermode': 'closest',
    'showlegend': False,
    'width': 1000,
    'height': 800,
}
fig = {'data': data, 'layout': layout}
iplot(fig, filename='labelled-heatmap')

Let's show also the corresponding heatmap plot for Winter Olympics.

In [28]:
# Year x Sport matrix of counts for the Winter Games (df_W holds the
# Year/City/Season/Sport rows selected for the Winter box plots).
# aggfunc="sum": the values are counts, so rows sharing a (Year, Sport) pair must be
# added up. The default aggfunc would average them.
piv = pd.pivot_table(
    df_W,
    values="Athlets",
    index=["Year"],
    columns=["Sport"],
    aggfunc="sum",
    fill_value=0,  # a sport absent in a given year counts as 0
)
m = piv.values

In [29]:
# Heatmap of the Winter pivot: rows are years, columns are sports.
trace = go.Heatmap(
    z=m,
    x=list(piv.columns),
    y=list(piv.index),
    colorscale='Blues',
    reversescale=True,
)
data = [trace]

layout = {
    'title': "Number of athlets per year and sport (Winter Olympics)",
    'xaxis': {
        'title': 'Sport',
        'showticklabels': True,
        'tickangle': 30,
        'tickfont': {'size': 8, 'color': 'black'},
    },
    'yaxis': {
        'title': 'Year',
        'showticklabels': True,
        'tickfont': {'size': 10, 'color': 'black'},
    },
    'hovermode': 'closest',
    'showlegend': False,
    'width': 800,
    'height': 800,
}
fig = {'data': data, 'layout': layout}
iplot(fig, filename='labelled-heatmap')

# 10. Show the number of athletes per Sport in a pie plot, separately for the Summer and Winter Olympics data

In [30]:
# Toy pie chart: three slices sized and coloured so the chart resembles a pyramid.
labels = ['Sunny side of pyramid', 'Shaddy side of pyramid', 'Sky']
values = [300, 150, 1200]
colors = ['gold', 'brown', 'lightblue']

# Rotation passed to the pie so the pyramid's base lines up with the ground.
BOTTOM_OF_THE_PYRAMID_ACCORDING_TO_NEWTON_LAWS = 220

slice_style = {
    'colors': colors,
    'line': {'color': '#000000', 'width': 1},
}
trace = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label',
    textinfo='none',  # no percent/label text drawn on the slices
    textfont={'size': 20},
    rotation=BOTTOM_OF_THE_PYRAMID_ACCORDING_TO_NEWTON_LAWS,
    marker=slice_style,
)
iplot([trace], filename='styled_pie_chart')

We used `rotation` to align the base part of the pyramid to the ground. 

We used `textinfo` = `none` to remove percent or label text from the pie slices.


Let's use a pie chart here to show the proportion of athletes per sport, separately for the Summer and Winter Olympics.

In [31]:
# Row count per sport within each Season.
tmp = athlete_events_df.groupby(['Season'])['Sport'].value_counts()
# Flatten Season and Sport into columns; the counts become 'Athlets'.
df = tmp.reset_index(name='Athlets')
df.head(3)

Unnamed: 0,Season,Sport,Athlets
0,Summer,Athletics,38624
1,Summer,Gymnastics,26707
2,Summer,Swimming,23195


In [32]:
# Per-sport counts for the Summer Olympics only.
df_S = df[df['Season']=='Summer']

# One slice per sport. No explicit `colors` list is passed: the previous version
# reused the three-colour list of the pyramid demo, which only made sense for that
# chart and made this cell depend on the demo cell having been run. Plotly's default
# colour sequence is used instead.
trace = go.Pie(labels=df_S['Sport'], 
               values=df_S['Athlets'],
               hoverinfo='label+value+percent', 
               textinfo='value+percent', 
               textfont=dict(size=8),
               rotation=180,
               marker=dict(line=dict(color='#000000', width=1))
            )

data = [trace]
layout = dict(title = "Number of athlets per sport (Summer Olympics)",
                  width=800,
                  height=1200,
              legend=dict(orientation="h")
             )
fig = dict(data=data,layout=layout)
iplot(fig, filename='styled_pie_chart')

In [33]:
# Per-sport counts for the Winter Olympics only. Stored as df_W: the previous
# version assigned the Winter rows to df_S, overwriting the Summer selection under
# a misleading name.
df_W = df[df['Season']=='Winter']

# One slice per sport. No explicit `colors` list is passed: the previous version
# reused the three-colour list of the pyramid demo, which only made sense for that
# chart. Plotly's default colour sequence is used instead.
trace = go.Pie(labels=df_W['Sport'], 
               values=df_W['Athlets'],
               hoverinfo='label+value+percent', 
               textinfo='value+percent', 
               textfont=dict(size=8),
               rotation=180,
               marker=dict(line=dict(color='#000000', width=1))
            )

data = [trace]
layout = dict(title = "Number of athlets per sport (Winter Olympics)",
                  width=800,
                  height=800,
              legend=dict(orientation="h")
             )
fig = dict(data=data,layout=layout)
iplot(fig, filename='styled_pie_chart')