# Lecture 6 – Visualization

## History of Data Science, Winter 2022

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# Apply the 'fivethirtyeight' style sheet to matplotlib figures.
plt.style.use('fivethirtyeight')

## Playfair's bar chart of imports and exports from Scotland

<img src='images/scotland.png' width=500>

The kind folks who wrote [one of the readings](https://higherlogicdownload.s3.amazonaws.com/AMSTAT/1484431b-3202-461e-b7e6-ebce10ca8bcd/UploadedImages/Classroom_Activities/HS_3_Origins_of_graphs_in_statistics.pdf) have posted the dataset that Playfair used.

In [None]:
# Read the Scotland imports/exports CSV into a DataFrame and display it.
scotland = pd.read_csv(filepath_or_buffer='data/playfair-scotland.csv')
scotland

In [None]:
# Horizontal bar chart with one group of bars per country; the trailing
# semicolon suppresses the Axes repr in the cell output.
scotland.plot.barh(x='country');

It seems like Playfair's graph is arranged in some deliberate order... but it's not sorted by imports or by exports.

In [None]:
# Same horizontal bar chart, with the rows sorted by descending imports
# and a wider figure.
(
    scotland
    .sort_values(by='imports', ascending=False)
    .plot.barh(x='country', figsize=(10, 5))
);

Let's see how we can make an interactive version of this plot. The library `plotly` will come in handy here.

In [None]:
import plotly.express as px

In [None]:
# Grouped horizontal bars: an exports bar and an imports bar for every country,
# built from the rows of `scotland` sorted by descending imports.
fig = px.bar(
    data_frame=scotland.sort_values(by='imports', ascending=False),
    x=['exports', 'imports'],
    y='country',
    orientation='h',
    barmode='group',
    color_discrete_map=dict(exports='#151EA6', imports='#FCB305'),
    title='Exports and Imports of <b>Scotland</b> to and from different parts for one Year',
)

# Fonts, white backgrounds, and an untitled horizontal legend.
fig.update_layout(
    font_family='Arial',
    title_font_family='Arial',
    paper_bgcolor='#FFFFFF',
    plot_bgcolor='#FFFFFF',
    legend=dict(title='', orientation='h'),
)

# Text callout with an arrow, anchored on the Greenland row.
fig.add_annotation(
    x=10000,
    y='Greenland',
    text='no exports to Greenland!',
    showarrow=True,
    arrowhead=2,
    ax=125,
)

# Styling shared by both axes: no axis title, a black mirrored border line,
# and light grey grid lines.
axis_frame = dict(
    title_text='',
    showline=True,
    linewidth=2,
    linecolor='black',
    mirror=True,
    showgrid=True,
    gridwidth=1,
    gridcolor='#EEEEEE',
)

# x-axis: drawn on top, ticks every 25000 starting at 0.
fig.update_xaxes(side='top', tick0=0, dtick=25000, tickangle=0, **axis_frame)

# y-axis: drawn on the right, ticks placed on category boundaries.
# This is the last expression in the cell, so the figure it returns is displayed.
fig.update_yaxes(side='right', tickson='boundaries', **axis_frame)

As an aside – what if we want to export this chart to HTML, to put on a website? (Say, for making a data science portfolio?)

The `.to_html()` method will come in handy.

In [None]:
# Write the figure out as an HTML file.
# The encoding is set explicitly so the output does not depend on the
# platform's default text encoding. No f.close() is needed: the `with`
# block closes the file on exit.
with open('scotland.html', 'w', encoding='utf-8') as f:
    f.write(fig.to_html())

## Playfair's wheat and wages chart

In [None]:
# Load the wheat/wages data, discarding the 'Unnamed: 0' column and the
# final row, then preview the first few rows.
wheat = (
    pd.read_csv('data/wheat/Wheat.csv')
    .drop(columns='Unnamed: 0')
    .iloc[:-1]
)
wheat.head()

This task is a bit different, since it involves two different types of visualizations – a line chart and a bar chart.

In [None]:
# Wages over time as a line chart.
px.line(data_frame=wheat, x='Year', y='Wages')

In [None]:
# The same wages data drawn as a bar chart.
px.bar(data_frame=wheat, x='Year', y='Wages')

Instead of using `plotly.express`, which is a high-level ("lite") interface to `plotly`, we will use `plotly`'s lower-level `graph_objects` module, which lets us layer different kinds of traces on a single figure.

In [None]:
import plotly.graph_objects as go

In [None]:
# Recreate Playfair's wheat-and-wages chart by layering a bar trace and a
# filled line trace on a single figure.
wheat_fig = go.Figure()

# Add bar chart: the 'Wheat' column drawn as grey bars, each 5 x-axis units wide
wheat_fig.add_trace(
    go.Bar(
        x=wheat['Year'],
        y=wheat['Wheat'],
        name='Wheat',
        marker={'color': '#AAAAAA'},
        width=5
    )
)

# Add line chart: the 'Wages' column, with the area down to y=0 filled in
wheat_fig.add_trace(
    go.Scatter(
        x=wheat['Year'],
        y=wheat['Wages'],
        name='Wages',
        marker={'color': 'red'},
        fill='tozeroy',
        fillcolor='rgba(135,206,235,0.65)'
    )
)

# Adjust overall attributes
wheat_fig.update_layout(
    font_family="Arial",
    title_font_family="Arial",
    paper_bgcolor='#FFFFFF',
    plot_bgcolor='#FFFFFF',
    showlegend=False
)

# Adjust x-axis
wheat_fig.update_xaxes(title_text='<i>5 Years each division</i>', 
                       tickmode='array',
                       tickvals=[1565, 1600, 1650, 1700, 1750, 1800, 1820], 
                       tickangle=0,
                       showgrid=False,
                       showline=True, 
                       linewidth=2, 
                       linecolor='black',
                       mirror=True)

# Adjust y-axis
wheat_fig.update_yaxes(title_text='<i>Price of the Quarter of Wheat in Shillings</i>',
                       side='right',
                       tick0=0, 
                       dtick=5, 
                       gridcolor='#EEEEEE',
                       gridwidth=1,
                       showline=True, 
                       linewidth=2, 
                       linecolor='black',
                       mirror=True)

# Add the boxed title block as an annotation inside the plotting area
title_text = 'CHART,<br><i>Showing at One View</i><br><i>The Price of The Quarter of Wheat</i><br>& Wages of Labour by the Week,<br>-- from --<br><i>The Year 1565 to 1821</i><br>-- by --<br><i>William Playfair</i>'

# NOTE(review): showarrow is not set here, so plotly's default applies --
# confirm whether an arrow on the title box is intended.
wheat_fig.add_annotation(
    text=title_text, 
    x=1640, 
    y=70, 
    font = {
        'size': 10,
        'color': 'black'
    },
    bordercolor="black",
    borderwidth=2,
    borderpad=4,
    bgcolor="#FFFFFF",
    opacity=1
    
)

# Add a plain text label (no arrow) for the wages series.
# This label used to be added twice at the same position, once in white and
# once in black; only the black copy, which was added last, is kept.
# As the last expression in the cell, this call's return value (the figure)
# is what gets displayed.
wheat_fig.add_annotation(
    text="<i>Weekly Wages of a Good Mechanic</i>", 
    x=1640, 
    y=9, 
    showarrow=False, 
    font = {
        'size': 10,
        'color': 'black'
    }
    
)

## Pie charts

In [None]:
# Small table of three categories and their proportions, used for the pie chart.
dist = pd.DataFrame({
    'continent': ['African', 'European', 'Asiatic'],
    'proportion': [0.2, 0.25, 0.55],
})

dist

In [None]:
# Pie chart with one slice per 'continent' value, sized by 'proportion'.
px.pie(
    data_frame=dist,
    names='continent',
    values='proportion',
    width=400,
    height=300,
)

## Gantt charts (i.e. timelines)

In [None]:
# One row per life phase: [phase name, start date, end date, location].
phases = [
 ['Newborn', '1998-11-26', '1999-11-26', 'Canada'],
 ['Toddler, Preschooler', '1999-11-26', '2005-09-03', 'US'],
 ['Elementary School Student', '2005-09-03', '2009-06-30', 'Canada'],
 ['Middle School Student', '2009-09-15', '2012-06-15', 'Canada'],
 ['High School Student', '2012-09-05', '2016-05-30', 'Canada'],
 ['Undergrad @ UC Berkeley', '2016-08-22','2020-05-15', 'US'],
 ['Masters @ UC Berkeley', '2020-08-25', '2021-05-14', 'Canada'],
 ['Lecturer @ UCSD', '2021-09-01', '2022-02-14', 'US']]

# Build the frame directly from the list of rows. DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0, so the constructor
# (which also names the columns in the same step) is used instead.
phases_df = pd.DataFrame(phases, columns=['Phase', 'Start', 'End', 'Location'])
phases_df

In [None]:
# Timeline with one bar per phase, running from its 'Start' to its 'End'
# and labelled with its 'Location'.
tim = px.timeline(
    data_frame=phases_df,
    x_start='Start',
    x_end='End',
    y='Phase',
    text='Location',
    title='My Life Trajectory',
    width=700,
    height=400,
)

# Reverse the direction of the y-axis; the returned figure is the cell output.
tim.update_yaxes(autorange='reversed')

## Box plots

In [None]:
# Load plotly's bundled gapminder dataset and keep only the rows for 2007.
world = px.data.gapminder()
is_2007 = world['year'].eq(2007)
world_2007 = world.loc[is_2007]
world_2007

In [None]:
# One box per continent showing the spread of 'lifeExp' in the 2007 rows;
# hovering over a point shows the country name.
px.box(
    data_frame=world_2007,
    x='continent',
    y='lifeExp',
    color='continent',
    hover_name='country',
    title='Distribution of Life Expectancy in 2007 by Continent',
)