In [None]:
# Altair Graphs

In [2]:
# Standard imports
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd

In [3]:
# Load the example cars dataset from the vega_datasets package.
from vega_datasets import data

cars = data.cars()

# Preview five random rows. The fixed seed keeps the preview identical on
# every Restart & Run All instead of showing different rows each time.
cars.sample(5, random_state=0)

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
35,datsun pl510,27.0,4,97.0,88.0,2130,14.5,1971-01-01,Japan
354,datsun 210,37.0,4,85.0,65.0,1975,19.4,1982-01-01,Japan
245,dodge colt m/m,33.5,4,98.0,83.0,2075,15.9,1977-01-01,USA
83,volvo 145e (sw),18.0,4,121.0,112.0,2933,14.5,1972-01-01,Europe
110,chevrolet impala,11.0,8,400.0,150.0,4997,14.0,1973-01-01,USA


In [5]:
# `alt` is already imported in the imports cell at the top of the notebook,
# so it is not imported a second time here.
# Chart with data and a point mark, but no encodings yet.
alt.Chart(cars).mark_point()

While we have now given this Chart data, and specified we’ll be adding points to the chart, we haven’t actually told it which variables will help determine the location of points yet. As a result, this code does not generate any output. But if we tell it we want to encode Miles_per_Gallon to the x-axis and Horsepower to the y-axis, we get:
https://www.practicaldatascience.org/html/plotting_altair_part1.html

In [6]:
# Encode Miles_per_Gallon on the x-axis and Horsepower on the y-axis.
alt.Chart(cars).mark_point().encode(
    x=alt.X("Miles_per_Gallon"),
    y=alt.Y("Horsepower"),
)

In [7]:
# URL of the WDI plotting CSV hosted in the practicaldatascience repository.
wdi_data = (
    "https://raw.githubusercontent.com/nickeubank/"
    "practicaldatascience/master/Example_Data/wdi_plotting.csv"
)
world = pd.read_csv(wdi_data)

# Preview five random rows. The fixed seed keeps the preview reproducible
# across Restart & Run All.
world.sample(5, random_state=0)

Unnamed: 0,Year,Country Name,Country Code,GDP per capita (constant 2010 US$),"Population, total",CO2 emissions (metric tons per capita),"Mortality rate attributed to household and ambient air pollution, age-standardized (per 100,000 population)","PM2.5 air pollution, population exposed to levels exceeding WHO guideline value (% of total)","Life expectancy at birth, total (years)","Mortality rate, under-5 (per 1,000 live births)","Literacy rate, youth female (% of females ages 15-24)"
9148,2013,Cameroon,CMR,1360.108747,22077300.0,0.333827,,100.0,56.576,97.2,
4594,1992,Central African Republic,CAF,431.213106,2959236.0,0.050689,,,48.267,176.6,
7030,2003,"Hong Kong SAR, China",HKG,24010.5374,6730800.0,,,,81.378049,,
1012,1975,Nigeria,NGA,1958.832322,63374289.0,0.747874,,,43.187,241.5,
6687,2001,South Africa,ZAF,6017.17811,45571272.0,7.035134,,,55.089,73.0,


In [8]:
# Print every column name on its own line.
for c in world.columns:
    print(c)

Year
Country Name
Country Code
GDP per capita (constant 2010 US$)
Population, total
CO2 emissions (metric tons per capita)
Mortality rate attributed to household and ambient air pollution, age-standardized (per 100,000 population)
PM2.5 air pollution, population exposed to levels exceeding WHO guideline value (% of total)
Life expectancy at birth, total (years)
Mortality rate, under-5 (per 1,000 live births)
Literacy rate, youth female (% of females ages 15-24)


In [9]:
# Summary statistics for the Year column (count, mean, spread, quartiles).
world["Year"].describe()

count    10850.000000
mean      1995.500000
std         14.431535
min       1971.000000
25%       1983.000000
50%       1995.500000
75%       2008.000000
max       2020.000000
Name: Year, dtype: float64

In [10]:
# Number of distinct values in the "Country Name" column.
world.loc[:, "Country Name"].nunique()

217

In [11]:
# Keep only the rows for 2018. `.copy()` makes the result an independent
# frame, so the column assignments in later cells do not trigger pandas'
# SettingWithCopyWarning.
world = world[world["Year"] == 2018].copy()

In [14]:
# Scatter of under-5 mortality against GDP per capita, both on raw scales.
alt.Chart(world).mark_point().encode(
    x=alt.X("GDP per capita (constant 2010 US$)"),
    y=alt.Y("Mortality rate, under-5 (per 1,000 live births)"),
)

In [17]:
# Add natural-log versions of GDP per capita and under-5 mortality.
# `assign` builds a new frame instead of writing into `world` in place, so
# no SettingWithCopyWarning is raised when `world` is a filtered slice.
world = world.assign(
    log_gdp_per_cap=np.log(world["GDP per capita (constant 2010 US$)"]),
    log_under5_mortality_rate=np.log(
        world["Mortality rate, under-5 (per 1,000 live births)"]
    ),
)

Altair allows information to be encoded in a range of mark features, including: 
-  Color
-  Size
-  Shape
-  Stroke (for lines)
-  Opacity

In [18]:
# Log-log scatter with point size encoding total population.
# `zero=False` stops the x-axis from being forced to start at zero.
alt.Chart(world).mark_point().encode(
    x=alt.X("log_gdp_per_cap", scale=alt.Scale(zero=False)),
    y=alt.Y("log_under5_mortality_rate"),
    size=alt.Size("Population, total"),
)

In [19]:
# Add the natural log of total population. `assign` returns a new frame
# rather than writing into `world` in place, which avoids pandas'
# SettingWithCopyWarning when `world` is a filtered slice.
world = world.assign(log_population=np.log(world["Population, total"]))

# Same scatter with the roles swapped: log population on x, log GDP as size.
alt.Chart(world).mark_point().encode(
    x=alt.X("log_population", scale=alt.Scale(zero=False)),
    y="log_under5_mortality_rate",
    size=alt.Size("log_gdp_per_cap", scale=alt.Scale(zero=False)),
)

In [21]:
# Reusable base scatter: log GDP per capita vs. log under-5 mortality,
# with point size encoding total population. Later cells layer onto it.
base = (
    alt.Chart(world)
    .mark_point()
    .encode(
        x=alt.X("log_gdp_per_cap", scale=alt.Scale(zero=False)),
        y=alt.Y("log_under5_mortality_rate"),
        size=alt.Size("Population, total"),
    )
)

In [22]:
# Regression of log under-5 mortality on log GDP per capita, drawn as a line.
fit = (
    base
    .transform_regression("log_gdp_per_cap", "log_under5_mortality_rate")
    .mark_line()
)
fit

In [23]:
# Overlay the regression line on the scatter (same as `base + fit`).
alt.layer(base, fit)

In [24]:
# LOESS smoother over the same two variables, drawn in red.
loess = (
    base
    .transform_loess("log_gdp_per_cap", "log_under5_mortality_rate")
    .mark_line(color="red")
)

# Scatter, regression line and LOESS curve in one layered chart.
alt.layer(base, fit, loess)

In [25]:
# Side-by-side panels (horizontal concatenation, same as `left | right`):
# the base scatter next to a copy whose y-axis shows life expectancy.
alt.hconcat(
    base,
    base.encode(
        y=alt.Y(
            "Life expectancy at birth, total (years)",
            scale=alt.Scale(zero=False),
        )
    ),
)


In [26]:
# Text: add a layer of small country-code labels on top of the scatter and
# the regression line.
alt.layer(
    base,
    fit,
    alt.Chart(world)
    .encode(
        x=alt.X("log_gdp_per_cap", scale=alt.Scale(zero=False)),
        y=alt.Y("log_under5_mortality_rate"),
        text=alt.Text("Country Code"),
    )
    .mark_text(size=5),
)


In [30]:
# Titles: rebuild the base scatter with a chart title and readable axis and
# legend titles.
base = (
    alt.Chart(world, title="GDP per Capita and Child Mortality")
    .mark_point()
    .encode(
        x=alt.X(
            "log_gdp_per_cap",
            scale=alt.Scale(zero=False),
            title="Log GDP per Capita",
        ),
        y=alt.Y("log_under5_mortality_rate", title="Log Under-5 Mortality Rate"),
        size=alt.Size("Population, total", title="Population"),
    )
)

# Re-derive the regression line from the re-titled base. The existing `fit`
# was built from the earlier, untitled `base`, so layering it here would
# combine the new titled encodings with the old untitled ones (which can
# leak raw field names into the shared axis titles).
fit = base.transform_regression(
    "log_gdp_per_cap", "log_under5_mortality_rate"
).mark_line()

base + fit

In [31]:
# Layer the scatter and the fitted line, then display a copy of the
# combined chart with a different title.
c = alt.layer(base, fit)
c.properties(title="A New Title!")

In [32]:
# Restyle the regression line: red, with a 15-on / 15-off dash pattern.
fit = (
    base
    .transform_regression("log_gdp_per_cap", "log_under5_mortality_rate")
    .mark_line(color="red", strokeDash=[15, 15])
)
base + fit

# Save figures
c = base + fit

# Create the output folder first: saving into a directory that does not
# exist fails instead of creating it.
figure_path = Path("altair_figures") / "my_first_altair_figure.png"
figure_path.parent.mkdir(parents=True, exist_ok=True)

# `Chart.save` infers the output format from the file extension, so no
# separate saver import is needed mid-notebook. PNG export still requires a
# rendering backend to be installed (see the Altair "Saving Charts" docs).
c.save(str(figure_path))