In [442]:
import numpy as np

from lets_plot import *
import pandas as pd
import vega_datasets
import seaborn as sns
# Set up lets-plot's HTML output for this notebook; run once before the first plot.
LetsPlot.setup_html()


# Faceting

In [443]:
# Load the "penguins" example dataset via seaborn; the faceting cells below read it as `data`.
data = sns.load_dataset("penguins")
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [444]:
# Bill depth against flipper length, one colour per species.
(
    ggplot()
    + geom_point(
        data=data,
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm', color='species'),
    )
)

In [445]:
# Same scatter without a colour mapping, split into one panel per species.
(
    ggplot()
    + geom_point(data=data, mapping=aes(x='bill_depth_mm', y='flipper_length_mm'))
    + facet_grid(x='species')
)

In [446]:
# One panel per species, with species also mapped to colour.
(
    ggplot()
    + geom_point(
        data=data,
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm', color='species'),
    )
    + facet_grid(x='species')
)

In [447]:
# Two layers: a light-grey layer built from only the two plotted columns (so it
# carries no 'species' column), then the species-coloured points, faceted by species.
(
    ggplot()
    + geom_point(
        data=data[['bill_depth_mm', 'flipper_length_mm']],
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm'),
        color='lightgrey',
    )
    + geom_point(
        data=data,
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm', color='species'),
    )
    + facet_grid(x='species')
)

In [452]:
# Species x sex grid of panels.
# The first layer is built from only the two plotted columns (no 'species' or
# 'sex'), and is drawn in light grey like the single-row version of this plot so
# the context points do not compete with the species-coloured points on top.
# NOTE(review): assumes the grey styling was dropped by accident - confirm.
(
    ggplot()
    + geom_point(
        data=data[['bill_depth_mm', 'flipper_length_mm']],
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm'),
        color='lightgrey',
    )
    + geom_point(
        data=data,
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm', color='species'),
    )
    + facet_grid(x='species', y='sex')
)

# Visualizing many variables

In [333]:
# Four variables in one scatter: position (x, y), point size and point colour.
(
    ggplot()
    + geom_point(
        data=data,
        mapping=aes(
            x='bill_depth_mm',
            y='flipper_length_mm',
            size='bill_length_mm',
            color='body_mass_g',
        ),
    )
)

In [334]:
# Position, colour and size carry four measurements; the facet grid adds species and sex.
(
    ggplot()
    + geom_point(
        data=data,
        mapping=aes(x='bill_depth_mm', y='flipper_length_mm', color='bill_length_mm', size='body_mass_g'),
    )
    + facet_grid(x='species', y='sex')
)

In [335]:
# Pair-plot via a self-join: melt to long form (one row per original row and
# remaining column), then merge the long frame with itself on the identifying
# columns so each original row yields every (variable_x, variable_y) combination.
melted = data.reset_index().melt(
    id_vars=["index", "species", "island", "sex"],
)
merged = pd.merge(
    melted,
    melted,
    on=["index", "species", "island", "sex"],
)

# One panel per variable pair; scales are left free per panel.
(
    ggplot()
    + geom_point(
        data=merged,
        mapping=aes(x='value_x', y='value_y', color='species'),
    )
    + facet_grid(x='variable_x', y='variable_y', scales='free')
)

In [336]:
import altair as alt

# Z-score the four measurements so they share one vertical axis.
# The result goes into a new frame: overwriting the columns of `data` would make
# every penguins cell that runs afterwards silently see standardised values.
fields = ['bill_depth_mm', 'flipper_length_mm', 'bill_length_mm', 'body_mass_g']
z_scores = (data[fields] - data[fields].mean()) / data[fields].std()
standardized = data.assign(**{field: z_scores[field] for field in fields})

# Parallel coordinates: fold the four columns into key/value pairs and draw one
# line per row, using the window-computed running count as the per-row detail key.
alt.Chart(standardized, width=500).transform_window(
    index='count()'
).transform_fold(
    fields
).mark_line().encode(
    x='key:N',
    y='value:Q',
    color='species:N',
    detail='index:N',
    opacity=alt.value(0.5)
)

In [337]:
import altair as alt

# Fetch iris through the already-imported `vega_datasets` module rather than
# `from vega_datasets import data`, which would rebind the notebook-wide `data`
# name from a DataFrame to the dataset loader.
source = vega_datasets.data.iris()

# Parallel coordinates: fold the four measurement columns into key/value pairs
# and draw one line per row (the window-computed running count is the detail key).
alt.Chart(source, width=500).transform_window(
    index='count()'
).transform_fold(
    ['petalLength', 'petalWidth', 'sepalLength', 'sepalWidth']
).mark_line().encode(
    x='key:N',
    y='value:Q',
    color='species:N',
    detail='index:N',
    opacity=alt.value(0.5)
)

# Tiles and raster marks

In [338]:
# Bind `data` to the wind-vectors frame, taken from the loader object exposed by
# the `vega_datasets` module imported at the top of the notebook.
data = vega_datasets.data.windvectors()

In [339]:
# Display the wind-vectors frame loaded in the previous cell.
data

Unnamed: 0,longitude,latitude,dir,dirCat,speed
0,0.125,45.125,228,225,3.12
1,0.375,45.125,228,225,3.24
2,0.625,45.125,229,225,3.34
3,0.875,45.125,229,225,3.44
4,1.125,45.125,228,225,3.48
...,...,...,...,...,...
4795,-1.125,59.875,155,150,5.96
4796,-0.875,59.875,154,150,6.34
4797,-0.625,59.875,153,150,6.71
4798,-0.375,59.875,152,150,7.09


In [340]:
# One point per grid location, coloured by wind speed (default colour scale).
(
    ggplot()
    + geom_point(data=data, mapping=aes(x='longitude', y='latitude', color='speed'))
    + ggsize(1000, 600)
)

In [341]:
# Same points with a reversed RdBu brewer palette for colour.
(
    ggplot()
    + geom_point(data=data, mapping=aes(x='longitude', y='latitude', color='speed'))
    + scale_color_brewer('div', palette='RdBu', direction=-1)
    + ggsize(1000, 600)
)

In [342]:
# Tiles instead of points: speed is mapped to fill rather than colour.
(
    ggplot()
    + geom_tile(data=data, mapping=aes(x='longitude', y='latitude', fill='speed'))
    + scale_fill_brewer('div', palette='RdBu', direction=-1)
    + ggsize(1000, 600)
)

In [343]:
# Raster version of the tile plot, with the same fill mapping and palette.
(
    ggplot()
    + geom_raster(data=data, mapping=aes(x='longitude', y='latitude', fill='speed'))
    + scale_fill_brewer('div', palette='RdBu', direction=-1)
    + ggsize(1000, 600)
)

# 2-D Histograms

In [432]:
# Bind `data` to the movies frame, taken from the loader object exposed by the
# `vega_datasets` module imported at the top of the notebook.
data = vega_datasets.data.movies()

In [433]:
# IMDB rating against Rotten Tomatoes rating, coloured by genre.
# The frame has no 'Name' column (the film name is in 'Title'), so the film is
# exposed through the point tooltip instead of a `label='Name'` mapping.
(
    ggplot()
    + geom_point(
        data=data,
        mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', color='Major_Genre'),
        tooltips=layer_tooltips().line('@Title'),
    )
    + ggsize(1000, 600)
)

In [434]:
# One panel per genre. The light-grey first layer is built from only the two
# rating columns (it carries no 'Major_Genre'); the second layer holds the
# genre-coloured points.
# The frame has no 'Name' column (the film name is in 'Title'), so the film is
# exposed through the point tooltip instead of a `label='Name'` mapping.
(
    ggplot()
    + geom_point(
        data=data[['IMDB_Rating', 'Rotten_Tomatoes_Rating']],
        mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'),
        color='lightgrey',
    )
    + geom_point(
        data=data,
        mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', color='Major_Genre'),
        tooltips=layer_tooltips().line('@Title'),
    )
    + facet_wrap('Major_Genre')
    + ggsize(1600, 1000)
)

In [435]:
# 2-D histogram of the two ratings on a 20 x 20 grid of bins.
(
    ggplot()
    + geom_bin2d(
        data=data,
        mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'),
        bins=[20, 20],
    )
)

In [436]:
# Same 20 x 20 histogram with an explicit cartesian coordinate system.
(
    ggplot()
    + geom_bin2d(
        data=data,
        mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'),
        bins=[20, 20],
    )
    + coord_cartesian()
)

In [437]:
# Same histogram again, filled with a reversed Spectral brewer palette.
(
    ggplot()
    + geom_bin2d(
        data=data,
        mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating'),
        bins=[20, 20],
    )
    + coord_cartesian()
    + scale_fill_brewer('div', palette='Spectral', direction=-1)
)

In [438]:
# Display the frame currently bound to `data` (the movies table in the saved output).
data

Unnamed: 0,Title,US_Gross,Worldwide_Gross,US_DVD_Sales,Production_Budget,Release_Date,MPAA_Rating,Running_Time_min,Distributor,Source,Major_Genre,Creative_Type,Director,Rotten_Tomatoes_Rating,IMDB_Rating,IMDB_Votes
0,The Land Girls,146083.0,146083.0,,8000000.0,Jun 12 1998,R,,Gramercy,,,,,,6.1,1071.0
1,"First Love, Last Rites",10876.0,10876.0,,300000.0,Aug 07 1998,R,,Strand,,Drama,,,,6.9,207.0
2,I Married a Strange Person,203134.0,203134.0,,250000.0,Aug 28 1998,,,Lionsgate,,Comedy,,,,6.8,865.0
3,Let's Talk About Sex,373615.0,373615.0,,300000.0,Sep 11 1998,,,Fine Line,,Comedy,,,13.0,,
4,Slam,1009819.0,1087521.0,,1000000.0,Oct 09 1998,R,,Trimark,Original Screenplay,Drama,Contemporary Fiction,,62.0,3.4,165.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3196,Zack and Miri Make a Porno,31452765.0,36851125.0,21240321.0,24000000.0,Oct 31 2008,R,101.0,Weinstein Co.,Original Screenplay,Comedy,Contemporary Fiction,Kevin Smith,65.0,7.0,55687.0
3197,Zodiac,33080084.0,83080084.0,20983030.0,85000000.0,Mar 02 2007,R,157.0,Paramount Pictures,Based on Book/Short Story,Thriller/Suspense,Dramatization,David Fincher,89.0,,
3198,Zoom,11989328.0,12506188.0,6679409.0,35000000.0,Aug 11 2006,PG,,Sony Pictures,Based on Comic/Graphic Novel,Adventure,Super Hero,Peter Hewitt,3.0,3.4,7424.0
3199,The Legend of Zorro,45575336.0,141475336.0,,80000000.0,Oct 28 2005,PG,129.0,Sony Pictures,Remake,Adventure,Historical Fiction,Martin Campbell,26.0,5.7,21161.0


In [440]:
def density(data, x, y, bw=(0.25, 2.5), x_col='IMDB_Rating', y_col='Rotten_Tomatoes_Rating'):
    """Evaluate an un-normalised kernel density of two columns at the point (x, y).

    For every row the squared, bandwidth-scaled offsets from (x, y) are summed
    and passed through ``exp(-d)``; the result is the mean of those weights.

    Parameters
    ----------
    data : mapping of column name to numeric column
        Assumed to be a pandas DataFrame; with pandas columns, rows where either
        value is missing are skipped by ``.mean()``.
    x, y : float
        Coordinates of the evaluation point.
    bw : sequence of two floats
        Bandwidths (x first, then y) that divide the offsets before squaring.
    x_col, y_col : str
        Names of the columns that hold the x and y samples.

    Returns
    -------
    float
        Mean kernel weight; equals 1.0 when every sample sits exactly at (x, y).
    """
    x_dist = ((x - data[x_col]) / bw[0]) ** 2
    y_dist = ((y - data[y_col]) / bw[1]) ** 2
    return np.exp(-(x_dist + y_dist)).mean()

x_coords, y_coords = np.meshgrid(np.linspace(0, 10, 200), np.linspace(0, 100, 200))
xy = np.stack([x_coords.flatten(), y_coords.flatten()]).T
data = pd.DataFrame(dict(
    x=xy[:, 0],
    y=xy[:, 1],
    density=[density(data, xi, yi) for (xi, yi) in xy]
))



In [441]:
ggplot() + geom_raster(data=data, mapping=aes(x='x', y='y', fill='density')) + scale_fill_brewer('div', palette='RdBu', direction=-1) + ggsize(1000, 600) + coord_cartesian()

In [350]:
# Filled 2-D density contours of the two ratings, 20 levels, legend shown.
(
    ggplot()
    + geom_density2df(
        data=data,
        mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', fill='..level..'),
        bins=20,
        show_legend=True,
    )
    + scale_fill_brewer('div', palette='Spectral', direction=-1)
    + coord_cartesian()
)

In [351]:
# Same filled contours with explicit bandwidths (0.25 for x, 2.5 for y).
(
    ggplot()
    + geom_density2df(
        data=data,
        mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', fill='..level..'),
        bw=[0.25, 2.5],
        bins=20,
        show_legend=True,
    )
    + scale_fill_brewer('div', palette='Spectral', direction=-1)
    + coord_cartesian()
)

In [352]:
# Filled contours with fill taken from the computed '..group..' variable; legend hidden.
(
    ggplot()
    + geom_density2df(
        data=data,
        mapping=aes(x='IMDB_Rating', y='Rotten_Tomatoes_Rating', fill='..group..'),
        show_legend=False,
    )
    + scale_fill_brewer('div', palette='Spectral', direction=-1)
    + coord_cartesian()
)

# Voronoi diagrams

In [353]:
from vega_datasets import data

# After the import `data` is the dataset loader, not a DataFrame; pull both
# tables from it in one statement.
airports, flights = data.airports(), data.flights_airport()

In [354]:
# Display the airports frame loaded in the previous cell.
airports

Unnamed: 0,iata,name,city,state,country,latitude,longitude
0,00M,Thigpen,Bay Springs,MS,USA,31.953765,-89.234505
1,00R,Livingston Municipal,Livingston,TX,USA,30.685861,-95.017928
2,00V,Meadow Lake,Colorado Springs,CO,USA,38.945749,-104.569893
3,01G,Perry-Warsaw,Perry,NY,USA,42.741347,-78.052081
4,01J,Hilliard Airpark,Hilliard,FL,USA,30.688012,-81.905944
...,...,...,...,...,...,...,...
3371,ZEF,Elkin Municipal,Elkin,NC,USA,36.280024,-80.786069
3372,ZER,Schuylkill Cty/Joe Zerbey,Pottsville,PA,USA,40.706449,-76.373147
3373,ZPH,Zephyrhills Municipal,Zephyrhills,FL,USA,28.228065,-82.155916
3374,ZUN,Black Rock,Zuni,NM,USA,35.083227,-108.791777


In [355]:
# Plot every airport by position. The layer reads the `airports` frame: at this
# point `data` is the vega_datasets loader object, not a table with
# longitude/latitude columns.
(
    ggplot()
    + geom_point(data=airports, mapping=aes(x='longitude', y='latitude'))
)

In [357]:
# Attach each route's origin-airport record by matching `origin` to the airport's `iata` code.
counts = pd.merge(flights, airports, left_on='origin', right_on='iata')
counts

Unnamed: 0,origin,destination,count,iata,name,city,state,country,latitude,longitude
0,ABE,ATL,853,ABE,Lehigh Valley International,Allentown,PA,USA,40.652363,-75.440402
1,ABE,BHM,1,ABE,Lehigh Valley International,Allentown,PA,USA,40.652363,-75.440402
2,ABE,CLE,805,ABE,Lehigh Valley International,Allentown,PA,USA,40.652363,-75.440402
3,ABE,CLT,465,ABE,Lehigh Valley International,Allentown,PA,USA,40.652363,-75.440402
4,ABE,CVG,247,ABE,Lehigh Valley International,Allentown,PA,USA,40.652363,-75.440402
...,...,...,...,...,...,...,...,...,...,...
5361,YUM,IPL,326,YUM,Yuma MCAS-Yuma International,Yuma,AZ,USA,32.656583,-114.605972
5362,YUM,LAS,99,YUM,Yuma MCAS-Yuma International,Yuma,AZ,USA,32.656583,-114.605972
5363,YUM,LAX,1044,YUM,Yuma MCAS-Yuma International,Yuma,AZ,USA,32.656583,-114.605972
5364,YUM,PHX,1961,YUM,Yuma MCAS-Yuma International,Yuma,AZ,USA,32.656583,-114.605972


In [358]:
# Total flight count per origin airport, keeping that airport's coordinates.
# Built from `flights` and `airports` instead of from the previous value of
# `counts`, so the cell can be re-run: after one run `origin` is the index, and
# selecting it as a column again would raise a KeyError.
counts = (
    flights
    .merge(airports, left_on='origin', right_on='iata')
    .groupby('origin')
    .agg({
        'count': 'sum',
        'latitude': 'first',
        'longitude': 'first',
    })
)
counts

Unnamed: 0_level_0,count,latitude,longitude
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABE,4807,40.652363,-75.440402
ABI,2660,32.411320,-99.681897
ABQ,41146,35.040222,-106.609194
ABY,1095,31.535515,-84.194473
ACK,457,41.253052,-70.060181
...,...,...,...
WYS,264,44.688399,-111.117638
XNA,14112,36.281869,-94.306811
YAK,725,59.503361,-139.660226
YKM,340,46.568170,-120.544059


In [359]:
# Airports with more than 2000 flights whose latitude lies strictly between 25 and 50.
counts.loc[
    (counts['count'] > 2000)
    & (counts['latitude'] > 25)
    & (counts['latitude'] < 50)
]

Unnamed: 0_level_0,count,latitude,longitude
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABE,4807,40.652363,-75.440402
ABI,2660,32.411320,-99.681897
ABQ,41146,35.040222,-106.609194
ACV,3714,40.978115,-124.108619
AEX,2330,31.327372,-92.548556
...,...,...,...
TVC,4262,44.741445,-85.582235
TYS,14004,35.812487,-83.992856
VPS,6820,30.483250,-86.525400
XNA,14112,36.281869,-94.306811


In [360]:
# Scatter of the airports that pass the count and latitude filter.
(
    ggplot()
    + geom_point(
        data=counts.query('(count > 2000) & (latitude > 25) & (latitude < 50)'),
        mapping=aes(x='longitude', y='latitude'),
    )
)

In [361]:
from scipy.spatial import Voronoi

# Voronoi diagram of the airport positions (longitude, latitude).
v = Voronoi(points=counts[['longitude', 'latitude']].values)

# One row per (region, vertex). A region is kept only if it is non-empty and
# bounded: scipy uses vertex index -1 for a vertex outside the diagram. The
# filter sits on the region loop, so it is evaluated once per region rather than
# once per vertex.
v_df = pd.DataFrame(
    [
        (region_id, *v.vertices[vertex_id])
        for region_id, region in enumerate(v.regions)
        if region and -1 not in region
        for vertex_id in region
    ],
    columns=['id', 'x', 'y'],
)

# Cell outlines (fill is fully transparent) with the airports drawn on top; the
# axis limits are longitude -125..-70 and latitude 25..50.
(
    ggplot()
    + geom_polygon(
        aes(x='x', y='y', group='id', fill='id'),
        data=v_df, show_legend=False, color='black', alpha=0.,
    )
    + geom_point(aes(x='longitude', y='latitude'), data=counts, shape=21, color='black', fill='white')
    + scale_fill_discrete()
    + scale_x_continuous(limits=[-125, -70])
    + scale_y_continuous(limits=[25, 50])
    + ggsize(1200, 800)
)

In [362]:
# Look up the coordinates of both end vertices of every ridge and show the shape
# of the resulting array.
# NOTE(review): ridges with a -1 vertex index are not filtered here, so numpy
# would resolve -1 to the last vertex - fine for a shape check only.
v.vertices[v.ridge_vertices].shape

(897, 2, 2)

In [363]:
# Inspect the coordinates of the first Voronoi vertex.
v.vertices[0]

array([-122.52061032,  -13.33427622])

In [364]:
from scipy.spatial import Voronoi

# Voronoi diagram of the airport positions (longitude, latitude).
v = Voronoi(points=counts[['longitude', 'latitude']].values)

# One row per finite ridge, holding the coordinates of its two end vertices.
# Ridges with a -1 index (scipy's marker for a vertex outside the diagram) are
# skipped.
v_df = pd.DataFrame(
    [
        (*v.vertices[start], *v.vertices[end])
        for start, end in v.ridge_vertices
        if -1 not in (start, end)
    ],
    columns=['x1', 'y1', 'x2', 'y2'],
)

# Ridges as line segments with the airports on top; the axis limits are
# longitude -125..-70 and latitude 25..50. No layer maps `fill`, so no fill
# scale is added.
(
    ggplot()
    + geom_segment(
        aes(x='x1', y='y1', xend='x2', yend='y2'),
        data=v_df, show_legend=False, color='black',
    )
    + geom_point(aes(x='longitude', y='latitude'), data=counts, shape=21, color='black', fill='white')
    + scale_x_continuous(limits=[-125, -70])
    + scale_y_continuous(limits=[25, 50])
    + ggsize(1200, 800)
)

In [365]:
# Display the frame currently bound to `v_df` (the ridge segments in the saved output).
v_df

Unnamed: 0,x1,y1,x2,y2
0,-162.443216,63.804203,-157.848949,64.376560
1,-162.443216,63.804203,-169.585957,56.901014
2,-157.848949,64.376560,-165.377812,49.900827
3,-169.585957,56.901014,-165.377812,49.900827
4,-156.615711,65.020800,-157.848949,64.376560
...,...,...,...,...
883,-90.176172,43.823228,-89.330875,43.993172
884,-88.647938,42.812487,-88.565038,43.432585
885,-91.127222,45.568721,-88.856878,45.038552
886,-88.306081,45.632959,-88.755175,45.108732


In [366]:
# Filtered airports with the Voronoi ridges drawn over them.
# The segment layer is given the ridge endpoints from `v_df` (columns x1, y1,
# x2, y2); with no data or aesthetics it has nothing to draw.
# NOTE(review): the source frame and the axis limits (copied from the earlier
# Voronoi plots) are assumptions about what this unfinished cell was meant to
# show - confirm.
(
    ggplot()
    + geom_point(
        data=counts.query('(count > 2000) & (latitude > 25) & (latitude < 50)'),
        mapping=aes(x='longitude', y='latitude'),
    )
    + geom_segment(aes(x='x1', y='y1', xend='x2', yend='y2'), data=v_df)
    + scale_x_continuous(limits=[-125, -70])
    + scale_y_continuous(limits=[25, 50])
)