In [1]:
import altair as alt
from altair import datum
# Lift Altair's guard on how many rows a chart is allowed to embed.
# NOTE(review): the `cars` preview in this notebook shows an index that only
# reaches the low hundreds, so the guard may never trigger — confirm this call
# is actually needed.
alt.data_transformers.disable_max_rows()



DataTransformerRegistry.enable('default')

In [2]:
# Load the example "cars" table that the charts below are built from.
# NOTE(review): `alt.load_dataset` looks like the Altair 1.x loader; later
# releases appear to have moved sample data into the separate `vega_datasets`
# package — confirm the installed Altair still provides it before relying on
# Restart & Run All.
cars = alt.load_dataset('cars')



In [3]:
# Preview the loaded table as the cell's output (the rendered repr elides the
# middle rows, so only the first and last few records are shown).
cars

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA
...,...,...,...,...,...,...,...,...,...
401,ford mustang gl,27.0,4,140.0,86.0,2790,15.6,1982-01-01,USA
402,vw pickup,44.0,4,97.0,52.0,2130,24.6,1982-01-01,Europe
403,dodge rampage,32.0,4,135.0,84.0,2295,11.6,1982-01-01,USA
404,ford ranger,28.0,4,120.0,79.0,2625,18.6,1982-01-01,USA


In [8]:
# Bubble scatter of horsepower against fuel economy.
#   colour -> region of origin
#   size   -> acceleration
# Marks are drawn semi-transparent so overlapping cars stay visible, and the
# chart is made pannable/zoomable via .interactive().
(
    alt.Chart(cars)
    .mark_circle(opacity=0.33)
    .encode(
        x=alt.X('Horsepower'),
        y=alt.Y('Miles_per_Gallon'),
        color=alt.Color('Origin'),
        size=alt.Size('Acceleration'),
    )
    .interactive()
)

In [11]:
# Horizontal bars: number of records per region of origin, with each bar
# segmented by cylinder count. Cylinders is numeric in the table, so it is
# explicitly typed as nominal to get one discrete colour per cylinder count.
(
    alt.Chart(cars)
    .mark_bar()
    .encode(
        y=alt.Y('Origin'),
        x=alt.X('count()'),
        color=alt.Color('Cylinders', type='nominal'),
    )
)

In [17]:
# Overlaid step histograms of fuel economy, one translucent area per origin.
#   - x is binned into at most 20 buckets
#   - stack=None turns off stacking so the per-origin areas overlap instead of
#     piling on top of each other (hence the low opacity)
(
    alt.Chart(cars)
    .mark_area(opacity=0.25, interpolate='step')
    .encode(
        x=alt.X('Miles_per_Gallon', bin=alt.Bin(maxbins=20)),
        y=alt.Y('count()', stack=None),
        color=alt.Color('Origin'),
    )
)

In [21]:
# Scatter-plot matrix: the same three measures are laid out along both the rows
# and the columns, giving a 3x3 grid of 150x150 px panels coloured by origin.
splom_fields = ['Miles_per_Gallon', 'Acceleration', 'Horsepower']

# Template for a single panel; the x/y fields are placeholders that .repeat()
# fills in from the column/row lists respectively.
splom_cell = (
    alt.Chart(cars)
    .mark_circle()
    .encode(
        x=alt.X(alt.repeat('column'), type='quantitative'),
        y=alt.Y(alt.repeat('row'), type='quantitative'),
        color=alt.Color('Origin'),
    )
    .properties(width=150, height=150)
)

splom_cell.repeat(row=splom_fields, column=splom_fields).interactive()

In [26]:
# Line chart: number of records per model year, one line per region of origin.
# Year is explicitly typed as temporal so the x axis is a time scale.
(
    alt.Chart(cars)
    .mark_line()
    .encode(
        x=alt.X('Year', type='temporal'),
        y=alt.Y('count()'),
        color=alt.Color('Origin'),
    )
)

In [27]:
# Line chart: mean vehicle weight (lbs) per model year, one line per region of
# origin. Year is explicitly typed as temporal so the x axis is a time scale.
(
    alt.Chart(cars)
    .mark_line()
    .encode(
        x=alt.X('Year', type='temporal'),
        y=alt.Y('mean(Weight_in_lbs)'),
        color=alt.Color('Origin'),
    )
)

In [28]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

In [29]:
# Wrap the loaded table in a pandas DataFrame for the numeric work below.
# NOTE(review): `cars` already behaves like a DataFrame elsewhere in this
# notebook (column assignment, `.str` accessor); if so this wrap makes no
# explicit copy — confirm whether an independent `.copy()` was intended.
cars_df = pd.DataFrame(data=cars)

In [31]:
# Build the feature matrix for t-SNE / KMeans:
#   1. keep only the numeric columns,
#   2. drop every row that has a missing value in any of them,
#   3. hand back the bare array.
# Because rows are dropped, X_cars can be shorter than cars_df and its rows no
# longer line up with cars_df by position.
numeric_cars = cars_df.select_dtypes(include='number')
complete_numeric_cars = numeric_cars.dropna()
X_cars = complete_numeric_cars.values

In [33]:
# Project the numeric feature matrix down to two dimensions for plotting.
# t-SNE is stochastic: without a fixed random_state each run produces a
# different layout, so the seed is pinned to keep Restart & Run All reproducible.
# NOTE(review): X_cars is passed in unscaled — presumably the large-magnitude
# columns dominate the distances; confirm whether standardising first is wanted.
X_embedding = TSNE(n_components=2, random_state=42).fit_transform(X_cars)

In [35]:
# Partition the cars into three clusters in the original (un-embedded) feature
# space.
#   random_state: KMeans starts from random centroids, so the labels (and their
#                 numbering) change between runs unless the seed is fixed.
#   n_init:       pinned explicitly because scikit-learn's default for it has
#                 changed between releases (10 historically, 'auto' in newer
#                 versions); fixing it keeps results version-independent.
# NOTE(review): X_cars is passed in unscaled — presumably the large-magnitude
# columns dominate the distances; confirm whether standardising first is wanted.
clustering = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X_cars)

In [38]:
# Turn the t-SNE output into a DataFrame so Altair can chart it.
# Columns get default integer labels here; they are renamed in a later cell.
df = pd.DataFrame(data=X_embedding)

In [39]:
# Attach each point's KMeans cluster id as column 'C'. Both the embedding and
# the clustering were fitted on X_cars, so labels line up with rows by position.
cluster_labels = clustering.labels_
df['C'] = cluster_labels

In [40]:
# Give the columns chart-friendly names: the two embedding axes become 'X' and
# 'Y', and the cluster-id column keeps the name 'C'. This assumes df has exactly
# three columns at this point.
embedding_column_names = ['X', 'Y', 'C']
df.columns = embedding_column_names

In [42]:
# Scatter of the 2-D embedding, coloured by cluster id. The id is numeric, so it
# is explicitly typed as nominal to get discrete colours rather than a gradient.
(
    alt.Chart(df)
    .mark_circle()
    .encode(
        x=alt.X('X'),
        y=alt.Y('Y'),
        color=alt.Color('C', type='nominal'),
    )
)

In [44]:
# Derive the manufacturer ("Make") from the model name: split the name on single
# spaces and keep the first token.
# Note: this adds the column to `cars` in place, so earlier previews of `cars`
# in this notebook no longer reflect its current columns.
name_tokens = cars['Name'].str.split(' ')
cars['Make'] = name_tokens.str[0]

# Show the table again with the new column.
cars

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin,Make
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA,chevrolet
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA,buick
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA,plymouth
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA,amc
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA,ford
...,...,...,...,...,...,...,...,...,...,...
401,ford mustang gl,27.0,4,140.0,86.0,2790,15.6,1982-01-01,USA,ford
402,vw pickup,44.0,4,97.0,52.0,2130,24.6,1982-01-01,Europe,vw
403,dodge rampage,32.0,4,135.0,84.0,2295,11.6,1982-01-01,USA,dodge
404,ford ranger,28.0,4,120.0,79.0,2625,18.6,1982-01-01,USA,ford


In [46]:
# Bar chart of how many records each make has, with the makes ordered from most
# to fewest records along the x axis.
# NOTE(review): the sort spec names field='count', but no column called 'count'
# appears in the table preview; with op='count' the field is presumably
# irrelevant — confirm, and consider dropping it.
make_by_frequency = alt.EncodingSortField(field='count', op='count', order='descending')

(
    alt.Chart(cars)
    .mark_bar()
    .encode(
        x=alt.X('Make', sort=make_by_frequency),
        y=alt.Y('count()'),
    )
)