In [136]:
import altair as alt
import pandas as pd

In [137]:
alt.data_transformers.enable("json")

housing_data = pd.read_csv("../data/ca-housing-umap.csv")
housing_data

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,x,y
0,0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,10.808258,8.449437
1,1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,11.427483,7.456382
2,2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,10.477100,9.221636
3,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,11.099041,10.675879
4,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,10.908834,10.497625
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20428,20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,1.091979,-4.283607
20429,20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,0.361509,-3.971345
20430,20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,1.350724,-3.754926
20431,20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,1.034270,-3.852482


In [138]:
selection_xy = alt.selection_interval()
selection_income = alt.selection_interval(encodings=['x'])
selection_value = alt.selection_interval(encodings=['x'])
selection_age = alt.selection_interval(encodings=['x'])
selection_ocean = alt.selection_multi(fields=['ocean_proximity'])

Deprecated since `altair=5.0.0`. Use selection_point instead.
  selection_ocean = alt.selection_multi(fields=['ocean_proximity'])


In [139]:
filter_all = selection_xy & selection_income & selection_value & selection_age & selection_ocean

In [140]:
def make_hist(feature, selection, title, bins=30):
    background = alt.Chart(housing_data).mark_bar(color='lightgray').encode(
        x=alt.X(f'{feature}:Q', bin=alt.Bin(maxbins=bins), title=feature),
        y='count():Q'
    ).properties(width=180, height=120, title=title)
    
    foreground = alt.Chart(housing_data).mark_bar(color='steelblue').encode(
        x=alt.X(f'{feature}:Q', bin=alt.Bin(maxbins=bins), title=feature),
        y='count():Q'
    ).transform_filter(filter_all)
    
    return alt.layer(background, foreground).add_selection(selection)

In [141]:
background_scatter = alt.Chart(housing_data).mark_circle(color='lightgray', opacity=0.8).encode(
    x=alt.X('x:Q', title='UMAP X'),
    y=alt.Y('y:Q', title='UMAP Y')
).properties(width=300, height=300, title='UMAP Coordinates')

foreground_scatter = alt.Chart(housing_data).mark_circle(color='steelblue', opacity=0.8).encode(
    x=alt.X('x:Q', title='UMAP X'),
    y=alt.Y('y:Q', title='UMAP Y')
).transform_filter(filter_all)

scatter = alt.layer(background_scatter, foreground_scatter).add_selection(selection_xy)

Deprecated since `altair=5.0.0`. Use add_params instead.
  scatter = alt.layer(background_scatter, foreground_scatter).add_selection(selection_xy)


In [142]:
hist_income = make_hist('median_income', selection_income, 'Median Income', bins=30)
hist_value = make_hist('median_house_value', selection_value, 'Median House Value', bins=30)
hist_age = make_hist('housing_median_age', selection_age, 'Housing Median Age', bins=20)

Deprecated since `altair=5.0.0`. Use add_params instead.
  return alt.layer(background, foreground).add_selection(selection)


In [143]:
background_ocean = alt.Chart(housing_data).mark_bar(color='lightgray').encode(
    x=alt.X('ocean_proximity:N', title='Ocean Proximity'),
    y='count():Q'
).properties(width=180, height=120, title='Ocean Proximity')

foreground_ocean = alt.Chart(housing_data).mark_bar(color='steelblue').encode(
    x=alt.X('ocean_proximity:N', title='Ocean Proximity'),
    y='count():Q'
).transform_filter(filter_all)

ocean_chart = alt.layer(background_ocean, foreground_ocean).add_selection(selection_ocean)

Deprecated since `altair=5.0.0`. Use add_params instead.
  ocean_chart = alt.layer(background_ocean, foreground_ocean).add_selection(selection_ocean)


In [144]:
left = scatter
middle = alt.vconcat(hist_income, hist_age, spacing=10)
right = alt.vconcat(hist_value, ocean_chart, spacing=10)
dashboard = alt.hconcat(left, middle, right, spacing=20).resolve_scale(color='independent')

dashboard

I chose to use Altair for this assignment because it is a much higher level interface, that in my opinion is easier to understand, and it allows me to use Pandas for an easier and quicker analysis. Whille D3 has more options and can give you more control over the visualizations that you make, the D3 exercise that we completed last week was very time consuming, and I think for an assignment like this would have taken me much longer to complete if I hadn't used Altair. Altair is a more rapid way of doing data analysis, and it is very helpful that it automatically handles some details of the designs of the charts. D3 would be something that I would use if I need a much more professional grade and customizable charts and dashboards, but Altair is sufficient for quick creation and analysis.

When making this dashboard, I selected the median income, house values, house ages, and ocean proximity as the features for the 4 additional charts because they represent economic and geographical characteristics of the data that could help intepret the clusters on the UMAP coordinate map. I used histograms for my numerical features (income, house values, house ages) to show the distribution of their values, and used a bar chart for ocean proximity since it is a categorical value. I also needed to specify the size of the charts, since not setting a size would make the charts too big to properly display in this ipynb file. I also modified the layout of the charts to make it easier to see them all, while making the most important chart, the UMAP coordinates scatterplot, the biggest and most visible. One of the drawbacks that was very noticeable when displaying and using the charts was that the charts were very laggy, most likely due to the size of the dataset and having to filter through so much brushed data.