# Tiny Charts Implementation for Stacked Bar Charts
## CS765 Final Project
## Instructions:
1.   Run the first code block. It lets you interact with the program: choose one of the three built-in datasets and, optionally, a category of interest.
2.   Run the rest of the code blocks in order. At the bottom of this page, four stacked bar charts are generated (large, medium, small, and tiny), each rendered at a different size.



In [1]:
# User input/output with program.
# Decide what dataset to visualize and the variable of interest.

# One entry per dataset option: the display name and, for each selectable
# category, a (menu label, category value) pair. The category value is the
# string later cells look up in the dataset, so it can differ from the label
# shown in the menu (e.g. "General Mills" vs 'G').
DATASET_MENUS = {
  1: ("Time Usage Survey", [
    ("American Indian, Alaskan Native", 'American Indian, Alaskan Native'),
    ("Asian Only", 'Asian only'),
    ("Black Only", 'Black only'),
    ("White Only", 'White only'),
    ("White-American Indian", 'White-American Indian'),
  ]),
  2: ("Cereals", [
    ("American Home Food Products", 'A'),
    ("General Mills", 'G'),
    ("Kelloggs", 'K'),
    ("Nabisco", 'N'),
    ("Post", 'P'),
    ("Quaker Oats", 'Q'),
    ("Ralston Purina", 'R'),
  ]),
  3: ("Museums", [
    ("HISTORY MUSEUM", 'HISTORY MUSEUM'),
    ("ARBORETUM, BOTANICAL GARDEN, OR NATURE CENTER", 'ARBORETUM, BOTANICAL GARDEN, OR NATURE CENTER'),
    ("SCIENCE & TECHNOLOGY MUSEUM OR PLANETARIUM", 'SCIENCE & TECHNOLOGY MUSEUM OR PLANETARIUM'),
    ("GENERAL MUSEUM", 'GENERAL MUSEUM'),
    ("HISTORIC PRESERVATION", 'HISTORIC PRESERVATION'),
    ("ZOO, AQUARIUM, OR WILDLIFE CONSERVATION", 'ZOO, AQUARIUM, OR WILDLIFE CONSERVATION'),
    ("ART MUSEUM", 'ART MUSEUM'),
    ("CHILDREN'S MUSEUM", "CHILDREN'S MUSEUM"),
    ("NATURAL HISTORY MUSEUM", 'NATURAL HISTORY MUSEUM'),
  ]),
}

print("Which Dataset Would you Like to View? \n 1) Time Usage Survey \n 2) Cereals \n 3) Museums")
dataset_choice = int(input("Enter Option (1, 2, or 3): "))
# Fail fast: continuing with an unknown dataset would leave category_choice and
# orig_categories_order undefined for the cells below.
if dataset_choice not in DATASET_MENUS:
  raise ValueError("Invalid Choice. Please restart the program and try again.")

dataset_label, category_menu = DATASET_MENUS[dataset_choice]
# Index 0 is the 'None' sentinel meaning "no particular variable of interest";
# indices 1..N line up with the numbered menu entries.
orig_categories_order = ['None'] + [value for _, value in category_menu]
max_category = len(orig_categories_order) - 1

print("You have chosen to analyze the " + dataset_label + " data!")
print("Do you have any particular variable of interest? If so, which one? These are the following categories")
menu_lines = ["0) No particular variable in interest. Show me general trends."]
menu_lines += [str(number) + ") " + label for number, (label, _) in enumerate(category_menu, start=1)]
print(" " + " \n ".join(menu_lines))

category_choice = int(input("Enter your choice of category (0-" + str(max_category) + "): "))
# Reject out-of-range values here; a negative number would otherwise silently
# index orig_categories_order from the end.
if category_choice < 0 or category_choice > max_category:
  raise ValueError("Invalid Choice. Please restart the program and try again.")

Which Dataset Would you Like to View? 
 1) Time Usage Survey 
 2) Cereals 
 3) Museums
Enter Option (1, 2, or 3): 3
You have chosen to analyze the Museums data!
Do you have any particular variable of interest? If so, which one? These are the following categories
 0) No particular variable in interest. Show me general trends. 
 1) HISTORY MUSEUM 
 2) ARBORETUM, BOTANICAL GARDEN, OR NATURE CENTER 
 3) SCIENCE & TECHNOLOGY MUSEUM OR PLANETARIUM 
 4) GENERAL MUSEUM 
 5) HISTORIC PRESERVATION 
 6) ZOO, AQUARIUM, OR WILDLIFE CONSERVATION 
 7) ART MUSEUM 
 8) CHILDREN'S MUSEUM 
 9) NATURAL HISTORY MUSEUM
Enter your choice of category (0-9): 4


In [2]:
# Import Statements
import numpy as np
import pandas as pd
import altair as alt
import copy

# Clone Github Repository that holds current datasets
!git clone 'https://github.com/dmshah4/CS765_Data_Vis.git'

Cloning into 'CS765_Data_Vis'...
remote: Enumerating objects: 85, done.[K
remote: Counting objects: 100% (85/85), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 85 (delta 25), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (85/85), done.


In [3]:
# Determine what variables to extract from dataset pickle files.
# Settings per dataset option, in this order:
# (pickle path, x column, x-axis title, y column, y-axis title,
#  colour column, chart title)
DATASET_SETTINGS = {
  1: ('CS765_Data_Vis/datasets/time_usage_survey_dataset_filtered.pkl',
      "Family Income", "Family Income (USD)",
      "Number of People", "Number of People within Given Income Range in US",
      "Race", "Family Income Ranges in the US based on Race"),
  2: ('CS765_Data_Vis/datasets/cereals_dataset_filtered.pkl',
      "Rating", "Cereal Rating",
      "Number of Cereals", "Number of Cereals with Given Rating",
      "Manufacturer", "Rating of Various Cereals based on Manufacturer"),
  3: ('CS765_Data_Vis/datasets/museums_dataset_filtered.pkl',
      "Revenue", "Revenue (USD)",
      "Number of Museums", "Number of Museums with Given Revenue in US",
      "Museum Type", "Revenue of Museums in the US Based on Museum Type"),
}

# An unknown option must not silently fall through to one of the datasets.
if dataset_choice not in DATASET_SETTINGS:
  raise ValueError("Unknown dataset option " + repr(dataset_choice) + "; expected 1, 2, or 3. Please re-run the first cell.")

(dataset_path, varX_name, x_axis_title, varY_name, y_axis_title,
 varColor_name, chart_title) = DATASET_SETTINGS[dataset_choice]

# NOTE(security): unpickling can execute arbitrary code. These files come from
# the repository cloned above; only load pickles from a source you trust.
df = pd.read_pickle(dataset_path)

In [4]:
# This code block includes all the functions that do the calculations for
# downsampling to different sizes. 


def bar_order_max_to_min(df, varColor_name, varY_name):
  """Return the categories ordered from the largest total to the smallest.

  The total of a category is the sum of `varY_name` over all rows whose
  `varColor_name` equals that category. Ties are broken by category label in
  descending order. If a category named 'Other' exists it is moved to the very
  end, and the remaining categories keep their largest-to-smallest order.
  This ordering is used for the stacking order of the baseline design.

  Parameters:
    df: DataFrame containing both columns.
    varColor_name: name of the column holding the category labels.
    varY_name: name of the column holding the values to total.

  Returns:
    A list of category labels; an empty list when `df` has no rows.
  """
  # Accumulate the total per category in one pass over the two columns.
  totals = {}
  for category, value in zip(df[varColor_name], df[varY_name]):
    totals[category] = totals.get(category, 0) + value

  ranked = sorted(((total, category) for category, total in totals.items()), reverse=True)
  options = [category for _, category in ranked]

  # Move 'Other' to the end without disturbing the order of the rest
  # (swapping it with the last entry would put the smallest category in its place).
  if 'Other' in options:
    options.remove('Other')
    options.append('Other')

  return options


def sort_x_values(df, varX_name):
  """Return the unique x-axis range labels sorted by their lower bound.

  Each label is expected to look like '<start>-<end>' (for example '5K-10K').
  Only the part before the first '-' is parsed. A trailing 'K' on that part is
  read as thousands, so '1K-5K' sorts after '500-1K'. Labels with the same
  lower bound are ordered alphabetically.

  Parameters:
    df: DataFrame containing the column `varX_name`.
    varX_name: name of the column holding the range labels (strings).

  Returns:
    A list of the unique labels in increasing order; an empty list when the
    column has no values.

  Raises:
    ValueError: if the lower bound of a label cannot be parsed as a number.
  """
  def lower_bound(label):
    start = label.split('-')[0]
    if start.endswith('K'):
      # Scale by 1000 so labels with and without the suffix compare correctly.
      return float(start[:-1]) * 1000
    return float(start)

  labels = list(df[varX_name].unique())
  return sorted(labels, key=lambda label: (lower_bound(label), label))


def downsample_bars(df, varX_name, varY_name, varColor_name, target_numBars):
  """Reduce the number of bars (distinct x values) to at most `target_numBars`.

  The frame is turned into one list of segment values per bar, and
  combine_2_bars is applied repeatedly until no more than `target_numBars`
  bars remain. A frame that already has few enough bars comes back with its
  bars unchanged, and merging always stops once a single bar is left.

  Parameters:
    df: DataFrame with the three named columns.
    varX_name: column holding the x-axis range labels.
    varY_name: column holding the segment values.
    varColor_name: column holding the stacked category labels.
    target_numBars: maximum number of bars wanted in the result.

  Returns:
    A new DataFrame with columns [varColor_name, varY_name, varX_name], one row
    per (bar, category) pair.

  Raises:
    IndexError: if some (x value, category) pair has no row in `df`.
  """
  x_values = sort_x_values(df, varX_name)
  stacked_options = list(df[varColor_name].unique())

  # One list of segment values per bar, in the order of stacked_options.
  # The first matching row is used for every (x value, category) pair.
  bars = []
  for currX in x_values:
    segments = []
    for currStack in stacked_options:
      tempdf = df.loc[(df[varX_name] == currX) & (df[varColor_name] == currStack)]
      segments.append(tempdf.iloc[0][varY_name])
    bars.append(segments)

  # Merge only while there are too many bars; never try to merge a lone bar.
  while len(x_values) > max(target_numBars, 1):
    x_values, bars = combine_2_bars(x_values, stacked_options, bars)

  resulting_rows = [
    [stacked_options[j], value, currX]
    for currX, currBar in zip(x_values, bars)
    for j, value in enumerate(currBar)
  ]
  return pd.DataFrame(resulting_rows, columns=[varColor_name, varY_name, varX_name])


def combine_2_bars(x_values, stacked_options, bars):
  """Merge the two neighbouring bars that are most alike into a single bar.

  Similarity is the Euclidean distance between the segment values of each pair
  of adjacent bars; the first pair with the smallest distance is merged. The
  merged bar holds the element-wise mean of the two bars, and its label joins
  the start of the first label with the end of the second ('0-5K' and '5K-10K'
  become '0-10K').

  Parameters:
    x_values: range labels of the form '<start>-<end>', one per bar.
    stacked_options: category labels; not used by this function.
    bars: list of per-bar lists of segment values.

  Returns:
    (new_x_values, new_bars) with one bar fewer than the input. The inputs are
    not modified.

  Raises:
    ValueError: if fewer than two bars are given.
  """
  # Distance between every bar and its right-hand neighbour.
  gaps = [
    np.linalg.norm(np.array(left) - np.array(right), 2)
    for left, right in zip(bars[:-1], bars[1:])
  ]
  merge_at = gaps.index(min(gaps))
  partner = merge_at + 1

  merged_bar = list(np.mean([np.array(bars[merge_at]), np.array(bars[partner])], axis=0))
  merged_label = x_values[merge_at].split('-')[0] + '-' + x_values[partner].split('-')[1]

  new_x_values = []
  new_bars = []
  for position, bar in enumerate(bars):
    if position == partner:
      # Already folded into the merged bar.
      continue
    if position == merge_at:
      new_bars.append(merged_bar)
      new_x_values.append(merged_label)
    else:
      new_bars.append(bar)
      new_x_values.append(x_values[position])

  return new_x_values, new_bars


def downsample_colors(df, varX_name, varY_name, varColor_name, target_numColors, target_category):
  """Fold the smaller categories of every bar into a single 'Other' category.

  An extra 'Other' category is appended after the existing ones. For each bar
  separately, the `target_numColors` largest segments are kept; when
  `target_category` is not 'None' it is always among the kept ones. The values
  of all remaining segments are summed into 'Other' and those segments are set
  to 0, so they stay in the result as zero-height rows.

  Parameters:
    df: DataFrame with the three named columns.
    varX_name: column holding the x-axis range labels.
    varY_name: column holding the segment values.
    varColor_name: column holding the stacked category labels.
    target_numColors: number of segments to keep per bar.
    target_category: category of interest, or 'None' for no preference.

  Returns:
    A new DataFrame with columns [varColor_name, varY_name, varX_name], one row
    per (bar, category) pair including 'Other'.
  """
  ordered_x = sort_x_values(df, varX_name)
  categories = list(df[varColor_name].unique()) + ['Other']
  other_slot = len(categories) - 1

  def segment_values(x_label):
    # Value of every real category for this bar, then an empty 'Other' slot.
    values = []
    for category in categories[:other_slot]:
      match = df.loc[(df[varX_name] == x_label) & (df[varColor_name] == category)]
      values.append(match.iloc[0][varY_name])
    values.append(0)
    return values

  bars = [segment_values(x_label) for x_label in ordered_x]

  keep_target = target_category != 'None'
  target_slot = categories.index(target_category) if keep_target else -1

  for values in bars:
    # Slot indices ranked from the largest segment to the smallest.
    ranked = sorted(range(len(values)), key=lambda slot: values[slot], reverse=True)
    if not keep_target:
      kept = ranked[:target_numColors]
    elif target_slot in ranked[:(target_numColors - 1)]:
      kept = ranked[:target_numColors]
    else:
      # Reserve one of the kept slots for the category of interest.
      kept = ranked[:(target_numColors - 1)] + [target_slot]

    overflow = 0
    for slot in range(len(values)):
      if slot not in kept:
        overflow = overflow + values[slot]
        values[slot] = 0
    values[-1] = overflow

  rows = [
    [categories[slot], value, x_label]
    for x_label, values in zip(ordered_x, bars)
    for slot, value in enumerate(values)
  ]
  return pd.DataFrame(rows, columns=[varColor_name, varY_name, varX_name])


def create_color_dict(df, varColor_name, varY_name):
  """Map every category of the stacked bar chart to a colour.

  Categories are taken in the order returned by bar_order_max_to_min and
  assigned the palette colours in turn. When there are more categories than
  palette entries the palette is reused from the start instead of failing.
  'Other' is always mapped to gray ('#969696').

  The colour choice comes from:
  https://colorbrewer2.org/#type=qualitative&scheme=Paired&n=10
  NOTE(review): the hex values look closer to ColorBrewer's Set3 scheme than
  to Paired - confirm which scheme was intended.

  Parameters:
    df: DataFrame containing both columns.
    varColor_name: column holding the category labels.
    varY_name: column holding the values used to rank the categories.

  Returns:
    A dict from category label to hex colour string, always including 'Other'.
  """
  possible_color_choices = ['#80b1d3', '#fdb462', '#bebada', '#fb8072', '#8dd3c7', '#F9E79F', '#b3de69', '#fccde5', '#bc80bd', '#ccebc5']
  stacks = bar_order_max_to_min(df, varColor_name, varY_name)
  color_dict = {
    stack: possible_color_choices[position % len(possible_color_choices)]
    for position, stack in enumerate(stacks)
  }
  color_dict['Other'] = '#969696'
  return color_dict


def bar_order_with_target(bar_order, target):
  """Return the category order with the variable of interest moved to the front.

  The remaining categories keep their relative order. When `target` is 'None'
  the order is returned unchanged. The list passed in is never modified; a new
  list is returned in every case.

  Parameters:
    bar_order: list of category labels.
    target: category of interest, or 'None' for no preference.

  Returns:
    A new list of category labels.

  Raises:
    ValueError: if `target` is not 'None' and is not in `bar_order`.
  """
  reordered = list(bar_order)
  if target == 'None':
    return reordered
  reordered.remove(target)
  reordered.insert(0, target)
  return reordered


def bar_order_from_orig(bar_order, orig_bar_order):
  """Reorder `bar_order` so it follows the order of `orig_bar_order`.

  Categories that appear in both lists come first, in the order they have in
  `orig_bar_order`. Categories found only in `bar_order` follow, in their
  existing order. This is used for the medium design, which has fewer
  categories but keeps the ordering of the large baseline design. Neither
  input list is modified.

  Parameters:
    bar_order: list of category labels to reorder.
    orig_bar_order: list of category labels giving the reference order.

  Returns:
    A new list with the same entries as `bar_order`.
  """
  # Work on a copy so the caller's list keeps all of its entries.
  remaining = list(bar_order)
  result = []
  for category in orig_bar_order:
    if category in remaining:
      result.append(category)
      remaining.remove(category)

  result.extend(remaining)
  return result


def determine_colors(color_dict, barOrder, category, tiny):
  """Return the colour for every category, in the order given by `barOrder`.

  In the regular designs each category gets its colour from `color_dict`. In
  the tiny design only the highlighted category keeps its colour and every
  other category is drawn in gray ('#969696'). The highlighted category is
  `category`, or the first entry of `barOrder` when `category` is 'None'.

  Parameters:
    color_dict: dict from category label to hex colour string.
    barOrder: list of category labels in stacking order.
    category: category of interest, or 'None' for no preference.
    tiny: True to build the colours for the tiny design.

  Returns:
    A list of hex colour strings, one per entry of `barOrder`.
  """
  highlighted = barOrder[0] if category == 'None' else category
  if not tiny:
    return [color_dict[name] for name in barOrder]
  return [color_dict[name] if name == highlighted else '#969696' for name in barOrder]

In [5]:
# Build the four chart designs (large, medium, small, tiny) from the selected
# dataset. Every design uses the same colour mapping, computed once from the
# full data so a category has the same colour in each chart.
color_dict = create_color_dict(df, varColor_name, varY_name)

### LARGE ###### LARGE ###### LARGE ###### LARGE ###### LARGE ###
# Baseline design: the full data with every bar and every category, grid lines,
# axis titles, a chart title, and a legend below the chart.
x_values_options = sort_x_values(df, varX_name)
barOrder = bar_order_max_to_min(df, varColor_name, varY_name)
# Keep a copy of the baseline stacking order; the smaller designs reuse it.
orig_barOrder = copy.deepcopy(barOrder)
colorOrder = determine_colors(color_dict, barOrder, orig_categories_order[category_choice], False)

# NOTE(review): the `order` field name below presumably refers to the
# sort-index field Altair generates for the sorted colour encoding - confirm
# against the installed Altair version.
large = alt.Chart(df, title=chart_title).mark_bar(size=24.5).encode(
    x = alt.X(varX_name, sort=x_values_options, title=x_axis_title),
    y = alt.Y(varY_name, title=y_axis_title, axis=alt.Axis(grid=True)),
    color = alt.Color(varColor_name, sort=barOrder, scale=alt.Scale(domain=barOrder, range=colorOrder)),
    order = alt.Order('color_' + varColor_name + '_sort_index:Q')
).properties(
    width=800, height=380).configure_legend(
        orient='bottom', columns=4, labelFontSize=12).configure_axis(
            labelFontSize=12, titleFontSize=20).configure_title(
                fontSize=24).configure_view(
                    strokeWidth=0)

### MEDIUM ###### MEDIUM ###### MEDIUM ###### MEDIUM ###### MEDIUM ###
# Medium design: bar target of 15 and 5 colours per bar, with the stacking
# order kept in line with the large design. Grid lines are dropped.
new_df = downsample_bars(df, varX_name, varY_name, varColor_name, 15)
new_df = downsample_colors(new_df, varX_name, varY_name, varColor_name, 5, orig_categories_order[category_choice])
x_values_options = sort_x_values(new_df, varX_name)
barOrder = bar_order_max_to_min(new_df, varColor_name, varY_name)
barOrder = bar_order_from_orig(barOrder, orig_barOrder)
colorOrder = determine_colors(color_dict, barOrder, orig_categories_order[category_choice], False)

medium = alt.Chart(new_df, title=chart_title).mark_bar(size=19).encode(
    x = alt.X(varX_name, sort=x_values_options, title=x_axis_title),
    y = alt.Y(varY_name, title=y_axis_title, axis=alt.Axis(grid=False)),
    color = alt.Color(varColor_name, sort=barOrder, scale=alt.Scale(domain=barOrder, range=colorOrder)),
    order = alt.Order('color_' + varColor_name + '_sort_index:Q')
).properties(
    width=480, height=250).configure_legend(
        orient='bottom', columns=3, labelFontSize=12).configure_axis(
            labelFontSize=10, titleFontSize=12).configure_title(
                fontSize=20).configure_view(
                    strokeWidth=0)
                
### SMALL ###### SMALL ###### SMALL ###### SMALL ###### SMALL ###
# Small design: bar target of 10 and 3 colours per bar. The category of
# interest (if any) is moved to the front of the stacking order. There is no
# chart title, no legend, and no x-axis labels.
new_df = downsample_bars(df, varX_name, varY_name, varColor_name, 10)
new_df = downsample_colors(new_df, varX_name, varY_name, varColor_name, 3, orig_categories_order[category_choice])
x_values_options = sort_x_values(new_df, varX_name)
barOrder = bar_order_max_to_min(new_df, varColor_name, varY_name)
barOrder = bar_order_from_orig(barOrder, orig_barOrder)
barOrder = bar_order_with_target(barOrder, orig_categories_order[category_choice])
colorOrder = determine_colors(color_dict, barOrder, orig_categories_order[category_choice], False)

small = alt.Chart(new_df).mark_bar(size=15).encode(
    x = alt.X(varX_name, sort=x_values_options, axis=alt.Axis(labels=False)),
    y = alt.Y(varY_name, axis=alt.Axis(grid=False)),
    color = alt.Color(varColor_name, sort=barOrder, scale=alt.Scale(domain=barOrder, range=colorOrder), legend=None),
    order = alt.Order('color_' + varColor_name + '_sort_index:Q')
).properties(
    width=250, height=150).configure_axis(
            labelFontSize=10, titleFontSize=12).configure_title(
                fontSize=14).configure_view(
                    strokeWidth=0)

### TINY ###### TINY ###### TINY ###### TINY ###### TINY ###
# Tiny design: bar target of 5 and no colour downsampling. Instead,
# determine_colors is called with tiny=True, which colours only the category
# of interest (or the first category when none was chosen) and draws the rest
# in gray. Axis labels, axis titles, and the legend are all removed.
new_df = downsample_bars(df, varX_name, varY_name, varColor_name, 5)
x_values_options = sort_x_values(new_df, varX_name)
barOrder = bar_order_max_to_min(new_df, varColor_name, varY_name)
barOrder = bar_order_from_orig(barOrder, orig_barOrder)
barOrder = bar_order_with_target(barOrder, orig_categories_order[category_choice])
colorOrder = determine_colors(color_dict, barOrder, orig_categories_order[category_choice], True)

tiny = alt.Chart(new_df).mark_bar(size=12).encode(
    x = alt.X(varX_name, sort=x_values_options, axis=alt.Axis(labels=False, title="")),
    y = alt.Y(varY_name, axis=alt.Axis(grid=False, labels=False, title="")),
    color = alt.Color(varColor_name, sort=barOrder, scale=alt.Scale(domain=barOrder, range=colorOrder), legend=None),
    order = alt.Order('color_' + varColor_name + '_sort_index:Q')
).properties(
    width=100, height=60).configure_view(
                    strokeWidth=0)

In [6]:
# Display the large (800 x 380) baseline chart.
large

In [7]:
# Display the medium (480 x 250) chart.
medium

In [8]:
# Display the small (250 x 150) chart.
small

In [9]:
# Display the tiny (100 x 60) chart.
tiny