# Episode 02: Is this Data Science?

### MovieStatsFlow loads the movie metadata CSV file into a pandas DataFrame and computes some genre-specific movie statistics. You can use this notebook and the Metaflow client to eyeball the results and make some simple plots.

## Import the metaflow client

In [None]:
from metaflow import Flow, get_metadata
import matplotlib.pyplot as plt
import pandas as pd
# Print the current metadata provider as reported by get_metadata().
metadata_provider = get_metadata()
print("Current metadata provider: %s" % metadata_provider)

## Get the movie statistics from the latest successful run of MovieStatsFlow

In [None]:
# Look up the most recent run of MovieStatsFlow that completed successfully.
run = Flow('MovieStatsFlow').latest_successful_run
if run is None:
    # latest_successful_run is None when the flow has no successful run yet.
    # Fail here with an actionable message rather than with an
    # AttributeError on `run.data` below.
    raise RuntimeError(
        "No successful run of MovieStatsFlow found; run the flow first."
    )
print("Using run: %s" % str(run))

# The artifact the cells below plot from: they read it as a mapping of
# genre -> dict with 'quartiles' and 'dataframe' entries.
genre_stats = run.data.genre_stats

## Create a bar plot of the median gross box office for the top-5 grossing genres

In [None]:
# Pair every genre with its median gross. Index 1 of 'quartiles' is used as
# the median (presumably the 25th/50th/75th percentiles -- confirm against
# MovieStatsFlow).
data = [(name, stats['quartiles'][1])
        for name, stats in genre_stats.items()]

# Sort ascending by median exactly once, so the highest-grossing genres end
# up at the tail of the list. `sorted_data` is reused by the line-plot cell.
sorted_data = sorted(data, key=lambda pair: pair[1])

# Unpack the sorted pairs into parallel tuples of labels and medians.
genre, median = zip(*sorted_data)

# Create the bar plot for the five genres with the highest median gross.
plt.bar(genre[-5:], median[-5:], align='center', alpha=0.5)
plt.ylabel("Gross Box office (US Dollars)")
plt.show()

## Create a line plot of the gross box office for every decade for the top-5 grossing genres

In [None]:
# Names of the five genres with the highest median gross. `sorted_data` is
# sorted ascending by median, so they are its last five entries.
top_genres = {name for name, _ in sorted_data[-5:]}

# NOTE(review): the decade column is assumed to be 'title_year_by_decade',
# the name used by the existence check and its message -- confirm against
# the dataframe produced by MovieStatsFlow.
decade_column = 'title_year_by_decade'

# Draw every genre on one explicit set of axes.
fig, ax = plt.subplots()
for genre_name, stats in genre_stats.items():
    if genre_name not in top_genres:
        continue
    df = stats['dataframe']
    if decade_column not in df:
        print('Column: title_year_by_decade not found in df')
        break
    # Select 'gross' before summing so only that column is aggregated, and
    # label each line with its own genre so the legend cannot be mismatched.
    df.groupby(decade_column)['gross'].sum().plot(
        ax=ax, linewidth=10, label=genre_name)

# Only build a legend when at least one line was drawn.
if ax.lines:
    ax.legend()
ax.set_xlim(1960, 2020)
plt.show()