In [None]:
import pandas as pd
import altair as alt
import math
import numpy as np

In [None]:
# Turn off Altair's max-rows check on chart data so the DataFrames loaded
# below can be passed to alt.Chart directly.
alt.data_transformers.disable_max_rows()

## Load datasets and clean variable names

In [None]:
# Load the series table (path is relative to the working directory).
# Judging by the file name this holds TV series with over 1k votes — TODO confirm.
series = pd.read_csv("../data/tv_series_over_1k_votes.csv")

In [None]:
# Load the episode details table (path is relative to the working directory).
episodes = pd.read_csv("../data/episodes_details.csv")

In [None]:
# Column-name mappings: raw CSV header -> camelCase name.
# DataFrame.rename ignores keys that are not present as columns, so a
# mapping entry for a column missing from the CSV is silently skipped.
series_renaming = {
    "tconst": "seriesId",
    "primaryTitle": "seriesTitle",
    "SeriesName": "seriesFullName",
}
series = series.rename(columns=series_renaming)

episodes_renaming = {
    "Title_show_name_tconst": "seriesId",
    "Title_basics_tconst": "episodeId",
    "Title_show_name_primaryTitle": "seriesTitle",
    "Title_basics_primaryTitle": "episodeTitle",
    "SeriesName": "seriesFullName",
}
episodes = episodes.rename(columns=episodes_renaming)

## Cleanup

In [None]:
# Show the row count before and after discarding incomplete rows.
print(series.shape[0])
series = series.dropna()  # removes every row with a missing value in any column
print(series.shape[0])

# Store startYear as an integer column, then display the resulting dtypes.
series = series.astype({"startYear": "int"})
series.dtypes

In [None]:
# Clean "\\N" characters in "episodeNumber" & "seasonNumber"
#episodes["seasonNumber"] = episodes["seasonNumber"].replace(r"\\N",np.nan, regex=True)
#episodes["episodeNumber"] = episodes["episodeNumber"].replace(r"\\N",np.nan, regex=True)
#episodes["episodeNumber"].sort_values()

# Drop missing values
#print(len(episodes))
#episodes.dropna(inplace=True)
#print(len(episodes))
#episodes.sort_values(by="seasonNumber", )

# TODO: investigate dropped rows

## Create new variables

In [None]:
# Create decade
# Drop rows containing any missing value before deriving the decade. The
# Cleanup cell already applies the same dropna(), so this repeats that step.
series = series.dropna()

def get_decade(x):
    """Return the decade label for ``x`` as a string.

    The value is converted with ``str`` and its final character is replaced
    by ``"0"`` (e.g. ``1995`` -> ``"1990"``). Intended for integer years.
    """
    year_text = str(x)
    return f"{year_text[:-1]}0"

# Make sure startYear is an integer so its string form has no decimal part,
# then derive the decade label for every row.
series = series.astype({"startYear": "int"})
series["decade"] = series["startYear"].apply(get_decade)
series

In [None]:
# TODO: create total number of episodes
# TODO: create total number of seasons


## Exploratory charts

In [None]:
# Bar chart of the series ranked in the top 10 by number of votes, coloured by
# average rating.
# The title field is "seriesTitle": the rename step earlier in the notebook maps
# "primaryTitle" to "seriesTitle", so "primaryTitle" is not a column of `series`.
alt.Chart(series).mark_bar(tooltip=True).encode(
    alt.X("numVotes:Q"),
    alt.Y("seriesTitle:N", sort="-x"),  # order bars by descending x value
    color="averageRating:O",
).transform_window(
    # Rank rows by descending vote count ...
    rank="rank(numVotes)",
    sort=[alt.SortField("numVotes", order="descending")],
).transform_filter(
    # ... and keep only ranks 1-10.
    alt.datum.rank <= 10
)

In [None]:
# Scatter plot of average rating against number of votes for series with at
# least 10,000 votes, coloured by decade; .interactive() enables pan and zoom.
# The tooltip uses "seriesTitle": the rename step earlier in the notebook maps
# "primaryTitle" to "seriesTitle", and an untyped field name that is not a
# DataFrame column cannot have its encoding type inferred by Altair.
alt.Chart(series).mark_circle().transform_filter(
    "datum.numVotes >= 10000"
).encode(
    alt.X("numVotes:Q"),
    alt.Y("averageRating:Q"),
    alt.Color("decade:N"),
    tooltip=["seriesTitle", "genres"],
    opacity=alt.value(0.7),
).properties(width=600, height=600).interactive()