# Data Visualization for Introduction

In [None]:
import sqlite3
import plotly.express as px

import dask.dataframe as dd
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
from dask.distributed import Client
from matplotlib import pyplot as plt
from scipy.stats import pearsonr

from utils.constants import BigMarketsSubset, SmallMarketsSubset, Correspondence, BigMarkets, SmallMarkets, MediumMarkets
from data.API_KEYS import chart_studio_key
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from math import log
import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

import logging

# Chart Studio account name passed to the credentials helper below.
username = "abiola_adeye"
# The key itself lives in data/API_KEYS.py (imported above), not in the notebook.
api_key = chart_studio_key

# Register the credentials with chart_studio; presumably this is what lets the
# later `py.plot(...)` uploads authenticate — confirm against the chart_studio docs.
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

In [None]:
# Start a dask.distributed client with 6 workers; `silence_logs` is set to logging.ERROR.
client = Client(n_workers=6, silence_logs=logging.ERROR)
# Last expression of the cell: displays the client summary.
client

# Data Preparation

<div class="alert alert-block alert-info">
Our first step is to prepare the dataframe on which we will conduct our analysis. We load our data in parquet format and keep only the video categories that are the most insightful for us. This lets us focus on the videos that are most relevant to our research, removes noise from irrelevant categories and makes our queries faster. <br>
Concretely, we only keep the categories Sports, People & Blogs and Entertainment.
</div>

In [None]:
# Read the parquet video metadata with dask and convert the upload dates to datetimes.
raw_video_metadata = dd.read_parquet("./data/video_metadata/parquet/")
video_metadata_df = raw_video_metadata.assign(upload_date=dd.to_datetime(raw_video_metadata["upload_date"]))

In [None]:
# Keep only the three categories of interest.
kept_categories = ["Sports", "People & Blogs", "Entertainment"]
in_kept_category = video_metadata_df["categories"].isin(kept_categories)
video_metadata_df_sports = video_metadata_df[in_kept_category]

# Lower-case the tag string and wrap it in commas, so that a whole tag can later
# be matched as ",tag," (first and last tags included).
lowercase_tags = video_metadata_df_sports["tags"].str.lower()
video_metadata_df_sports["tags"] = lowercase_tags.apply(lambda tag_string: f",{tag_string},")

In [None]:
# Count uploads per (year, month, category) among the selected categories.
# Every grouping key must come from the SAME filtered frame that is being grouped:
# the original took `categories` from the unfiltered `video_metadata_df`, which is
# not aligned with the filtered rows (dask refuses to group by an unaligned series).
date_obj = video_metadata_df_sports.upload_date.dt
group_by = [date_obj.year, date_obj.month, video_metadata_df_sports.categories]

# persist() keeps the aggregated result in the workers' memory for later reuse.
video_metadata_df_sports_grouped = video_metadata_df_sports[["upload_date"]].groupby(by=group_by).count().persist()

In [None]:
required_tags = ["nba", "basketball"]

# Keep the videos whose tag list contains AT LEAST ONE of the required tags.
# Tags were wrapped in commas earlier, so the regex ",nba,|,basketball," only
# matches whole tags.
required_tags_pattern = "," + ",|,".join(required_tags) + ","
has_required_tag = video_metadata_df_sports["tags"].str.contains(required_tags_pattern)
nba_basketball_df = video_metadata_df_sports[has_required_tag].persist()

# Month-level timestamp built from the upload date, used as the monthly grouping key.
upload_year = nba_basketball_df.upload_date.dt.year.astype("str")
upload_month = nba_basketball_df.upload_date.dt.month.astype("str")
nba_basketball_df["year_month"] = dd.to_datetime(upload_year + "-" + upload_month)

In [None]:
# Preview the first rows of the filtered NBA/basketball videos.
nba_basketball_df.head()

In [None]:
# Display the big-market subset imported from utils.constants (team -> tags, as iterated below).
BigMarketsSubset

In [None]:
# Display the small-market subset imported from utils.constants (team -> tags, as iterated below).
SmallMarketsSubset

<div class="alert alert-block alert-info">
We group the videos by team, sum the number of views of each team on a monthly basis, label each team with its market size and finally compute its share of all NBA/basketball views.
</div>

In [None]:
# Total views of all NBA/basketball videos, one row per month (indexed by year_month).
# NOTE: despite the "per_year" in the name, the aggregation key is the month.
nba_basketball_views_per_year_df = (
    nba_basketball_df[["year_month", "view_count"]]
    .groupby("year_month")
    .sum()
    .persist()
    .compute()
    .rename(columns={"view_count": "total_nba_views"})
)

dd_dict = {}

# Monthly view totals for every team of the big-market and small-market subsets.
selected_teams = {**BigMarketsSubset, **SmallMarketsSubset}
for team, team_tags in selected_teams.items():
    print(team, team_tags)

    # A video belongs to a team when its tag list contains any of the team's tags.
    team_pattern = "," + ",|,".join(team_tags) + ","
    mentions_team = nba_basketball_df["tags"].str.contains(team_pattern)

    monthly_team_views = nba_basketball_df[mentions_team][["year_month", "view_count"]].groupby("year_month").sum().reset_index()
    monthly_team_views["team"] = team
    monthly_team_views["market_size"] = "big" if team not in SmallMarketsSubset else "small"

    # Materialise as a pandas frame and store it under the team name.
    dd_dict[team] = monthly_team_views.persist().compute()

In [None]:
def _season_label(month_start):
    """Return the "YYYY-YYYY" season label of a timestamp; months before October count towards the season that started the previous year."""
    if month_start.month < 10:
        return f"{month_start.year - 1}-{month_start.year}"
    return f"{month_start.year}-{month_start.year + 1}"


# Stack the per-team monthly frames and attach the monthly total over all NBA/basketball videos.
all_team_results = pd.concat(list(dd_dict.values())).merge(nba_basketball_views_per_year_df, on="year_month")

# Share of the month's NBA/basketball views captured by each team.
all_team_results["total_nba_view_percentage"] = all_team_results["view_count"] / all_team_results["total_nba_views"]
all_team_results["season"] = all_team_results["year_month"].apply(_season_label)

In [None]:
# Save the per-team monthly results (index not written); the same path is read back later in the notebook.
all_team_results.to_csv("./data/all_team_results.csv", index=False)

# Data Visualization for Introduction

### Generate visualization for comparison of major sports leagues in the US

In [None]:
# Load the league comparison table; the plot below uses its "League",
# "Viewers (in millions)" and "US TV rev. (in billions of $)" columns.
major_league_comparison = pd.read_csv("data/major_league_comparison.csv")

# Display the table as the cell output.
major_league_comparison

In [None]:
# Grouped bar chart: one pair of bars (viewers, TV revenue) per league.
compared_metrics = ["Viewers (in millions)", "US TV rev. (in billions of $)"]
league_ax = major_league_comparison.plot(
    x="League",
    y=compared_metrics,
    kind="bar",
    rot=0,
    title="US major leagues viewership and revenue comparison",
)

# Show the legend on the chart
league_ax.legend()

# Write the figure to disk at 300 dpi
league_ax.get_figure().savefig("major_league_viewership_comparison.png", dpi=300)

### Generate visualization for evolution of NBA videos views

In [None]:
# year_month is already the index of the monthly totals: sort chronologically,
# then turn it back into a regular column so it can be used as the x axis.
nba_views_by_month = nba_basketball_views_per_year_df.sort_index(ascending=True).reset_index()

# Only plot the months up to (and including) 2018-01-01
until_2018 = nba_views_by_month[nba_views_by_month.year_month <= "2018-01-01"]
views_ax = until_2018.plot(x="year_month", y="total_nba_views", legend=None)

# Cosmetics
plt.xticks(rotation=45)
views_ax.set_xlabel("Date")
views_ax.set_ylabel("Aggregate views (in billions)")
views_ax.set_title('Viewership evolution of videos containing the keyword "NBA"', pad=15)
plt.tight_layout()

# Export the figure
plt.savefig("viewership_evolution_nba.jpeg", dpi=400)

# Big Market vs Small Market Teams

## Data Visualization for the Market Size Comparison

In [None]:
def to_real_market_size(elem):
    """Convert a market-size label such as "7.45M" or "950K" to a float in millions.

    Parameters
    ----------
    elem : str or any
        Raw cell value. A trailing "M" means millions and a trailing "K"
        means thousands; surrounding whitespace is ignored.

    Returns
    -------
    float
        The size expressed in millions, or NaN when the value is not a string
        (e.g. a missing cell read as NaN/None) or carries no "M"/"K" suffix.

    Raises
    ------
    ValueError
        If the text before the suffix is not a valid number.
    """
    # Missing cells arrive as float NaN / None; `"M" in elem` would raise TypeError on them.
    if not isinstance(elem, str):
        return np.nan

    text = elem.strip()
    # Check the suffix rather than "contains", since it is the last character that gets stripped.
    if text.endswith("M"):
        return float(text[:-1])
    if text.endswith("K"):
        return float(text[:-1]) / 1000
    return np.nan


def _market_type(tv_market_size_in_millions):
    """Label a TV market size: above 2 is big, below 1.5 is small, anything else is medium."""
    if tv_market_size_in_millions > 2:
        return "BIG MARKET"
    if tv_market_size_in_millions < 1.5:
        return "SMALL MARKET"
    return "MEDIUM MARKET"


market_size_df = pd.read_csv("./data/market_sizes.csv")
# Parse the "…M" / "…K" labels into floats expressed in millions.
market_size_df["TV MARKET SIZE"] = market_size_df["TV MARKET SIZE"].apply(to_real_market_size)
# Scale the metro population by 10**6.
market_size_df["METRO POPULATION"] = market_size_df["METRO POPULATION"].apply(lambda inhabitants: inhabitants / (10**6))
market_size_df["MARKET TYPE"] = market_size_df["TV MARKET SIZE"].apply(_market_type)

In [None]:
# Preview the first rows of the market-size table built above.
market_size_df.head()

In [None]:
tv_market_size = market_size_df["TV MARKET SIZE"]
population = market_size_df["METRO POPULATION"]
team_name = market_size_df["TEAM"]

# Fonts: title/axis labels, per-team labels, market-type captions
font = fm.FontProperties(family="Ubuntu", size=20, weight="bold")
label_font = fm.FontProperties(family="Ubuntu", size=12, style="italic")
market_font = fm.FontProperties(family="Ubuntu", size=16, weight="bold")

# One bubble per team; the bubble area grows with population * TV market size
fig, ax = plt.subplots(figsize=(18, 10))
bubble_area = population * tv_market_size * 10
ax.scatter(tv_market_size, population, s=bubble_area, c=market_size_df.index)

# Red vertical lines at the two thresholds used for the MARKET TYPE labels
for threshold in (2, 1.5):
    ax.axvline(x=threshold, color="r", linestyle="-")

# Light grid in the background
ax.grid(True, linestyle="-", linewidth=0.5, color="lightgray")

# Team name slightly up and to the right of each bubble
for name, market_x, population_y in zip(team_name, tv_market_size, population):
    ax.annotate(name, (market_x * 1.03, population_y * 1.02), font=label_font)

# Title and axis labels
ax.set_title("TV Market Size vs Metro Population", fontproperties=font)
ax.set_xlabel("TV Market Size", fontproperties=font)
ax.set_ylabel("Metro Population", fontproperties=font)

# Captions for the regions on either side of the threshold lines
for caption, position in (("SMALL MARKET", (0.75, 17.5)), ("BIG MARKET", (2.4, 17.5))):
    ax.annotate(caption, position, font=market_font)

# Both axes on a logarithmic scale
ax.set_xscale("log")
ax.set_yscale("log")

In [None]:
# Interactive version of the market-size scatter plot.
# (plotly.express is already imported as `px` in the imports cell at the top.)
fig = px.scatter(
    market_size_df,
    x="TV MARKET SIZE",
    y="METRO POPULATION",
    custom_data=["TEAM"],
    color="MARKET TYPE",
    size=market_size_df["METRO POPULATION"],
    log_x=True,
    log_y=False,
)

# Red vertical lines spanning the full plot height (yref="paper") at the market-type thresholds.
# NOTE(review): the line at x=2 carries name="1.5M" — looks like a copy-paste slip; confirm.
threshold_line = dict(color="red", width=2)
fig.add_shape(type="line", x0=2, x1=2, y0=0, y1=1, name="1.5M", yref="paper", line=threshold_line)
fig.add_shape(type="line", x0=1.5, x1=1.5, y0=0, y1=1, yref="paper", line=threshold_line)

# Hover box only shows the team name
fig.update_traces(hovertemplate="<b>%{customdata[0]}</b>")

# Axis titles, hover styling, axis ranges and legend placement in a single layout update.
axis_title_font = dict(family="Ubuntu", size=12)
fig.update_layout(
    xaxis_title="TV Market Size (in millions)",
    xaxis_title_font=axis_title_font,
    yaxis_title="Metro Population (in millions)",
    yaxis_title_font=axis_title_font,
    hoverlabel=dict(
        bgcolor="white",
        font_size=12,
        font_family="Ubuntu",
    ),
    yaxis_range=[0, 15],
    xaxis_range=[0, 0.8],
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=0.8),
)
fig.show()

# Export as a standalone HTML file and upload to Chart Studio
fig.write_html("data/team_market_size_comparison.html")

py.plot(fig, filename="market_size_comparison", auto_open=False)

## Views distribution study 

In [None]:
# Work on a filtered copy: rows dated in 2006 and the whole 2019-2020 season are excluded.
outside_2006 = all_team_results.year_month.dt.year != 2006
outside_2019_2020 = all_team_results.season != "2019-2020"
df = all_team_results[outside_2006 & outside_2019_2020].copy()

In [None]:
# Sum the monthly view shares per season and market size.
# Only the share column is aggregated: summing every column (as before) also hits the
# datetime `year_month` and string `team` columns, which raises on pandas >= 2.0 where
# non-numeric columns are no longer silently dropped.
small_vs_big = df.groupby(["season", "market_size"], as_index=False)["total_nba_view_percentage"].sum()

# One frame per market size, used as the two curves of the comparison plot.
big_market_percentage_df = small_vs_big[small_vs_big["market_size"] == "big"]
small_market_percentage_df = small_vs_big[small_vs_big["market_size"] == "small"]

small_vs_big

In [None]:
sns.set_style("darkgrid")

share_column = "total_nba_view_percentage"
big_seasons = big_market_percentage_df["season"]
small_seasons = small_market_percentage_df["season"]

# Shade the gap between the two curves, then draw the curves on top of it.
plt.fill_between(
    big_seasons,
    big_market_percentage_df[share_column],
    small_market_percentage_df[share_column],
    color="peachpuff",
    alpha=0.3,
)
plt.plot(big_seasons, big_market_percentage_df[share_column], color="firebrick", label="Big Markets")
plt.plot(small_seasons, small_market_percentage_df[share_column], color="steelblue", label="Small Markets")

current_axes = plt.gca()
current_axes.set_xlabel("Year")
current_axes.set_ylabel("Percentage")
current_axes.set_title("Small vs Big Markets Percentage of all NBA video views")
current_axes.legend()

plt.tight_layout()

# Tilt the season labels and right-align them so they stay readable
plt.setp(current_axes.get_xticklabels(), rotation=45, horizontalalignment="right")

plt.savefig("small_vs_big_market_percentge_comparison.png", dpi=300, bbox_inches="tight")

plt.show()

In [None]:
# Per-season totals over all selected teams.
# The numeric columns are selected explicitly: a blanket `.sum()` also tries to sum the
# datetime `year_month` and string `team` columns, which raises on pandas >= 2.0.
percentage_per_season = df.groupby("season")[["view_count", "total_nba_view_percentage"]].sum()

# Per-season, per-team totals (market_size is constant per team, so "first" is enough).
views_per_team = df.groupby(["season", "team"]).agg({"total_nba_view_percentage": "sum", "market_size": "first", "view_count": "sum"})

# Divide each team's share by the season total of the subset, so the shares of the
# selected teams add up to 1 within every season.
views_per_team["subset_percentage"] = views_per_team["total_nba_view_percentage"].div(
    percentage_per_season["total_nba_view_percentage"], axis=0, level="season"
)
views_per_team = views_per_team.rename(columns={"view_count": "Views"}).reset_index()
views_per_team

In [None]:
# create a plotly pie chart, where the season is an additional dimension
fig = px.pie(
    views_per_team,
    values="subset_percentage",
    names="team",
    color="team",
    title="Percentage of all NBA video views by team",
    color_discrete_map={"big": "firebrick", "small": "steelblue"},
    labels={"subset_percentage": "Percentage of all NBA video views by team"},
    hover_data=["market_size"],
    width=800,
    height=800,
)

fig

In [None]:
# Animated bar chart: one frame per season, one bar per team, coloured by market size.
subset_share_percent = views_per_team.subset_percentage * 100
fig = px.bar(
    views_per_team,
    x="team",
    y=subset_share_percent,
    color="market_size",
    barmode="stack",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    animation_frame="season",
    animation_group="team",
    custom_data=["Views"],
)

# Hover box shows the raw number of views carried in custom_data
fig.update_traces(hovertemplate="<b>Number of views: %{customdata[0]}</b>")

# Axis/legend titles, fixed y range (so frames are comparable) and hover styling
fig.update_layout(
    xaxis_title="Team",
    yaxis_title="Percentage",
    legend_title="Market Size",
    title_x=0.5,
    yaxis_range=[0, 55],
    hoverlabel=dict(
        bgcolor="white",
        font_size=12,
        font_family="Ubuntu",
    ),
)

# Upload to Chart Studio without opening a browser tab
py.plot(fig, filename="distribution_of_views_animation", auto_open=False)

fig

## Direction of Free Agent Moves in the NBA in 2021 and 2022

In [None]:
free_agent_moves_2021 = pd.read_csv("./data/free_agent_moves/2021_free_agent_moves.csv")
free_agent_moves_2022 = pd.read_csv("./data/free_agent_moves/2022_free_agent_moves.csv")

free_agent_moves = pd.concat([free_agent_moves_2021, free_agent_moves_2022])

# Keep only the rows whose NEWS text starts with "Signs with".
# `na=False` treats a missing NEWS value as "no match" (this replaces the `| False` trick),
# and `.copy()` makes the column assignments below act on an independent frame rather
# than on a slice of the concatenated one.
is_signing = free_agent_moves["NEWS"].str.startswith("Signs with", na=False)
free_agent_moves = free_agent_moves[is_signing].copy()

# Destination = third whitespace-separated token of NEWS ("Signs with <token> ...");
# the origin is left empty here.
free_agent_moves["TO"] = free_agent_moves["NEWS"].str.split(" ").str[2]
free_agent_moves["FROM"] = ""

# Export kept commented out — presumably so that a re-run does not overwrite the
# manually completed file that the next cell reloads.
# free_agent_moves.to_csv("./data/free_agent_moves/free_agent_moves.csv", index=False)

In [None]:
# reload file after manually completing "TO" column
free_agent_moves = pd.read_csv("./data/free_agent_moves/free_agent_moves.csv")
free_agent_moves.drop(["EXP", "NEWS", "TYPE", "POS"], axis=1, inplace=True)

In [None]:
def map_market_size(team):
    """Return an ordinal market-size rank for a team: 2 if it is in BigMarkets, 1 if in MediumMarkets, 0 for anything else."""
    if team in BigMarkets:
        return 2
    return 1 if team in MediumMarkets else 0


# Rank the destination and origin team of every move by market size.
for direction in ("TO", "FROM"):
    free_agent_moves[f"{direction}_MARKET"] = free_agent_moves[direction].map(map_market_size)

free_agent_moves

In [None]:
# Players with a positive PPG who moved to a smaller market
moved_to_smaller_market = free_agent_moves["TO_MARKET"] < free_agent_moves["FROM_MARKET"]
free_agent_moves[(free_agent_moves["PPG"] > 0) & moved_to_smaller_market]

In [None]:
# Players with a positive PPG who moved to a bigger market
moved_to_bigger_market = free_agent_moves["TO_MARKET"] > free_agent_moves["FROM_MARKET"]
free_agent_moves[(free_agent_moves["PPG"] > 0) & moved_to_bigger_market]

In [None]:
# Players with a positive PPG who stayed in the same market tier
stayed_in_same_market_tier = free_agent_moves["TO_MARKET"] == free_agent_moves["FROM_MARKET"]
free_agent_moves[(free_agent_moves["PPG"] > 0) & stayed_in_same_market_tier]


## Study on the correlation between views and winning percentage

In [None]:
# Reload the per-team monthly results written to this path earlier in the notebook,
# parsing year_month back into datetimes.
all_team_results = pd.read_csv("./data/all_team_results.csv", parse_dates=["year_month"])

# Display the reloaded table as the cell output.
all_team_results

<div class="alert alert-block alert-info">
To compare our view ratio with the teams' results, we import from our database, for each team, the monthly occupancy rate of its stadium and its winning rate since the beginning of the season.
</div>

In [None]:
# NOTE(review): the connection is left open, as in the original cell — close it once no
# other cell needs `con`.
con = sqlite3.connect("data/nba_api.sqlite")

# Monthly average attendance and occupancy rate of every home team.
# The ratio is written as `attendance * 100.0 / season_high_attendance` so that the
# division happens in floating point: with integer columns, SQLite's
# `attendance / season_high_attendance` truncates to 0 or 1 before the `* 100`.
occupancy_rate_df = pd.read_sql_query(
    """select home, avg(attendance) as avg_attendace, round(avg(attendance * 100.0 / season_high_attendance),2) as avg_occupancy_rate, strftime('%Y-%m',time) as year_month
from game_data
group by home, year_month""",
    con,
)
# reset_index() adds an "index" column, which the merge cell drops again.
occupancy_rate_df = occupancy_rate_df.reset_index()

display(occupancy_rate_df.home.unique())

# Keep only the teams known to Correspondence; .copy() so later column assignments
# act on an independent frame instead of a slice.
occupancy_rate_df = occupancy_rate_df[occupancy_rate_df["home"].isin(Correspondence.keys())].copy()


# Season winning percentage as of each team's LAST home game of every month.
# The previous query mixed `last_value(...) over (...)` with `group by`: window functions
# run after the grouping, on one arbitrary row per group, so the value was not guaranteed
# to be the month's last one. Ranking the games newest-first inside each (team, month)
# and keeping rank 1 selects the last game explicitly.
winning_rate_df = pd.read_sql_query(
    """
select team, year_month, win_percentage
from (select home                                                                                      as team,
             strftime('%Y-%m', time)                                                                   as year_month,
             curr_season_win_pct                                                                       as win_percentage,
             row_number() over (partition by home, strftime('%Y-%m', time) order by time desc)         as game_rank
      from game_data)
where game_rank = 1;
""",
    con,
)
winning_rate_df = winning_rate_df.reset_index()
winning_rate_df = winning_rate_df[winning_rate_df["team"].isin(Correspondence.keys())].copy()

# "YYYY-MM" strings -> month-start timestamps, matching all_team_results.year_month
winning_rate_df["year_month"] = pd.to_datetime(winning_rate_df["year_month"])
occupancy_rate_df["year_month"] = pd.to_datetime(occupancy_rate_df["year_month"])

# Translate the database team names through Correspondence.
winning_rate_df["team"] = winning_rate_df["team"].apply(lambda x: Correspondence[x])
occupancy_rate_df["team"] = occupancy_rate_df["home"].apply(lambda x: Correspondence[x])

winning_rate_df.head()

In [None]:
# Add winning percentages and occupancy percentages for every team to all team results df
all_team_results = all_team_results.merge(winning_rate_df.drop(["index"], axis=1), on=["team", "year_month"], how="inner")
all_team_results = all_team_results.merge(occupancy_rate_df.drop(["index", "home"], axis=1), on=["team", "year_month"], how="inner")

# Save to csv since computation is expensive
all_team_results.to_csv("data/all_team_results_with_winning_percentages.csv")

all_team_results.head()

In [None]:
# add a new column that is the mean of view_counts per year_month
all_team_results["mean_view_count"] = all_team_results.groupby("year_month")["view_count"].transform("mean")
all_team_results["std_view_count"] = all_team_results.groupby("year_month")["view_count"].transform("std").fillna(1)

all_team_results["view_count_normalized"] = (all_team_results["view_count"] - all_team_results["mean_view_count"]) / all_team_results[
    "std_view_count"
]
all_team_results["year"] = all_team_results["year_month"].dt.year

In [None]:
# compute correlation between view_count and win_percentage for both big and small market teams
grouped_views_and_wins_df = all_team_results.groupby("market_size").agg(
    {
        "view_count_normalized": lambda x: list(x),
        "win_percentage": lambda x: list(x),
    }
)

correlations = {}

for market_size in ["small", "big"]:
    df = grouped_views_and_wins_df[grouped_views_and_wins_df.index == market_size]

    correlations[market_size] = pearsonr(df["view_count_normalized"].iloc[0], df["win_percentage"].iloc[0])


correlations