In [None]:
import sqlite3

import dask.array as da
import dask.bag as db
import dask.dataframe as dd
import numpy as np
import pandas as pd
import seaborn as sns
from dask.distributed import Client
from matplotlib import pyplot as plt
from scipy.stats import pearsonr

from utils.constants import BigMarkets, Correspondence, SmallMarkets

In [None]:
# Start a Dask distributed client with 6 workers; the dask computations
# (persist/compute) in the cells below are run through it.
client = Client(n_workers=6)
# Bare expression: the notebook renders the client's summary.
client

# Big Market vs Small Market Teams Case Study

<div class="alert alert-block alert-success">
    In this notebook, we analyze the relationship that may exist between the results of a team and its popularity on YouTube. We define the popularity of a team as its view ratio: the views of that team's videos compared to all the views generated by basketball and NBA videos during the same month. To get a more interesting and concrete view of this subject, we decided to conduct our analysis on two separate sets of teams.<br> 
The first set corresponds to the <b>big market teams</b>. These teams usually represent big or very dense cities, such as the New York Knicks, the Los Angeles Lakers or the Chicago Bulls. These teams represent a big market as they are the most supported at the national and international level. <br>
The second set represents the <b> small market teams </b>, these teams are usually less supported by the NBA fans as they represent cities that are less demographically dense or where basketball is not very popular.

<div class="alert alert-block alert-info">
Our first step is to prepare the dataframe on which we will conduct our analysis. We then load our data in parquet format and filter the categories of videos that are the most insightful for us. This step permits us to focus on the videos that are the most interesting for our research, remove noise from irrelevant categories and make our queries faster. <br>
Therefore, we filter our videos to only keep categories Sports, People & Blogs and Entertainment.

In [None]:
# Load the video metadata from its parquet directory as a Dask dataframe and
# make sure `upload_date` is a datetime column, since the cells below use its
# `.dt` accessor.
video_metadata_df = dd.read_parquet("./data/video_metadata/parquet/")
video_metadata_df = video_metadata_df.assign(
    upload_date=dd.to_datetime(video_metadata_df["upload_date"])
)

In [None]:
# Keep only the three categories that are relevant for the basketball analysis.
in_kept_category = video_metadata_df["categories"].isin(["Sports", "People & Blogs", "Entertainment"])
video_metadata_df_sports = video_metadata_df[in_kept_category]

# Normalise the tags: lower-case them, then wrap the whole string in commas so
# that a single tag can later be matched exactly as ",tag,".
# NOTE(review): presumably `tags` is one comma-separated string per video - confirm.
video_metadata_df_sports["tags"] = (
    video_metadata_df_sports["tags"].str.lower().apply(lambda tag_string: f",{tag_string},")
)

In [None]:
# Count the retained videos per (upload year, upload month, category).
# Every grouping key is taken from the filtered frame itself, so the keys and
# the grouped rows always describe exactly the same set of videos.
date_obj = video_metadata_df_sports.upload_date.dt
group_by = [date_obj.year, date_obj.month, video_metadata_df_sports.categories]

# persist() keeps the aggregated counts on the cluster as a Dask collection.
video_metadata_df_sports_grouped = video_metadata_df_sports[["upload_date"]].groupby(by=group_by).count().persist()

In [None]:
# Preview the first rows of the category-filtered frame (tags already normalised).
video_metadata_df_sports.head()

<div class="alert alert-block alert-info">
To only keep the videos that are related to our main subject, we filter another time on the tags of the videos and remove all videos that do not contain the tag "nba" or "basketball".

In [None]:
required_tags = ["nba", "basketball"]

# Regex alternation of comma-delimited tags (",nba,|,basketball,"): a video is
# kept when it carries AT LEAST ONE of the required tags as a whole tag.
required_tag_pattern = "|".join(f",{tag}," for tag in required_tags)
has_required_tag = video_metadata_df_sports["tags"].str.contains(required_tag_pattern)
nba_basketball_df = video_metadata_df_sports[has_required_tag].persist()

# Month of upload as a timestamp, built from a "<year>-<month>" string.
upload_dt = nba_basketball_df.upload_date.dt
nba_basketball_df["year_month"] = dd.to_datetime(upload_dt.year.astype("str") + "-" + upload_dt.month.astype("str"))

In [None]:
# Preview the NBA/basketball videos, including the derived `year_month` column.
nba_basketball_df.head()

<div class="alert alert-block alert-info">
We import our dictionaries of big and small market teams from our helper file. 

In [None]:
# Display the big-market mapping (team key -> tags used to match that team's videos).
BigMarkets

In [None]:
# Display the small-market mapping (team key -> tags used to match that team's videos).
SmallMarkets

<div class="alert alert-block alert-info">
We group the videos by their respective team, sum the number of views for each team on a monthly basis, label each team with its market size and finally calculate its view ratio compared to all NBA/Basketball views. 

In [None]:
# Total views of all NBA/basketball videos for each month. Despite the
# "per_year" in its name, this frame is indexed by month (`year_month`).
# compute() already materialises the result as a pandas frame, so no
# intermediate persist() is needed.
nba_basketball_views_per_year_df = (
    nba_basketball_df[["year_month", "view_count"]]
    .groupby("year_month")
    .sum()
    .compute()
    .rename(columns={"view_count": "total_nba_views"})
)

dd_dict = {}

# Total amount of views per month for every big-market and small-market team.
for key, value in {**BigMarkets, **SmallMarkets}.items():
    # A video is attributed to a team when at least one of the team's tags
    # appears as a whole tag, i.e. enclosed in commas: ",tag1,|,tag2,".
    team_tag_pattern = f',{",|,".join(value)},'

    team_df = nba_basketball_df[nba_basketball_df["tags"].str.contains(team_tag_pattern)]
    team_df = team_df[["year_month", "view_count"]].groupby("year_month").sum()
    team_df = team_df.reset_index()

    team_df["team"] = key
    team_df["market_size"] = "small" if key in SmallMarkets else "big"

    # Materialise as a pandas frame; the per-team frames are concatenated later.
    team_df = team_df.compute()

    dd_dict[key] = team_df

In [None]:
# Stack the per-team monthly frames and attach, for every month, the total
# number of NBA/basketball views.
all_team_results = pd.concat(list(dd_dict.values())).merge(
    nba_basketball_views_per_year_df, on="year_month"
)

# Team's share of all NBA/basketball views in that month. Despite the
# "percentage" in the name, the value is a plain ratio (not multiplied by 100).
all_team_results["total_nba_view_percentage"] = all_team_results["view_count"].div(
    all_team_results["total_nba_views"]
)

<div class="alert alert-block alert-info">
To compare our view ratio with the teams' results, we import from our database the monthly occupancy rate of the stadium of each team and the winning rate on the matches since the beginning of each season.

In [None]:
# Open the SQLite database used by the SQL queries in the next cell.
# NOTE(review): the connection is not closed in the cells visible here.
con = sqlite3.connect("data/nba_api.sqlite")

In [None]:
# Monthly home attendance per team.
# `* 100.0` forces floating-point arithmetic: if both attendance columns are
# stored as INTEGER, `attendance / season_high_attendance` would be an integer
# division in SQLite and truncate the ratio.
# NOTE: the column alias `avg_attendace` (sic) is kept unchanged because it
# ends up in the exported results.
occupancy_rate_df = pd.read_sql_query(
    """
select home,
       avg(attendance)                                            as avg_attendace,
       round(avg(attendance * 100.0 / season_high_attendance), 2) as avg_occupancy_rate,
       strftime('%Y-%m', time)                                    as year_month
from game_data
group by home, year_month
""",
    con,
)
# reset_index() adds an "index" column; the merge cell further down drops it.
occupancy_rate_df = occupancy_rate_df.reset_index()

# Show every home-team name found in the database, before restricting to the
# names that Correspondence knows about.
display(occupancy_rate_df.home.unique())

# .copy() so that the column assignments below act on an independent frame
# rather than on a slice (avoids SettingWithCopyWarning).
occupancy_rate_df = occupancy_rate_df[occupancy_rate_df["home"].isin(Correspondence.keys())].copy()


# Win percentage recorded at the LAST home game of each (team, month).
# Games are ranked inside each (home, month) partition by descending time and
# only the most recent one is kept. Combining a window function with GROUP BY
# would not do this: the window runs after grouping, on a single arbitrary
# row per group.
winning_rate_df = pd.read_sql_query(
    """
select team, year_month, win_percentage
from (select home                    as team,
             strftime('%Y-%m', time) as year_month,
             curr_season_win_pct     as win_percentage,
             row_number() over (partition by home, strftime('%Y-%m', time)
                                order by time desc) as game_rank
      from game_data)
where game_rank = 1
""",
    con,
)
winning_rate_df = winning_rate_df.reset_index()
winning_rate_df = winning_rate_df[winning_rate_df["team"].isin(Correspondence.keys())].copy()

# "YYYY-MM" strings -> timestamps, so they can be merged with the YouTube data.
winning_rate_df["year_month"] = pd.to_datetime(winning_rate_df["year_month"])
occupancy_rate_df["year_month"] = pd.to_datetime(occupancy_rate_df["year_month"])

In [None]:
# Translate the database team names into the team keys used on the YouTube
# side, so both sources can be merged on "team". Both frames were restricted
# to names present in Correspondence, so every lookup succeeds on a fresh run.
winning_rate_df["team"] = winning_rate_df["team"].map(Correspondence.__getitem__)
occupancy_rate_df["team"] = occupancy_rate_df["home"].map(Correspondence.__getitem__)

In [None]:
# Preview the monthly win percentages after the team names were translated.
winning_rate_df.head()

In [None]:
# Attach the monthly win percentage; the inner join keeps only the
# (team, month) pairs present in both sources.
win_columns = winning_rate_df.drop(columns=["index"])
all_team_results = all_team_results.merge(win_columns, how="inner", on=["team", "year_month"])
display(all_team_results.sort_values("year_month"))

# Attach the monthly attendance figures in the same way.
occupancy_columns = occupancy_rate_df.drop(columns=["index", "home"])
all_team_results = all_team_results.merge(occupancy_columns, how="inner", on=["team", "year_month"])
all_team_results

In [None]:
# Save the merged per-team, per-month results as CSV.
# The frame's index is written too, as the first (unnamed) column.
all_team_results.to_csv("data/all_team_results.csv")

 <div class="alert alert-block alert-info">

Hypothesis : Is the correlation between results and popularity bigger for big market teams than for small market teams? 

In [None]:
def _view_win_correlation(results, team):
    """Pearson correlation between a team's monthly view share and win percentage.

    Parameters
    ----------
    results : pandas.DataFrame
        Frame with the columns "team", "total_nba_view_percentage" and
        "win_percentage".
    team : str
        Team key to select in the "team" column.

    Returns
    -------
    float
        The Pearson correlation coefficient, or NaN when the team has fewer
        than two rows (pearsonr raises in that case).
    """
    team_rows = results[results["team"] == team]
    if len(team_rows) < 2:
        return float("nan")
    return pearsonr(team_rows["total_nba_view_percentage"], team_rows["win_percentage"]).statistic


# One coefficient per team, keyed by team, for each market size.
small_market_correlations = {team: _view_win_correlation(all_team_results, team) for team in SmallMarkets}

big_market_correlations = {team: _view_win_correlation(all_team_results, team) for team in BigMarkets}

In [None]:
# Display the view-share / win-percentage correlation of each big-market team.
big_market_correlations

In [None]:
# Display the view-share / win-percentage correlation of each small-market team.
small_market_correlations

In [None]:
# Rows of team "LA" in chronological order.
# NOTE(review): presumably "LA" is the Lakers' team key - confirm in utils.constants.
# The string "2010" is compared as a timestamp, so only months strictly after
# the start of 2010 are kept.
is_la_team = all_team_results["team"] == "LA"
is_after_2010 = all_team_results["year_month"] > "2010"
lakers_df = all_team_results[is_la_team & is_after_2010].sort_values(by="year_month")

In [None]:
# Monthly win percentage of team "LA".
sns.set(rc={"figure.figsize": (11, 4)})
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})

# Create the axes explicitly and hand them to seaborn, so that title and
# labels are set on the figure that is actually drawn.
fig, ax = plt.subplots()
sns.lineplot(x="year_month", y="win_percentage", data=lakers_df, color="r", marker="x", ax=ax)
ax.set(title="LA: win percentage per month", xlabel="Month", ylabel="Win percentage")

plt.show()

In [None]:
# Monthly share of all NBA/basketball views obtained by team "LA".
sns.set(rc={"figure.figsize": (11, 4)})
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})

# Create the axes explicitly and hand them to seaborn, so that title and
# labels are set on the figure that is actually drawn.
fig, ax = plt.subplots()
sns.lineplot(x="year_month", y="total_nba_view_percentage", data=lakers_df, color="b", ax=ax)
ax.set(title="LA: share of NBA/basketball views per month", xlabel="Month", ylabel="Share of NBA/basketball views")

plt.show()