In [None]:
import sqlite3

import dask.array as da
import dask.bag as db
import dask.dataframe as dd
import numpy as np
import pandas as pd
import seaborn as sns
from dask.distributed import Client
from matplotlib import pyplot as plt
from scipy.stats import pearsonr

from utils.constants import BigMarkets, SmallMarkets, Correspondence

In [None]:
# Create a Dask distributed client with six workers; the dask computations below run through it.
# NOTE(review): the worker count is hard-coded — confirm it suits the machine running the notebook.
client = Client(n_workers=6)
# Bare last expression so the notebook renders the client summary.
client

# Big Market vs Small Market Teams Case Study

In [None]:
# Lazily read the video metadata parquet file and convert `upload_date` to a
# datetime column so the `.dt` accessors used in later cells work.
video_metadata_df = dd.read_parquet("./data/video_metadata/parquet/0.parquet")
video_metadata_df = video_metadata_df.assign(upload_date=dd.to_datetime(video_metadata_df["upload_date"]))

In [None]:
# Restrict the metadata to the three categories kept for this study, then normalise
# the tag string: lower-case it and wrap it in commas so that a whole tag can be
# matched later as ",tag," without hitting substrings of longer tags.
kept_categories = ["Sports", "People & Blogs", "Entertainment"]
in_kept_category = video_metadata_df["categories"].isin(kept_categories)
video_metadata_df_sports = video_metadata_df[in_kept_category]

lowercase_tags = video_metadata_df_sports["tags"].str.lower()
video_metadata_df_sports["tags"] = lowercase_tags.apply(lambda tag_string: f",{tag_string},")

In [None]:
# video_metadata_df_sports = video_metadata_df_sports[video_metadata_df_sports['tags'].str.contains('basketball')]

# Count uploads per (year, month, category) of the filtered frame.
# The grouping keys are materialised as explicitly named columns of the SAME frame
# that is being grouped: taking `categories` from the unfiltered `video_metadata_df`
# mixes two different frames, and grouping by two unnamed `.dt` series gives two
# keys that are both called "upload_date".
date_obj = video_metadata_df_sports.upload_date.dt
uploads_with_period = video_metadata_df_sports.assign(
    upload_year=date_obj.year,
    upload_month=date_obj.month,
)
group_by = ["upload_year", "upload_month", "categories"]

# `[["upload_date"]]` keeps the result a one-column DataFrame of counts;
# `.persist()` starts the computation on the cluster and keeps the result there.
video_metadata_df_sports_grouped = uploads_with_period.groupby(by=group_by)[["upload_date"]].count().persist()

In [None]:
# Preview the first rows of the category-filtered metadata (tags are lower-cased and comma-wrapped at this point).
video_metadata_df_sports.head()

In [None]:
# Keep only the videos tagged with at least one of the required tags. Tags were
# wrapped in commas earlier, so each alternative is matched as ",tag," (whole tag).
required_tags = ["nba", "basketball"]
required_tags_pattern = "|".join(f",{tag}," for tag in required_tags)
has_required_tag = video_metadata_df_sports["tags"].str.contains(required_tags_pattern)
nba_basketball_df = video_metadata_df_sports[has_required_tag].persist()

# Monthly bucket: build a "<year>-<month>" string from the upload date and parse it
# back into a datetime.
upload_year = nba_basketball_df.upload_date.dt.year.astype("str")
upload_month = nba_basketball_df.upload_date.dt.month.astype("str")
nba_basketball_df["year_month"] = dd.to_datetime(upload_year + "-" + upload_month)

In [None]:
# Preview the NBA/basketball subset, including the derived `year_month` column.
nba_basketball_df.head()

In [None]:
# Show the big-market mapping imported from utils.constants (iterated below as team key -> list of tags).
BigMarkets

In [None]:
# Total views of all NBA/basketball videos per `year_month` bucket, as a pandas frame
# indexed by `year_month`. Despite the variable name, the granularity is monthly.
# `.compute()` alone is enough here: the result is pulled into pandas straight away,
# so persisting it on the cluster first only holds memory that is never reused.
nba_basketball_views_per_year_df = (
    nba_basketball_df[["year_month", "view_count"]]
    .groupby("year_month")
    .sum()
    .compute()
    .rename(columns={"view_count": "total_nba_views"})
)

# Maps each team key to a pandas frame of its monthly view totals.
dd_dict = {}

# Getting the total amount of views per month for every big market and small market team
for key, value in {**BigMarkets, **SmallMarkets}.items():
    # `value` is the list of tags identifying the team; a video counts for the team
    # when any of them appears as a whole comma-delimited tag.
    # NOTE(review): the tags are used as a regular expression without escaping —
    # confirm none of them contains regex metacharacters.
    team_tags_pattern = f',{",|,".join(value)},'
    team_videos = nba_basketball_df[nba_basketball_df["tags"].str.contains(team_tags_pattern)]

    monthly_views = team_videos[["year_month", "view_count"]].groupby("year_month").sum().reset_index()
    monthly_views["team"] = key
    monthly_views["market_size"] = "small" if key in SmallMarkets else "big"

    # The result is materialised into pandas immediately, so no `.persist()` is needed first.
    dd_dict[key] = monthly_views.compute()

In [None]:
# Stack the per-team monthly frames, attach the league-wide monthly total to every
# row, and derive each team's share of the NBA views for that month.
# Note: despite its name, `total_nba_view_percentage` is a fraction (not multiplied by 100).
team_frames = list(dd_dict.values())
all_team_results = pd.concat(team_frames).merge(nba_basketball_views_per_year_df, on="year_month")

team_views = all_team_results["view_count"]
league_views = all_team_results["total_nba_views"]
all_team_results["total_nba_view_percentage"] = team_views / league_views

In [None]:
# Spot check: the 50 most recent monthly rows for the team keyed "UT".
all_team_results[all_team_results["team"] == "UT"].sort_values("year_month", ascending=False).head(50)

In [None]:
# Open the game database read-only through a SQLite URI. A plain
# `sqlite3.connect(path)` silently creates an empty database when the path is wrong,
# which only surfaces later as a confusing "no such table" error; `mode=ro` fails
# immediately instead and also protects the data file from accidental writes.
con = sqlite3.connect("file:data/nba_api.sqlite?mode=ro", uri=True)

In [None]:
# Average attendance and occupancy rate per home team and calendar month.
# The occupancy ratio multiplies by 100.0 BEFORE dividing: SQLite's `/` truncates
# when both operands are integers, so `attendance/season_high_attendance*100` would
# collapse to 0 or 100 if the columns are stored as integers.
# The misspelled column name `avg_attendace` is kept on purpose because it flows
# into `all_team_results` and the exported CSV.
occupancy_rate_df = pd.read_sql_query(
    """select home, avg(attendance) as avg_attendace, round(avg(attendance * 100.0 / season_high_attendance),2) as avg_occupancy_rate, strftime('%Y-%m',time) as year_month
from game_data
group by home, year_month""",
    con,
)
# Adds an "index" column; the merge cell further down drops it again.
occupancy_rate_df = occupancy_rate_df.reset_index()

# Show the raw home-team identifiers before filtering.
display(occupancy_rate_df.home.unique())

# Keep only the teams that have an entry in `Correspondence`. `.copy()` makes the
# result an independent frame so later column assignments do not trigger
# SettingWithCopyWarning.
occupancy_rate_df = occupancy_rate_df[occupancy_rate_df["home"].isin(Correspondence.keys())].copy()


# Season win percentage of each home team as of its LAST home game in each calendar
# month. The inner query ranks the games of every (team, month) from most recent to
# oldest and the outer query keeps rank 1, which makes the choice of row explicit and
# deterministic. Combining a window function with GROUP BY does not do this: the
# window then runs over the already-collapsed groups, so the value comes from an
# arbitrary row of the month.
winning_rate_df = pd.read_sql_query(
    """
select team, year_month, win_percentage
from (select home                    as team,
             strftime('%Y-%m', time) as year_month,
             curr_season_win_pct     as win_percentage,
             row_number() over (partition by home, strftime('%Y-%m', time) order by time desc) as recency_rank
      from game_data)
where recency_rank = 1
order by team, year_month;
""",
    con,
)
# Adds an "index" column; the merge cell further down drops it again.
winning_rate_df = winning_rate_df.reset_index()
# Keep only the teams that have an entry in `Correspondence`. `.copy()` makes the
# result an independent frame so later column assignments do not trigger
# SettingWithCopyWarning.
winning_rate_df = winning_rate_df[winning_rate_df["team"].isin(Correspondence.keys())].copy()

# Parse the 'YYYY-MM' strings produced by the SQL queries into datetimes so both
# frames can be merged with `all_team_results` on `year_month`.
for monthly_stats_df in (winning_rate_df, occupancy_rate_df):
    monthly_stats_df["year_month"] = pd.to_datetime(monthly_stats_df["year_month"])

In [None]:
# Translate the database team identifiers into the team keys used by the view data,
# via the `Correspondence` mapping.
# NOTE(review): the first line overwrites `team` in place, so running this cell twice
# looks up already-translated keys — confirm whether that can raise KeyError.
def _to_team_key(db_team_name):
    """Return the team key that `Correspondence` assigns to a database team name."""
    return Correspondence[db_team_name]


winning_rate_df["team"] = winning_rate_df["team"].map(_to_team_key)
occupancy_rate_df["team"] = occupancy_rate_df["home"].map(_to_team_key)

In [None]:
# Preview the monthly win-percentage table after the team keys were translated.
winning_rate_df.head()

In [None]:
# Preview the monthly attendance / occupancy table after the `team` column was added.
occupancy_rate_df.head()

In [None]:
# Spot check: win percentages of all teams for the March 2006 bucket.
winning_rate_df[winning_rate_df["year_month"] == "2006-03-01"]

In [None]:
# Enrich the monthly view shares with the win percentage and then with the
# attendance figures. Both joins are inner joins on (team, year_month), so months
# missing from either table are dropped.
# NOTE: this cell rebinds `all_team_results`; re-running it merges the same columns again.
win_rates = winning_rate_df.drop(columns=["index"])
all_team_results = all_team_results.merge(win_rates, how="inner", on=["team", "year_month"])
display(all_team_results.sort_values("year_month"))

occupancy_rates = occupancy_rate_df.drop(columns=["index", "home"])
all_team_results = all_team_results.merge(occupancy_rates, how="inner", on=["team", "year_month"])
display(all_team_results)

In [None]:
# Export the merged per-team, per-month table. The DataFrame index is written as the
# first CSV column because `index=False` is not passed.
all_team_results.to_csv("data/all_team_results.csv")

In [None]:
# Show the 50 earliest monthly rows of the merged table.
all_team_results.sort_values("year_month").head(50)

How can we define the popularity of a team? We base it on the team's share of NBA video views and on its arena occupancy rate.

 Hypothesis: Is the correlation between winning percentage and popularity stronger for big-market teams than for small-market teams?
 
 
 
Big-market teams are followed all around the world, and many fans have developed a loyalty to them. Our assumption is that this loyalty allows big-market teams to remain popular even with worse results, compared to small-market teams.

In [None]:
def _view_share_win_correlations(teams):
    """Pearson correlation between view share and win percentage, per team.

    Parameters
    ----------
    teams : iterable of team keys (iterating a market dict yields its keys).

    Returns
    -------
    dict mapping each team key to the Pearson r between
    `total_nba_view_percentage` and `win_percentage` over that team's monthly rows
    in `all_team_results`. Rows with a missing value in either column are ignored;
    a team with fewer than two usable rows gets NaN instead of making `pearsonr` raise.
    """
    correlations = {}
    for team in teams:
        observations = all_team_results.loc[
            all_team_results["team"] == team,
            ["total_nba_view_percentage", "win_percentage"],
        ].dropna()
        if len(observations) < 2:
            # pearsonr needs at least two points; record "undefined" and move on.
            correlations[team] = float("nan")
            continue
        correlations[team] = pearsonr(
            observations["total_nba_view_percentage"], observations["win_percentage"]
        ).statistic
    return correlations


small_market_correlations = _view_share_win_correlations(SmallMarkets)
big_market_correlations = _view_share_win_correlations(BigMarkets)

In [None]:
# Per-team Pearson correlations (view share vs. win percentage) for the big-market teams.
big_market_correlations

In [None]:
# Per-team Pearson correlations (view share vs. win percentage) for the small-market teams.
small_market_correlations

In [None]:
# Mean of the per-team correlations across big-market teams (returned as a one-element Series).
pd.DataFrame(big_market_correlations.values()).mean()

In [None]:
# Mean of the per-team correlations across small-market teams (returned as a one-element Series).
pd.DataFrame(small_market_correlations.values()).mean()

In [None]:
# Monthly rows for the team keyed "LA" (presumably the Lakers, per the variable name —
# verify against utils.constants), restricted to months strictly after the
# timestamp "2010" and ordered chronologically.
is_la_team = all_team_results["team"] == "LA"
is_after_2010 = all_team_results["year_month"] > "2010"
lakers_df = all_team_results[is_la_team & is_after_2010].sort_values("year_month")

In [None]:
# lakers_df = lakers_df[['year_month','total_nba_view_percentage','win_percentage']].groupby(lakers_df.year_month.dt.year).agg({'win_percentage':'last','total_nba_view_percentage':'mean'})

In [None]:
# Line plot of the win percentage over time for the team in `lakers_df`.
sns.set(rc={"figure.figsize": (11, 4)})
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})

# Create the axes explicitly (the former `ax = plt.subplot` only bound the function
# object without calling it) and hand them to seaborn.
fig, ax = plt.subplots()
sns.lineplot(x="year_month", y="win_percentage", data=lakers_df, color="r", marker="x", ax=ax)
ax.set(title="Win percentage per month", xlabel="Month", ylabel="Win percentage")

plt.show()

In [None]:
# Line plot of the share of NBA video views over time for the team in `lakers_df`.
sns.set(rc={"figure.figsize": (11, 4)})
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})

# Create the axes explicitly (the former `ax = plt.subplot` only bound the function
# object without calling it) and hand them to seaborn.
fig, ax = plt.subplots()
sns.lineplot(x="year_month", y="total_nba_view_percentage", data=lakers_df, color="b", ax=ax)
ax.set(title="Share of NBA video views per month", xlabel="Month", ylabel="Share of NBA views")

plt.show()

After trying to draw conclusions from a team's winning rate over the current month, we discovered that these values were not representative of the team's real level at that time. For example, a strong team could have faced only weak teams during a given month; it would then have a high winning rate but a low view percentage, because those games are not decisive and therefore not interesting. Therefore, we decided to compute the winning rate as the share of wins over the last 10 games that a team has played.