In [None]:
import sqlite3

import dask.dataframe as dd
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from dask.distributed import Client
from matplotlib import pyplot as plt
from scipy.stats import pearsonr

from utils.constants import BigMarkets, Correspondence, MediumMarkets, SmallMarkets

In [None]:
# Start a Dask distributed client with a fixed number of workers and display its summary.
N_WORKERS = 6
client = Client(n_workers=N_WORKERS)
client

# NBA videos dataset preparation 

In [None]:
# Load the video metadata from the Parquet dataset as a Dask DataFrame and
# convert the upload dates to datetimes so the `.dt` accessor can be used later.
video_metadata_df = dd.read_parquet("./data/video_metadata/parquet/")
video_metadata_df["upload_date"] = dd.to_datetime(video_metadata_df["upload_date"])

video_metadata_df.head(5)

In [None]:
# Keep only the videos whose category is "Sports", "People & Blogs" or "Entertainment".
SELECTED_CATEGORIES = ["Sports", "People & Blogs", "Entertainment"]
in_selected_category = video_metadata_df["categories"].isin(SELECTED_CATEGORIES)
video_metadata_df_sports = video_metadata_df[in_selected_category]

# Lower-case the tag string and wrap it in commas, so that a tag can later be
# matched as a whole with a ",<tag>," pattern instead of as a substring.
video_metadata_df_sports["tags"] = (
    video_metadata_df_sports["tags"].str.lower().apply(lambda tag_string: f",{tag_string},")
)

In [None]:
# Number of uploaded videos per (year, month, category) among the selected categories.
date_obj = video_metadata_df_sports["upload_date"].dt
# All grouping keys are taken from the filtered frame, so the keys and the
# rows being counted always refer to the same set of videos.
group_by = [date_obj.year, date_obj.month, video_metadata_df_sports["categories"]]

video_metadata_df_sports_grouped = video_metadata_df_sports[["upload_date"]].groupby(by=group_by).count().persist()

In [None]:
# Keep the videos carrying at least one of the required tags: the regular
# expression below is an alternation (",nba," OR ",basketball,").
required_tags = ["nba", "basketball"]
required_tags_pattern = f',{",|,".join(required_tags)},'
has_required_tag = video_metadata_df_sports["tags"].str.contains(required_tags_pattern)
nba_basketball_df = video_metadata_df_sports[has_required_tag].persist()

# "year_month" is a datetime built from a "<year>-<month>" string of the upload date.
upload_dt = nba_basketball_df.upload_date.dt
nba_basketball_df["year_month"] = dd.to_datetime(upload_dt.year.astype("str") + "-" + upload_dt.month.astype("str"))

In [None]:
# Preview the first rows of the subset of videos tagged "nba" or "basketball".
nba_basketball_df.head()

# NBA teams media exposure analysis

## Media coverage of each market on YouTube

### Number of videos per team 

In [None]:
# Display the big-market mapping imported from utils.constants
# (its keys are used as team names and its values as tag lists further down).
BigMarkets

In [None]:
# Display the medium-market mapping imported from utils.constants
# (its keys are used as team names and its values as tag lists further down).
MediumMarkets

In [None]:
# Display the small-market mapping imported from utils.constants
# (its keys are used as team names and its values as tag lists further down).
SmallMarkets

In [None]:
dd_dict = {}

# For every team, select the videos whose tags contain at least one of the
# team's tags, label them with the team name and its market size, and
# materialise the result as a pandas DataFrame.
for key, value in {**BigMarkets, **MediumMarkets, **SmallMarkets}.items():
    if key in SmallMarkets:
        market_size = "small"
    elif key in MediumMarkets:
        market_size = "medium"
    else:
        market_size = "big"

    # Tags are stored wrapped in commas, so ",<tag>," only matches whole tags.
    team_tag_pattern = f',{",|,".join(value)},'
    team_df = nba_basketball_df[nba_basketball_df["tags"].str.contains(team_tag_pattern)]
    # compute() alone is enough to obtain the pandas result; no persist() is needed first.
    team_df = team_df.assign(team=key, market_size=market_size).compute()

    dd_dict[key] = team_df

# One row per (video, matching team): a video matching several teams appears once per team.
all_team_df = pd.concat(list(dd_dict.values()))

In [None]:
def _season_label(month_start):
    """Return the "<start year>-<end year>" season label of a timestamp.

    Months before October are assigned to the season that started the
    previous calendar year; October to December open a new season.
    """
    year = month_start.year
    if month_start.month < 10:
        return f"{year - 1}-{year}"
    return f"{year}-{year + 1}"


# Order the videos chronologically and attach the season each one belongs to.
all_team_df = all_team_df.sort_values("year_month")
all_team_df["season"] = all_team_df["year_month"].apply(_season_label)

In [None]:
# Display the combined per-team video table (one row per video/team match).
all_team_df

In [None]:
# Total number of videos per (team, market size); the count of titles is used as the video count.
videos_per_team_df = (
    all_team_df[["team", "market_size", "title"]]
    .groupby(["team", "market_size"])
    .count()
    .rename(columns={"title": "total_videos"})
)

In [None]:
# Quick bar chart of the video totals, teams ordered from fewest to most videos.
total_videos_ascending = videos_per_team_df["total_videos"].sort_values()
total_videos_ascending.plot.bar()

In [None]:
# TODO: change the colour scheme.

# Interactive bar chart of the number of videos per team, coloured by market size.
videos_per_team_sorted = videos_per_team_df.sort_values("total_videos").reset_index()
fig = px.bar(
    videos_per_team_sorted,
    x="team",
    y="total_videos",
    color="market_size",
    labels={"team": "Team", "total_videos": "Total Number of Videos"},
    width=800,
)
fig.update_layout(
    title={"text": "Number of Videos per Team", "x": 0.5, "xanchor": "center"},
    xaxis_categoryorder="total ascending",
)
fig.show()

In [None]:
# Print, for every season, the number of videos per team in ascending order.
# The per-season result gets its own name so that the overall
# `videos_per_team_df` table is not overwritten by this loop.
for season in all_team_df["season"].unique():
    season_team_counts = all_team_df[all_team_df["season"] == season].groupby("team")["title"].count()
    print(season)
    print(season_team_counts.sort_values())

In [None]:
# Video totals per (team, market size), used for the stacked chart of teams within each market size.
stack_df = (
    all_team_df[["team", "market_size", "title"]]
    .groupby(["team", "market_size"])
    .count()
    .rename(columns={"title": "total_videos"})
)
stack_df

In [None]:
# Stacked bars: one bar per market size, one coloured segment per team,
# with the teams passed to Plotly from most to fewest videos.
# (Per-season animation and custom hover data are intentionally left disabled.)
stacked_bar_data = stack_df.sort_values("total_videos", ascending=False).reset_index()
fig = px.bar(
    stacked_bar_data,
    x="market_size",
    y="total_videos",
    color="team",
    barmode="stack",
    color_discrete_sequence=px.colors.qualitative.Plotly,
)
fig

### Number of videos per market 

In [None]:
# Total number of videos per market size; the count of titles is used as the video count.
videos_per_market_df = (
    all_team_df[["market_size", "title"]]
    .groupby(["market_size"])
    .count()
    .rename(columns={"title": "total_videos"})
)

In [None]:
# Quick bar chart of the video totals per market size.
videos_per_market_df["total_videos"].plot.bar()

In [None]:
# TODO: change the colour scheme.

# Interactive bar chart of the number of videos per market size.
videos_per_market_flat = videos_per_market_df.reset_index()
fig = px.bar(
    videos_per_market_flat,
    x="market_size",
    y="total_videos",
    color="market_size",
    labels={"market_size": "Market Size", "total_videos": "Total Number of Videos"},
    width=400,
)
fig.update_layout(title={"text": "Number of Videos per Market Size", "x": 0.5, "xanchor": "center"})
fig.show()

## Number of channels per team 

In [None]:
# Thresholds used to decide which channels count as "fan-base" channels.
MIN_CHANNEL_VIDEOS = 4  # a channel needs more than this many team-tagged videos to be considered
FANBASE_MIN_RATIO = 0.51  # ... and more than this share of them must be about a single team

# Number of videos per (channel, team).
channel_team_df = (
    all_team_df.groupby(["channel_id", "team"])["title"]
    .count()
    .reset_index(name="team_videos")
)

# Total number of team-tagged videos per channel. Only the count column is
# summed: summing the whole frame would also "sum" the string `team` column
# on recent pandas versions and break the merge below.
count_by_channel = (
    channel_team_df.groupby("channel_id")["team_videos"]
    .sum()
    .reset_index(name="total_videos")
)

# Share of each channel's videos that is dedicated to each team.
ratio_per_team = channel_team_df.merge(count_by_channel, on="channel_id")
ratio_per_team["ratio"] = ratio_per_team["team_videos"] / ratio_per_team["total_videos"]

# Only keep channels that uploaded a significant number of videos.
significant_channels_df = ratio_per_team[ratio_per_team["total_videos"] > MIN_CHANNEL_VIDEOS]

# Fan-base channels: channels where more than 51% of the videos are about a single team.
fanbase_channels_df = significant_channels_df[significant_channels_df["ratio"] > FANBASE_MIN_RATIO]

# Number of fan-base channels per team. After count() every remaining column
# holds that number, so the "ratio" column is reused downstream as the channel count.
fanbase_channels_df = fanbase_channels_df.groupby("team").count().sort_values("ratio").reset_index()

# Attach the market size of each team (one row per distinct team / market size pair).
# NOTE(review): the original author flagged this step as not clean and to be tidied up.
marketper_team_df = (
    all_team_df[["team", "market_size"]]
    .drop_duplicates()
    .sort_values(["team", "market_size"])
    .reset_index(drop=True)
)
fanbase_channels_df = fanbase_channels_df.merge(marketper_team_df, on="team")

In [None]:
# Bar chart of the number of fan-base channels per team, coloured by market size.
# The "ratio" column holds the per-team channel count at this point.
fig = px.bar(
    fanbase_channels_df,
    x="team",
    y="ratio",
    color="market_size",
    labels={"ratio": "Number of Fanbase channels", "team": "Team"},
)
fig.update_layout(
    title={"text": "Number of FanBase Channels per Team", "x": 0.5, "xanchor": "center"},
    xaxis_categoryorder="total ascending",
)
fig.show()

## The big name bias 

### Fans engagement 

In [None]:
# Fan engagement per market size, defined as likes per 1,000 views and
# computed from the summed like and view counts of each market size.
# numeric_only=True restricts the sum to numeric columns: the frame also holds
# text and datetime columns, which cannot be summed meaningfully.
engagement_df = all_team_df.groupby(["market_size"]).sum(numeric_only=True)
engagement_df["engagement"] = (engagement_df["like_count"] / engagement_df["view_count"]) * 1000

# NOTE: the y axis is restricted to [7, 10], so the bars do not start at zero.
fig = px.bar(
    engagement_df.sort_values("engagement").reset_index(),
    x="market_size",
    y="engagement",
    width=600,
    labels={"market_size": "Market Size", "engagement": "Engagement ratio (Likes/Views x 1000)"},
    range_y=[7, 10],
    color="market_size",
)
fig.update_layout(title={"text": "Engagement by Market Size", "x": 0.5, "xanchor": "center"})
fig.show()

In [None]:
# Fan engagement per team (likes per 1,000 views), plotted from lowest to highest.
# numeric_only=True restricts the sum to numeric columns: the frame also holds
# text and datetime columns, which cannot be summed meaningfully.
engagement_df = all_team_df.groupby(["team", "market_size"]).sum(numeric_only=True)
engagement_df["engagement"] = (engagement_df["like_count"] / engagement_df["view_count"]) * 1000
engagement_df["engagement"].sort_values().plot(kind="bar")