In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
tqdm.pandas()


category_sports = [
    "basketball",
    "wrestling",
    "soccer",
    "boxing",
    "hockey",
    "golf",
    "baseball",
]

## Load data

This data is generated by the code `calculate_user_embeddings.py`.


This code performs the following operations to represent each user as a vector:
1. For each user and each time period, the code lists the videos on which the user has posted comments.
2. Based on the tags assigned to these videos, the user is awarded “points.”
    - For example, if User A comments on a video tagged with “basketball,” the user receives 1 point for point_basketball.
    - If a video has multiple tags, the points are divided among the tags. For instance, if a video is tagged with both “soccer” and “boxing,” the user is awarded 0.5 points for point_soccer and 0.5 points for point_boxing.
3. The points for each sport are aggregated for each user and time period. Finally, these points are normalized to represent each user as a vector.

For details on the specific vectors, refer to the contents of the `df_user_embedding` variable.

In [10]:
df_user_embedding = pd.read_parquet("data/user_embedding_by_year_month_all.parquet")

## Preprocess

In [11]:
# add point sum column
df_user_embedding["point_sum"] = df_user_embedding[[f"point_{s}" for s in [*category_sports, "other"]]].sum(axis=1)

# select users with at least 10 comments
df_user_embedding = df_user_embedding[df_user_embedding["point_sum"] >= 10]

In [12]:
# normalize points
for s in [*category_sports, "other"]:
    df_user_embedding.loc[:, f"point_{s}"] = df_user_embedding.loc[:, f"point_{s}"] / df_user_embedding["point_sum"]
df_user_embedding = df_user_embedding.drop(columns=["point_sum"])

In [13]:
df_user_embedding

Unnamed: 0,author,year,month,point_basketball,point_wrestling,point_soccer,point_boxing,point_hockey,point_golf,point_baseball,point_other
41,269,2017,8,0.200000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.800000
119,617,2017,6,0.000000,0.818182,0.000000,0.000000,0.0,0.0,0.0,0.181818
488,2590,2018,12,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.1,0.900000
614,2990,2016,1,0.000000,1.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000
616,2990,2016,3,0.000000,1.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
116245546,576548979,2019,7,0.000000,0.066667,0.000000,0.200000,0.0,0.0,0.0,0.733333
116245547,576548979,2019,8,0.045455,0.000000,0.045455,0.181818,0.0,0.0,0.0,0.727273
116245741,576550259,2019,2,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,1.000000
116245856,576550811,2019,7,0.700000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.300000


## Analysis on 2018 FIFA World Cup

In [15]:
periods_event_before = [(2018, 4), (2018, 5)]
periods_event_during = [(2018, 6), (2018, 7)]
periods_event_after = [(2018, 8), (2018, 9)]


def is_during_event(year, month):
    return any((year, month) == event_period for event_period in periods_event_during)

def is_before_event(year, month):
    return any((year, month) == event_period for event_period in periods_event_before)

def is_after_event(year, month):
    return any((year, month) == event_period for event_period in periods_event_after)


df_user_embedding_before_event = df_user_embedding[df_user_embedding.progress_apply(lambda x: is_before_event(x["year"], x["month"]), axis=1)].drop(columns=["year", "month"]).groupby("author").mean()
df_user_embedding_during_event = df_user_embedding[df_user_embedding.progress_apply(lambda x: is_during_event(x["year"], x["month"]), axis=1)].drop(columns=["year", "month"]).groupby("author").mean()
df_user_embedding_after_event = df_user_embedding[df_user_embedding.progress_apply(lambda x: is_after_event(x["year"], x["month"]), axis=1)].drop(columns=["year", "month"]).groupby("author").mean()

# only keep users that are present in all 3 periods
df_user_list = df_user_embedding_before_event.index.intersection(df_user_embedding_during_event.index).intersection(df_user_embedding_after_event.index)
df_user_embedding_before_event = df_user_embedding_before_event.loc[df_user_list]
df_user_embedding_during_event = df_user_embedding_during_event.loc[df_user_list]
df_user_embedding_after_event = df_user_embedding_after_event.loc[df_user_list]

100%|██████████| 3073047/3073047 [00:10<00:00, 304337.50it/s]
100%|██████████| 3073047/3073047 [00:10<00:00, 306316.86it/s]
100%|██████████| 3073047/3073047 [00:09<00:00, 308586.91it/s]


In [16]:
df_user_type_change = pd.DataFrame()

df_user_type_change["before"] = pd.cut(
    df_user_embedding_before_event["point_soccer"],
    bins=[0, 0.001, 0.5, 1.0], 
    labels=["rarely", "sometimes", "often"],
    right=True, 
    include_lowest=True
)

df_user_type_change["during"] = pd.cut(
    df_user_embedding_during_event["point_soccer"],
    bins=[0, 0.001, 0.5, 1.0], 
    labels=["rarely", "sometimes", "often"],
    right=True, 
    include_lowest=True
)

df_user_type_change["after"] = pd.cut(
    df_user_embedding_after_event["point_soccer"],
    bins=[0, 0.001, 0.5, 1.0], 
    labels=["rarely", "sometimes", "often"],
    right=True, 
    include_lowest=True
)

In [33]:
import plotly.graph_objects as go

def create_sankey_diagram(df_user_type_change):
    # Define labels in the desired order
    labels = [
        "often_before", "sometimes_before", "rarely_before",
        "often_during", "sometimes_during", "rarely_during",
        "often_after", "sometimes_after", "rarely_after"
    ]
    
    # Create color mappings for different fan levels
    color_mapping = {
        'often': '#1f77b4',     # Blue
        'sometimes': '#ff7f0e',  # Orange
        'rarely': '#2ca02c'     # Green
    }
    
    # Create node colors
    node_colors = [color_mapping[level.split("_")[0]] for level in labels]
    
    # Create source, target, and value lists for the flows
    source = []
    target = []
    value = []
    link_colors = []
    
    # Generate indices mapping
    label_to_index = {label: idx for idx, label in enumerate(labels)}
    
    # Create flows between time periods
    for fan_level_a in ["often", "sometimes", "rarely"]:
        for fan_level_b in ["often", "sometimes", "rarely"]:
            for tag_1, tag_2 in zip(df_user_type_change.columns, df_user_type_change.columns[1:]):
                flow_value = len(
                    df_user_type_change[
                        (df_user_type_change[tag_1] == fan_level_a) & 
                        (df_user_type_change[tag_2] == fan_level_b)
                    ]
                )
                
                source.append(label_to_index[f"{fan_level_a}_{tag_1}"])
                target.append(label_to_index[f"{fan_level_b}_{tag_2}"])
                value.append(flow_value)
                
                # Add color based on source node
                base_color = color_mapping[fan_level_a]
                # Convert hex to rgba with 0.6 opacity
                rgba_color = f"rgba{tuple(int(base_color.lstrip('#')[i:i+2], 16) for i in (0, 2, 4)) + (0.6,)}"
                link_colors.append(rgba_color)
    
    # Create the figure
    fig = go.Figure(data=[go.Sankey(
        node = dict(
            pad = 15,
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = labels,
            color = node_colors
        ),
        link = dict(
            source = source,
            target = target,
            value = value,
            color = link_colors
        )
    )])
    
    # Update layout
    fig.update_layout(
        title_text="User Type Changes Over Time",
        font_size=12,
        width=1000,
        height=600,
    )
    
    return fig


create_sankey_diagram(df_user_type_change)

## Analysis on 2019 NBA Finals

In [37]:
periods_event_before = [(2019, 3), (2019, 4)]
periods_event_during = [(2019, 5), (2019, 6)]
periods_event_after = [(2019, 7), (2019, 8)]


def is_during_event(year, month):
    return any((year, month) == event_period for event_period in periods_event_during)

def is_before_event(year, month):
    return any((year, month) == event_period for event_period in periods_event_before)

def is_after_event(year, month):
    return any((year, month) == event_period for event_period in periods_event_after)


df_user_embedding_before_event = df_user_embedding[df_user_embedding.progress_apply(lambda x: is_before_event(x["year"], x["month"]), axis=1)].drop(columns=["year", "month"]).groupby("author").mean()
df_user_embedding_during_event = df_user_embedding[df_user_embedding.progress_apply(lambda x: is_during_event(x["year"], x["month"]), axis=1)].drop(columns=["year", "month"]).groupby("author").mean()
df_user_embedding_after_event = df_user_embedding[df_user_embedding.progress_apply(lambda x: is_after_event(x["year"], x["month"]), axis=1)].drop(columns=["year", "month"]).groupby("author").mean()

# only keep users that are present in all 3 periods
df_user_list = df_user_embedding_before_event.index.intersection(df_user_embedding_during_event.index).intersection(df_user_embedding_after_event.index)
df_user_embedding_before_event = df_user_embedding_before_event.loc[df_user_list]
df_user_embedding_during_event = df_user_embedding_during_event.loc[df_user_list]
df_user_embedding_after_event = df_user_embedding_after_event.loc[df_user_list]

100%|██████████| 3073047/3073047 [00:10<00:00, 297066.82it/s]
100%|██████████| 3073047/3073047 [00:10<00:00, 299910.11it/s]
100%|██████████| 3073047/3073047 [00:10<00:00, 304389.59it/s]


In [38]:
df_user_type_change = pd.DataFrame()

df_user_type_change["before"] = pd.cut(
    df_user_embedding_before_event["point_basketball"],
    bins=[0, 0.001, 0.5, 1.0], 
    labels=["rarely", "sometimes", "often"],
    right=True, 
    include_lowest=True
)

df_user_type_change["during"] = pd.cut(
    df_user_embedding_during_event["point_basketball"],
    bins=[0, 0.001, 0.5, 1.0], 
    labels=["rarely", "sometimes", "often"],
    right=True, 
    include_lowest=True
)

df_user_type_change["after"] = pd.cut(
    df_user_embedding_after_event["point_basketball"],
    bins=[0, 0.001, 0.5, 1.0], 
    labels=["rarely", "sometimes", "often"],
    right=True, 
    include_lowest=True
)

In [39]:
import plotly.graph_objects as go

def create_sankey_diagram(df_user_type_change):
    # Get unique labels
    labels = []
    for period in ['before', 'during', 'after']:
        for level in ['often', 'sometimes', 'rarely']:
            labels.append(f"{level}_{period}")
    
    # Create color mappings for different fan levels
    color_mapping = {
        'often': '#1f77b4',     # Blue
        'sometimes': '#ff7f0e',  # Orange
        'rarely': '#2ca02c'     # Green
    }
    
    # Create node colors
    node_colors = []
    for _ in range(3):  # Three time periods
        for level in ['often', 'sometimes', 'rarely']:
            node_colors.append(color_mapping[level])
    
    # Create source, target, and value lists for the flows
    source = []
    target = []
    value = []
    link_colors = []
    
    # Generate indices mapping
    label_to_index = {label: idx for idx, label in enumerate(labels)}
    
    # Create flows between time periods
    for fan_level_a in ["often", "sometimes", "rarely"]:
        for fan_level_b in ["often", "sometimes", "rarely"]:
            for tag_1, tag_2 in zip(df_user_type_change.columns, df_user_type_change.columns[1:]):
                flow_value = len(
                    df_user_type_change[
                        (df_user_type_change[tag_1] == fan_level_a) & 
                        (df_user_type_change[tag_2] == fan_level_b)
                    ]
                )
                
                source.append(label_to_index[f"{fan_level_a}_{tag_1}"])
                target.append(label_to_index[f"{fan_level_b}_{tag_2}"])
                value.append(flow_value)
                
                # Add color based on source node
                base_color = color_mapping[fan_level_a]
                # Convert hex to rgba with 0.6 opacity
                rgba_color = f"rgba{tuple(int(base_color.lstrip('#')[i:i+2], 16) for i in (0, 2, 4)) + (0.6,)}"
                link_colors.append(rgba_color)
    
    # Create the figure
    fig = go.Figure(data=[go.Sankey(
        node = dict(
            pad = 15,
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = labels,
            color = node_colors
        ),
        link = dict(
            source = source,
            target = target,
            value = value,
            color = link_colors
        )
    )])
    
    # Update layout
    fig.update_layout(
        title_text="User Type Changes Over Time",
        font_size=12,
        width=1000,
        height=600
    )
    
    return fig


create_sankey_diagram(df_user_type_change)

In [41]:
with open("sankey_diagram_basketball_2019.json", "w") as f:
    f.write(create_sankey_diagram(df_user_type_change).to_json())

## Analysis on 2015 Boxing Battle

In [27]:
periods_event_before = [(2015, 4)]
periods_event_during = [(2015, 5)]
periods_event_after = [(2015, 6)]


def is_during_event(year, month):
    return any((year, month) == event_period for event_period in periods_event_during)

def is_before_event(year, month):
    return any((year, month) == event_period for event_period in periods_event_before)

def is_after_event(year, month):
    return any((year, month) == event_period for event_period in periods_event_after)


df_user_embedding_before_event = df_user_embedding[df_user_embedding.progress_apply(lambda x: is_before_event(x["year"], x["month"]), axis=1)].drop(columns=["year", "month"]).groupby("author").mean()
df_user_embedding_during_event = df_user_embedding[df_user_embedding.progress_apply(lambda x: is_during_event(x["year"], x["month"]), axis=1)].drop(columns=["year", "month"]).groupby("author").mean()
df_user_embedding_after_event = df_user_embedding[df_user_embedding.progress_apply(lambda x: is_after_event(x["year"], x["month"]), axis=1)].drop(columns=["year", "month"]).groupby("author").mean()

# only keep users that are present in all 3 periods
df_user_list = df_user_embedding_before_event.index.intersection(df_user_embedding_during_event.index).intersection(df_user_embedding_after_event.index)
df_user_embedding_before_event = df_user_embedding_before_event.loc[df_user_list]
df_user_embedding_during_event = df_user_embedding_during_event.loc[df_user_list]
df_user_embedding_after_event = df_user_embedding_after_event.loc[df_user_list]

100%|██████████| 3073047/3073047 [00:09<00:00, 310589.72it/s]
100%|██████████| 3073047/3073047 [00:09<00:00, 312615.70it/s]
100%|██████████| 3073047/3073047 [00:09<00:00, 309750.73it/s]


In [28]:
df_user_type_change = pd.DataFrame()

df_user_type_change["before"] = pd.cut(
    df_user_embedding_before_event["point_boxing"],
    bins=[0, 0.001, 0.5, 1.0], 
    labels=["rarely", "sometimes", "often"],
    right=True, 
    include_lowest=True
)

df_user_type_change["during"] = pd.cut(
    df_user_embedding_during_event["point_boxing"],
    bins=[0, 0.001, 0.5, 1.0], 
    labels=["rarely", "sometimes", "often"],
    right=True, 
    include_lowest=True
)

df_user_type_change["after"] = pd.cut(
    df_user_embedding_after_event["point_boxing"],
    bins=[0, 0.001, 0.5, 1.0], 
    labels=["rarely", "sometimes", "often"],
    right=True, 
    include_lowest=True
)

In [35]:
import plotly.graph_objects as go

def create_sankey_diagram(df_user_type_change):
    # Get unique labels
    labels = []
    for period in ['before', 'during', 'after']:
        for level in ['often', 'sometimes', 'rarely']:
            labels.append(f"{level}_{period}")
    
    # Create color mappings for different fan levels
    color_mapping = {
        'often': '#1f77b4',     # Blue
        'sometimes': '#ff7f0e',  # Orange
        'rarely': '#2ca02c'     # Green
    }
    
    # Create node colors
    node_colors = []
    for _ in range(3):  # Three time periods
        for level in ['often', 'sometimes', 'rarely']:
            node_colors.append(color_mapping[level])
    
    # Create source, target, and value lists for the flows
    source = []
    target = []
    value = []
    link_colors = []
    
    # Generate indices mapping
    label_to_index = {label: idx for idx, label in enumerate(labels)}
    
    # Create flows between time periods
    for fan_level_a in ["often", "sometimes", "rarely"]:
        for fan_level_b in ["often", "sometimes", "rarely"]:
            for tag_1, tag_2 in zip(df_user_type_change.columns, df_user_type_change.columns[1:]):
                flow_value = len(
                    df_user_type_change[
                        (df_user_type_change[tag_1] == fan_level_a) & 
                        (df_user_type_change[tag_2] == fan_level_b)
                    ]
                )
                
                source.append(label_to_index[f"{fan_level_a}_{tag_1}"])
                target.append(label_to_index[f"{fan_level_b}_{tag_2}"])
                value.append(flow_value)
                
                # Add color based on source node
                base_color = color_mapping[fan_level_a]
                # Convert hex to rgba with 0.6 opacity
                rgba_color = f"rgba{tuple(int(base_color.lstrip('#')[i:i+2], 16) for i in (0, 2, 4)) + (0.6,)}"
                link_colors.append(rgba_color)
    
    # Create the figure
    fig = go.Figure(data=[go.Sankey(
        node = dict(
            pad = 15,
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = labels,
            color = node_colors
        ),
        link = dict(
            source = source,
            target = target,
            value = value,
            color = link_colors
        )
    )])
    
    # Update layout
    fig.update_layout(
        title_text="User Type Changes Over Time",
        font_size=12,
        width=1000,
        height=600
    )
    
    return fig


create_sankey_diagram(df_user_type_change)

In [36]:
with open("sankey_diagram_boxing_2015.json", "w") as f:
    f.write(create_sankey_diagram(df_user_type_change).to_json())

In [None]:
"point_basketball"に

この章では、コメントデータを用いて、各ユーザーが様々なスポーツの動画へのエンゲージメントをどのように変化させたかをトラッキングする。

コメントデータと動画メタデータを統合することにより、各ユーザーが各スポーツに持っている興味のレベルを数値化し（つまり、各ユーザーを各スポーツへの興味レベルのベクトルとして表現できる）、各ユーザーがいつどのようなスポーツの動画にコメントを付与したかを取得することができる。例えば、FIFA 2018オリンピックの前、開催中、開催後を通して、各ユーザーがどのようにサッカーの動画に興味をもち、どのようにコメントを残したかを追跡することができるのである。

以下では、2019年までのYouTube上での大きなスポーツイベント（Olympicsなどの複合的なイベントでは、個別のスポーツの影響が混ざり合ってしまうため、今回は単種目のスポーツイベントのみを扱うことにした）3つ、つまり、2018年のFIFA World Cup (サッカー), 2019年のNBA Final (バスケ), そして、2015年のFloyd Mayweather vs. Manny Pacquiaoの試合 (ボクシング) におけるユーザーエンゲージメントの変化を評価した。

各分析において、まず、全ユーザーをそのスポーツに興味を持っているレベルに応じて、3段階に分類した。1つ目のレベルは、コメントの大部分をそのスポーツにのみ行っており、非常に強いファンと思われるグループ ("often")。2つ目は、コメントの一部をそのスポーツに行っているが、他のスポーツにもコメントを残しており、ファンとまでは言えないが、そのスポーツに対して興味を持っているグループ ("sometimes")。そして、3つ目が、そのスポーツの動画にコメントをしたことがなく、そのスポーツにほとんど興味を持っていないと思われるユーザーグループ ("seldom")。

以下の分析では、各イベントの前 ("before")、開催中 ("during") 、そして開催後 ("after") において、各グループのユーザーがどのように変化し、またユーザーがグループ間をどのように移動したかを確認した。以下の分析は、8.6Bにものぼる、全てのコメントデータをもとに計算された。


1つ目は、FIFA World Cup 2018における分析結果である。緑色で表された大きなグループは、サッカーの動画に対してその期間にコメントを全く残さなかったユーザーのグループであり、サッカーへの興味がほとんどないユーザーとして捉えることができる。同様に、オレンジのグループは、サッカーへの興味が少しあり、青のグループは、サッカーの動画に非常に強い興味を持っているユーザーである。

この分析から読み取れるのは、まず、ワールドカップの前後を通して、サッカーの動画に非常に強い興味を示したユーザーの数はほとんど変わらなかったということである。一方で、オレンジのサッカーへの興味が少しあるグループは二倍近くに増加し、その大部分は、開催前にはサッカーにほとんど興味を示さなかったユーザーグループである。この結果は、ワールドカップがサッカーファンのみならず、普段はサッカーを見ないユーザーの注目・興味を集めていることを意味している。これは、実際に、ワールドカップになると、普段は混み合っていないスポーツバーに人が溢れかえることからも想像できる結果である。一方で、ワールドカップが終わった一番右の部分では、サッカーに一定以上の興味を持つ、青とオレンジのグループのユーザー数は、開催前と同じ水準に戻っており、ワールドカップ中にサッカーに興味を持ったユーザーが定着したわけではないことを示している。


2つ目と3つ目は、NBA Final Game 2019における分析結果である。こちらにおいても、緑、オレンジ、青のグループの意味は同じである。分析の結果、サッカーのFIFA World Cupの場合と同じように、開催期間中は、普段そのスポーツへの興味を持たないユーザーの注目を集めることはできたが、開催後にはそのグループはまたそのスポーツを見なくなってしまった。

これらの分析から読み取れるのは、ユーザーの興味をそのスポーツに惹きつける方法として、大きなイベントは一過性の効果しか持たず、長期的にはほとんど効果を持たないということである。そのスポーツの振興という観点から言えば、華々しいイベントによってユーザーの興味を惹きつけることだけでは不十分であることが言える。今回の分析では、シンプルにそのスポーツのタグのみを使って動画を抽出して分析を行ったが、Future workとして、より詳細な動画分類により、より精緻な分析結果を得ることを考えている。