In [None]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import seaborn as sns
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from src.utils.data_utils import get_related_videos_with_keywords, keyword_searcher
from src.utils.general_utils import plot_wordcloud
from src.utils.evaluation_utils import diff_in_diff
from sklearn.linear_model import LinearRegression

In [None]:
# Root folder for the data files (relative path, resolved against the working directory).
DATA_PATH = "../data"

# Load the target-sport dataset that the cells below analyse.
target_sport_data = pd.read_parquet(
    f"{DATA_PATH}/target_sport_data_mt.parquet",
    engine="fastparquet",
)

In [None]:
# Sports analysed one by one in the driver loop; each name is matched against video tags.
target_sports = [
    "football",
    "basketball",
    "wrestling",
    "soccer",
    "boxing",
    "hockey",
    "mma",
    "golf",
    "baseball",
    "tennis",
    "cricket",
    "rugby",
    "gymnastics",
    "volleyball",
    "skating",
    "karate",
]

In [None]:
def _amplitude_spectrum(counts):
    """Return ``(periods, amplitudes)`` for the positive, non-zero FFT frequencies of ``counts``.

    Amplitudes are ``|FFT| / len(counts)``; periods are ``1 / frequency`` in
    units of samples (one sample per row of ``counts``).
    """
    n_samples = len(counts)
    amplitudes = np.abs(np.fft.fft(np.asarray(counts))) / n_samples
    frequencies = np.fft.fftfreq(n_samples)
    # Index 0 is the zero-frequency (mean) term, whose period would be infinite,
    # so it is sliced away *before* inverting. The strictly positive frequencies
    # sit at indices 1 .. (n_samples - 1) // 2; using (n_samples + 1) // 2 as the
    # upper bound keeps the last of them for odd-length input as well.
    positive = slice(1, (n_samples + 1) // 2)
    return 1 / frequencies[positive], amplitudes[positive]


def sport_analysis(data_all, data_sport, sport_name=None, amplitude_threshold=10):
    """Plot upload volume, periodicity, upload share and engagement for one sport.

    Four figures are shown and a few diagnostics are printed:

    1. Monthly upload counts of ``data_sport`` (plus the lag-1 autocorrelation).
    2. Stem plot of the FFT amplitude spectrum of those counts, by period.
    3. Share of ``data_sport`` uploads among all ``data_all`` uploads, per month.
    4. Monthly mean and median of ``engagement_rate`` in ``data_sport``.

    Parameters
    ----------
    data_all : pandas.DataFrame
        All videos; must contain an ``upload_year_month`` column.
    data_sport : pandas.DataFrame
        Videos of the sport of interest; must contain ``upload_year_month``
        and ``engagement_rate`` columns.
    sport_name : str, optional
        Name inserted into plot titles and labels.
    amplitude_threshold : float, optional
        Minimum spectrum amplitude for a period to be printed/plotted
        (default 10, the value previously hard-coded).

    Returns
    -------
    None

    Notes
    -----
    Months without any upload are absent from the grouped counts, so the
    autocorrelation and the FFT treat the remaining months as consecutive
    samples.
    NOTE(review): confirm the series has no gaps, or reindex to a full monthly
    range before trusting the reported periods.
    """
    monthly_count = data_sport.groupby('upload_year_month').size()

    # Nothing to analyse: plotting an empty series and a zero-length FFT both
    # raise, so report it and stop instead of aborting the caller's loop.
    if monthly_count.empty:
        print("No uploads found for {}; skipping analysis.".format(sport_name))
        return

    # ------------------------------
    # 1. Monthly upload volume
    fig, ax = plt.subplots(figsize=(12, 6))
    monthly_count.plot(ax=ax, kind='line', colormap='tab20', linewidth=2)
    ax.set_title('Monthly Upload Counts for {}'.format(sport_name), fontsize=16)
    ax.set_xlabel('Month', fontsize=14)
    ax.set_ylabel('Number of Uploads', fontsize=14)
    ax.grid(True)
    fig.tight_layout()
    plt.show()

    autocorrelation = monthly_count.autocorr(lag=1)
    print("Autocorrelation: ", autocorrelation)

    # ------------------------------
    # 2. Periodicity of the upload counts (Fourier amplitude spectrum)
    print("periodic analysis")
    print("len", len(monthly_count))
    periods, amplitudes = _amplitude_spectrum(monthly_count)

    # Keep only components whose amplitude reaches the threshold.
    is_strong = amplitudes >= amplitude_threshold
    filtered_periods = periods[is_strong]
    filtered_amplitudes = amplitudes[is_strong]
    print(filtered_periods)
    print(filtered_amplitudes)

    # The first surviving component (the longest period, since periods are in
    # decreasing order) is printed above but left out of the plot.
    # NOTE(review): presumably meant to hide the slow overall trend - confirm.
    filtered_periods = filtered_periods[1:]
    filtered_amplitudes = filtered_amplitudes[1:]

    fig, ax = plt.subplots(figsize=(12, 8))
    # Only draw stems when something passed the filter; with no points there is
    # nothing to span the baseline over, so the axes are simply left empty.
    if len(filtered_periods) > 0:
        ax.stem(filtered_periods, filtered_amplitudes, linefmt='b-', markerfmt='bo', basefmt='r-')
    ax.set_title('Frequency Spectrum, {}'.format(sport_name), fontsize=16)
    ax.set_xlabel('Period (Months)', fontsize=14)
    ax.set_ylabel('Amplitude', fontsize=14)
    ax.set_xlim(0, 60)
    ax.set_ylim(0, 1000)
    ax.grid(True)
    # Fine-grained x ticks: one every 3 months.
    ax.set_xticks(np.arange(0, 60 + 1, step=3))
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()

    # ------------------------------
    # 3. Share of this sport among all uploads, per month
    total_monthly_count = data_all.groupby('upload_year_month').size()
    # Months present in data_all but missing for this sport become NaN after
    # index alignment; they count as a share of 0.
    sport_share = (monthly_count / total_monthly_count).fillna(0)

    fig, ax = plt.subplots(figsize=(12, 6))
    sport_share.plot(
        ax=ax,
        kind='line',
        colormap='tab20',
        linewidth=2,
        # Explicit label so the legend below has an entry to show.
        label='Proportion of {} related video'.format(sport_name),
    )
    ax.set_title('{} Video Upload Trends'.format(sport_name))
    ax.set_xlabel('Month')
    ax.set_ylabel('Proportion of {} related video'.format(sport_name))
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend()
    ax.grid()
    fig.tight_layout()
    plt.show()

    # ------------------------------
    # 4. User engagement per month
    monthly_engagement = (
        data_sport.groupby('upload_year_month')['engagement_rate']
        .agg(['mean', 'median'])
        .reset_index()
    )
    month_labels = monthly_engagement['upload_year_month'].astype(str)
    # Thin out the x labels: show every 4th month only.
    tick_positions = np.arange(0, len(month_labels), 4)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(month_labels, monthly_engagement['mean'], marker='o', label='Mean Engagement Rate')
    ax.plot(month_labels, monthly_engagement['median'], marker='s', label='Median Engagement Rate', linestyle='--')
    ax.set_title('Monthly Engagement Rate of {}'.format(sport_name))
    ax.set_xlabel('Month')
    ax.set_ylabel('Engagement Rate')
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(month_labels.iloc[tick_positions], rotation=45)
    ax.legend()
    ax.grid()
    ax.set_ylim(0, 0.05)
    fig.tight_layout()
    plt.show()

    # ------------------------------
    # TODO: monthly mean/median "q score" plot, where
    #       q_score = (like_count - dislike_count) / (like_count + dislike_count).
    # TODO: how many channels are active over time?

In [None]:
# Run the full analysis once per sport, restricted to the videos tagged with it.
for sport in target_sports:
    # `sport in tags` is a substring test if the column holds strings and a
    # membership test if it holds collections - the column type is not visible here.
    tag_matches = target_sport_data['tags'].apply(lambda tags: sport in tags)
    df_sport_category_metadata = target_sport_data[tag_matches]
    sport_analysis(target_sport_data, df_sport_category_metadata, sport)