In [7]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import seaborn as sns
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from src.utils.data_utils import get_related_videos_with_keywords, keyword_searcher
from src.utils.general_utils import plot_wordcloud
from src.utils.evaluation_utils import diff_in_diff
from sklearn.linear_model import LinearRegression

In [9]:
# Folder containing the project's data files, given as a relative path.
DATA_PATH = "../data"

# Load the merged timeseries/metadata parquet file into a DataFrame,
# explicitly selecting the fastparquet engine.
merged_timeseries_md = pd.read_parquet(
    path=DATA_PATH + "/MergedTimeseriesMetadata.parquet",
    engine="fastparquet",
)

In [11]:
# Sanity check after loading: display the table's (rows, columns) dimensions.
merged_timeseries_md.shape

(2340291, 19)

In [1]:
from datetime import datetime

# Major sports events, two per sport (one in 2005, one in 2018).
#
# Each entry maps an event identifier to:
#   'sport'    - lowercase sport label
#   'start'    - first day of the event window (datetime at midnight)
#   'end'      - last day of the event window (equal to 'start' for one-day events)
#   'keywords' - lowercase search phrases associated with the event (no duplicates)
#
# NOTE(review): several keyword lists contain year-agnostic phrases that are
# shared between the 2005 and 2018 entries (e.g. 'nba finals', 'the masters');
# depending on how the keyword search matches, these may pull in videos from
# any year - confirm this is intended.
# NOTE(review): only the windows marked "dates corrected" below were fixed in
# this revision; the remaining windows should be verified before being used
# as treatment periods.
events = {
    # Football (American)
    'SuperBowl_XXXIX_2005': {
        'sport': 'football',
        'start': datetime(2005, 2, 6),
        'end': datetime(2005, 2, 6),
        'keywords': [
            'super bowl xxxix', 'superbowl xxxix', 'super bowl 39', 'superbowl 39',
            'sb39', 'superbowl39', 'superbowl 2005', 'super bowl 2005'
        ]
    },
    'SuperBowl_LII_2018': {
        'sport': 'football',
        'start': datetime(2018, 2, 4),
        'end': datetime(2018, 2, 4),
        'keywords': [
            'super bowl lii', 'superbowl lii', 'super bowl 52', 'superbowl 52',
            'sb52', 'superbowl52', 'superbowl 2018', 'super bowl 2018'
        ]
    },

    # Basketball
    'NBA_Finals_2005': {
        'sport': 'basketball',
        # Dates corrected: the series ran from Game 1 (June 9) to Game 7 (June 23).
        'start': datetime(2005, 6, 9),
        'end': datetime(2005, 6, 23),
        'keywords': [
            'nba finals 2005', 'nba finals', 'nbafinals2005', 'nba finals 2004-2005',
            'finals nba', 'nba_final_2005'
        ]
    },
    'NBA_Finals_2018': {
        'sport': 'basketball',
        # Dates corrected: the series ran from May 31 to June 8.
        'start': datetime(2018, 5, 31),
        'end': datetime(2018, 6, 8),
        'keywords': [
            'nba finals 2018', 'nba finals', 'nbafinals2018', 'nba finals 2017-2018',
            'finals nba', 'nba_final_2018'
        ]
    },

    # Wrestling
    'WrestleMania_21_2005': {
        'sport': 'wrestling',
        'start': datetime(2005, 4, 3),
        'end': datetime(2005, 4, 3),
        'keywords': [
            'wrestlemania 21', 'wm21', 'wrestle mania 21',
            'wrestlemania21', 'wrestle mania21'
        ]
    },
    'WrestleMania_34_2018': {
        'sport': 'wrestling',
        'start': datetime(2018, 4, 8),
        'end': datetime(2018, 4, 8),
        'keywords': [
            'wrestlemania 34', 'wm34', 'wrestle mania 34',
            'wrestlemania34', 'wrestle mania34'
        ]
    },

    # Soccer
    'UEFA_Champions_League_Final_2005': {
        'sport': 'soccer',
        'start': datetime(2005, 5, 25),
        'end': datetime(2005, 5, 25),
        'keywords': [
            'uefa champions league final 2005', 'champions league final 2005',
            'ucl final 2005', 'uefa cl final', 'championsleaguefinal2005'
        ]
    },
    'FIFA_WorldCup_2018': {
        'sport': 'soccer',
        'start': datetime(2018, 6, 14),
        'end': datetime(2018, 7, 15),
        'keywords': [
            'fifa world cup 2018', 'world cup 2018', 'fifa2018', 'worldcup2018',
            'fifa worldcup 2018', 'world cup', 'fifa wc2018'
        ]
    },

    # Boxing
    # Example placeholder; actual high-profile fights can be added.
    # NOTE(review): the key mentions McGregor but the keywords describe a
    # generic Mayweather fight; the date does not appear to match a real bout.
    # Key kept unchanged for backward compatibility.
    'Mayweather_McGregor_Fight_2005': {
        'sport': 'boxing',
        'start': datetime(2005, 7, 10),
        'end': datetime(2005, 7, 10),
        'keywords': [
            'floyd mayweather fight', 'mayweather fight 2005', 'mayweather vs opponent',
            'mayweather2005', 'floyd mayweather2005'
        ]
    },
    # NOTE(review): the Joshua vs Klitschko bout is believed to have taken
    # place in April 2017, not 2018 - confirm and replace with a 2018 fight
    # if a same-year comparison is required.
    'Joshua_Klitschko_Fight_2018': {
        'sport': 'boxing',
        'start': datetime(2018, 4, 21),
        'end': datetime(2018, 4, 21),
        'keywords': [
            'anthony joshua vs wladimir klitschko', 'joshua klitschko fight',
            'joshua vs klitschko', 'anthonyjoshua klitschko2018',
            'joshua_vs_klitschko_2018'
        ]
    },

    # Hockey
    # NOTE(review): the 2004-05 NHL season is believed to have been cancelled
    # (lockout), so no Stanley Cup would have been awarded in 2005 - confirm
    # whether this entry should be kept.
    'StanleyCup_Devils_2005': {
        'sport': 'hockey',
        'start': datetime(2005, 6, 9),
        'end': datetime(2005, 6, 9),
        'keywords': [
            'stanley cup 2005', 'new jersey devils stanley cup', 'devils win stanley cup',
            'stanleycup2005', 'newjerseydevilsstanleycup'
        ]
    },
    'StanleyCup_Capitals_2018': {
        'sport': 'hockey',
        'start': datetime(2018, 6, 7),
        'end': datetime(2018, 6, 7),
        'keywords': [
            'stanley cup 2018', 'washington capitals stanley cup', 'capitals win stanley cup',
            'stanleycup2018', 'washingtoncapitalsstanleycup'
        ]
    },

    # MMA
    'UFC_52_2005': {
        'sport': 'mma',
        'start': datetime(2005, 4, 16),
        'end': datetime(2005, 4, 16),
        # Keywords corrected: the UFC 52 main event was Couture vs Liddell
        # (previously listed as "henderson", with a "couche" typo).
        'keywords': [
            'ufc 52', 'ufc52', 'ufc 52 couture vs liddell', 'ufc52 fight',
            'ufc_52', 'ufc52couturevsliddell'
        ]
    },
    'UFC_229_2018': {
        'sport': 'mma',
        'start': datetime(2018, 10, 6),
        'end': datetime(2018, 10, 6),
        'keywords': [
            'ufc 229', 'ufc229', 'ufc 229 khabib vs mcgregor', 'ufc229 fight',
            'ufc_229', 'ufc229khabibvsmcgregor'
        ]
    },

    # Golf
    'Masters_2005': {
        'sport': 'golf',
        'start': datetime(2005, 4, 7),
        'end': datetime(2005, 4, 10),
        'keywords': [
            'masters tournament 2005', 'the masters 2005', 'masters2005',
            'masters tournament', 'the masters'
        ]
    },
    'Masters_2018': {
        'sport': 'golf',
        'start': datetime(2018, 4, 5),
        'end': datetime(2018, 4, 8),
        'keywords': [
            'masters tournament 2018', 'the masters 2018', 'masters2018',
            'masters tournament', 'the masters'
        ]
    },

    # Baseball
    'WorldSeries_2005': {
        'sport': 'baseball',
        # Dates corrected: the four-game series was played October 22-26.
        'start': datetime(2005, 10, 22),
        'end': datetime(2005, 10, 26),
        'keywords': [
            'mlb world series 2005', 'world series 2005', 'white sox vs astros',
            'worldseries2005', 'mlbworldseries2005'
        ]
    },
    'WorldSeries_2018': {
        'sport': 'baseball',
        # Dates corrected: the five-game series was played October 23-28.
        'start': datetime(2018, 10, 23),
        'end': datetime(2018, 10, 28),
        'keywords': [
            'mlb world series 2018', 'world series 2018', 'red sox vs dodgers',
            'worldseries2018', 'mlbworldseries2018'
        ]
    },

    # Tennis
    'Wimbledon_2005': {
        'sport': 'tennis',
        'start': datetime(2005, 6, 20),
        'end': datetime(2005, 7, 3),
        'keywords': [
            'wimbledon 2005', 'wimbledon2005', 'wimbledon finals 2005',
            'wimbledon tournament 2005', 'the championships 2005'
        ]
    },
    'Wimbledon_2018': {
        'sport': 'tennis',
        # Dates corrected: the 2018 tournament ran July 2-15.
        'start': datetime(2018, 7, 2),
        'end': datetime(2018, 7, 15),
        'keywords': [
            'wimbledon 2018', 'wimbledon2018', 'wimbledon finals 2018',
            'wimbledon tournament 2018', 'the championships 2018'
        ]
    },

    # Cricket
    'ICC_SuperSeries_2005': {
        'sport': 'cricket',
        'start': datetime(2005, 7, 2),
        'end': datetime(2005, 8, 10),
        'keywords': [
            'icc super series 2005', 'super series 2005', 'icc super series',
            'icc_super_series_2005', 'cricket super series 2005'
        ]
    },
    'ICC_WorldCupQual_2018': {
        'sport': 'cricket',
        'start': datetime(2018, 3, 7),
        'end': datetime(2018, 3, 19),
        'keywords': [
            'icc cricket world cup qualifier 2018', 'world cup qualifier 2018',
            'iccwcqualifier2018', 'cricket world cup qualifier',
            'worldcupqualifier2018', 'icc world cup qualifier'
        ]
    },

    # Rugby
    'TriNations_2005': {
        'sport': 'rugby',
        'start': datetime(2005, 7, 2),
        'end': datetime(2005, 8, 13),
        'keywords': [
            'tri nations tournament 2005', 'tri-nations 2005', 'rugby tri nations 2005',
            'tri_nations_2005', 'rugby tri-nations 2005'
        ]
    },
    'Rugby_Championship_2018': {
        'sport': 'rugby',
        'start': datetime(2018, 8, 3),
        'end': datetime(2018, 10, 27),
        'keywords': [
            'rugby championship 2018', 'rugby championship', 'rugbychampionship2018',
            'rugby_championship_2018', 'rugby championship tournament 2018'
        ]
    },

    # Gymnastics
    'World_Gymnastics_2005': {
        'sport': 'gymnastics',
        'start': datetime(2005, 10, 20),
        'end': datetime(2005, 10, 30),
        'keywords': [
            'world gymnastics championships 2005', 'world gymnastics 2005',
            'gymnastics world championships 2005', 'world_gymnastics_2005',
            'gymnastics world champs 2005'
        ]
    },
    'World_Gymnastics_2018': {
        'sport': 'gymnastics',
        'start': datetime(2018, 10, 30),
        'end': datetime(2018, 11, 4),
        'keywords': [
            'world gymnastics championships 2018', 'world gymnastics 2018',
            'gymnastics world championships 2018', 'world_gymnastics_2018',
            'gymnastics world champs 2018'
        ]
    },

    # Volleyball
    'FIVB_World_League_2005': {
        'sport': 'volleyball',
        'start': datetime(2005, 5, 7),
        'end': datetime(2005, 7, 3),
        'keywords': [
            'fivb world league 2005', 'fivbworldleague2005', 'volleyball world league 2005',
            'fivb_world_league_2005', 'world league volleyball 2005'
        ]
    },
    'FIVB_World_Championship_2018': {
        'sport': 'volleyball',
        'start': datetime(2018, 9, 9),
        'end': datetime(2018, 9, 29),
        'keywords': [
            'fivb volleyball world championship 2018', 'fivb world championship 2018',
            'volleyball world championship 2018', 'fivb_world_championship_2018',
            'world championship volleyball 2018'
        ]
    },

    # Skating
    'Figure_Skating_Grand_Prix_2005': {
        'sport': 'skating',
        'start': datetime(2005, 10, 14),
        'end': datetime(2005, 12, 19),
        'keywords': [
            'figure skating grand prix 2005', 'grand prix figure skating 2005',
            'figure_skating_grand_prix_2005', 'skating grand prix 2005',
            'figure skating gp2005'
        ]
    },
    'Winter_Olympics_Figure_Skating_2018': {
        'sport': 'skating',
        'start': datetime(2018, 2, 9),
        'end': datetime(2018, 2, 25),
        'keywords': [
            'winter olympics figure skating 2018', 'figure skating pyeongchang 2018',
            'winter olympic figure skating', 'figure_skating_winter_olympics_2018',
            'figure skating pyeongchang'
        ]
    },

    # Karate
    # NOTE(review): the World Karate Championships appear to be held in even
    # years - confirm that a 2005 edition actually took place.
    'World_Karate_Championships_2005': {
        'sport': 'karate',
        'start': datetime(2005, 10, 23),
        'end': datetime(2005, 10, 29),
        'keywords': [
            'world karate championships 2005', 'world karate 2005',
            'karate world championships 2005', 'world_karate_2005',
            'karate world champs 2005'
        ]
    },
    'World_Karate_Championships_2018': {
        'sport': 'karate',
        'start': datetime(2018, 11, 9),
        'end': datetime(2018, 11, 11),
        'keywords': [
            'world karate championships 2018', 'world karate 2018',
            'karate world championships 2018', 'world_karate_2018',
            'karate world champs 2018'
        ]
    }
}


In [5]:
# Spot-check one entry to confirm the structure of the events mapping
# (sport, start/end datetimes, keyword list).
events['Wimbledon_2005']

{'sport': 'tennis',
 'start': datetime.datetime(2005, 6, 20, 0, 0),
 'end': datetime.datetime(2005, 7, 3, 0, 0),
 'keywords': ['wimbledon 2005',
  'wimbledon2005',
  'wimbledon finals 2005',
  'wimbledon tournament 2005',
  'the championships 2005']}