In [1]:
import pandas as pd
from nltk.corpus import stopwords
import os
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
import seaborn as sns

warnings.filterwarnings('ignore')

In [2]:
def get_data(data_file, list_of_words, stop, theme='', chunksize=10**6):
    for i,chunk in enumerate(pd.read_json(data_file, lines=True, chunksize=chunksize)):
        #chunk = chunk.loc[chunk['categories'] == 'News & Politics']
        chunk = chunk[chunk.categories.isin(['News & Politics'])]
        chunk["video_info"] = chunk['title'].astype(str) +": "+ chunk["description"]
        # drop these columns to conserve space    
        chunk = chunk.drop(['title'],  axis=1)
        chunk = chunk.drop(['description'], axis=1)

        chunk['video_info'] = chunk['video_info'].str.lower()

        # Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
        chunk['video_info'] = chunk['video_info'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

        # converting upload_time to datetime object so we can begin to slice dataframe
        chunk['upload_date'] = pd.to_datetime(chunk['upload_date'], format='%Y-%m-%d').dt.date
        chunk['upload_date'] = pd.to_datetime(chunk['upload_date'], format='%Y-%m-%d')
        chunk['upload_date'] = chunk['upload_date'].dt.to_period('M')

        msk = chunk.video_info.str.contains('|'.join(list_of_words), case=False)

        if len(msk)!=0:
            chunk = chunk[msk]

            num_videos = chunk.groupby(by='upload_date')['upload_date'].agg('count')
            total_view = chunk.groupby(by='upload_date')['view_count'].sum()
            df = pd.concat(
                {'num_videos': num_videos,
                'total_views': total_view},
                axis=1)

            compression_opts = dict(method='zip',archive_name=f'{theme}_{i}.csv') 
            df.to_csv(PATH_TO_SAVE+f'{theme}_{i}.zip', compression=compression_opts)

        if i%10==0:
            print(i)

In [3]:
theme = 'baseball'
PATH_TO_SAVE = '../datasets/sport/'+theme+'/'
PATH_TO_SAVE_IMAGES = '../datasets/figures/sports'
DATA_FILE = '../original_data/yt_metadata_en.jsonl.gz'

def check_create_dir(path):
# If folder doesn't exist, then create it.
    if not os.path.isdir(path):
        os.makedirs(path)
        print("created folder : ", path)
    else:
        print(path, "folder already exists.")

In [4]:
check_create_dir(PATH_TO_SAVE)
check_create_dir(PATH_TO_SAVE_IMAGES)

../datasets/sport/baseball/ folder already exists.
../datasets/figures/sports folder already exists.


In [5]:
stop = stopwords.words('english')
list_words = ['baseball', 'mlb', 'home run']

In [16]:
get_data(DATA_FILE,list_words, theme)

0
10
20
30
40
50
60
70


In [6]:
def merge_sum_df(df1,df2):
    return df1.groupby('upload_date').sum().add(df2.groupby('upload_date').sum(), fill_value=0).reset_index()

In [7]:
def plot(df_, title=''):
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add traces
    fig.add_trace(
        go.Scatter(x=df_['upload_date'], y=df_['num_videos'], name="Number of videos uploaded for {}".format(title)),
        secondary_y=False,
    )

    fig.add_trace(
        go.Scatter(x=df_['upload_date'], y=df_['total_views'], name="Number of views for {}".format(title)),
        secondary_y=True,
    )

    # Add figure title
    fig.update_layout(
        title_text="Number of videos uploaded and number of views over time"
    )

    # Set x-axis title
    fig.update_xaxes(title_text="Date")

    # Set y-axes titles
    fig.update_yaxes(title_text="<b>primary</b> Number of videos", secondary_y=False)
    fig.update_yaxes(title_text="<b>secondary</b> Number of views", secondary_y=True)

    fig.show()

In [8]:
def get_file_names(path_files):
    file_names = []
    for path in os.listdir(path_files):
        # check if current path is a file
        if os.path.isfile(os.path.join(path_files, path)):
            if path[0]=='.':
                continue
            file_names.append(path)
    return file_names


def create_df(path,file_names):
    df_ = pd.read_csv(path+'/'+file_names[0], compression='zip')
    for i,file in enumerate(file_names):
        if i==0:
            continue
        df = pd.read_csv(path+'/'+file, compression='zip')
        df_ = merge_sum_df(df_,df)
    return df_

In [9]:
path_football = '../datasets/sport/world_cup/'
file_names = get_file_names(path_football)
df_football = create_df(path_football,file_names)
plot(df_football, 'World Cup')

In [10]:
path = '../datasets/sport/NBA/'
file_names = get_file_names(path)
df_nba = create_df(path,file_names)
plot(df_nba, 'NBA')

In [11]:
path = '../datasets/sport/Olympics/'
file_names = get_file_names(path)
df_olympics = create_df(path,file_names)
plot(df_olympics, 'Olympics')

In [12]:
path = '../datasets/sport/tennis/'
file_names = get_file_names(path)
df_tennis = create_df(path,file_names)
plot(df_tennis, 'Tennis')

In [13]:
path = '../datasets/sport/baseball/'
file_names = get_file_names(path)
df_baseball = create_df(path,file_names)
plot(df_baseball, 'Baseball')

In [14]:
list_df = [df_football, df_nba, df_olympics, df_tennis, df_baseball]
list_titles = ['World cup', 'NBA', 'Olympics', 'Tennis', 'Baseball']

In [15]:
def plot_all(list_df, list_titles, type='num_videos'):
    fig = go.Figure()
    if type == 'num_videos':
        legend = "Number of videos uploaded"
    if type == 'total_views':
        legend = "Number of views"
    for df_,title in zip(list_df,list_titles):
        # Add traces
        fig.add_trace(
            go.Scatter(x=df_['upload_date'], y=df_[type], 
            name="{} for {}".format(legend,title)),
        )
        fig.update_layout(
            title_text=legend
            )
        # Set x-axis title
        fig.update_xaxes(title_text="Date")

    fig.show()


In [16]:
plot_all(list_df=list_df, list_titles=list_titles, type='num_videos')

In [17]:
plot_all(list_df=list_df, list_titles=list_titles, type='total_views')

In [26]:
df_num_videos=pd.DataFrame({
    'World cup': df_football.set_index('upload_date')['num_videos'],
    'Basketball': df_nba.set_index('upload_date')['num_videos'],
    'Olympics': df_olympics.set_index('upload_date')['num_videos'],
    'Tennis': df_tennis.set_index('upload_date')['num_videos'],
    'Baseball': df_baseball.set_index('upload_date')['num_videos'],
})
df_num_videos

Unnamed: 0_level_0,World cup,Basketball,Olympics,Tennis,Baseball
upload_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-02,,,1.0,,
2006-06,,,1.0,,1.0
2006-08,,,3.0,,
2006-09,1.0,,2.0,,
2006-10,2.0,,2.0,1.0,1.0
...,...,...,...,...,...
2019-06,5318.0,1483.0,6090.0,124.0,192.0
2019-07,4556.0,1035.0,6672.0,360.0,245.0
2019-08,2385.0,965.0,5895.0,235.0,81.0
2019-09,2692.0,1039.0,6175.0,447.0,67.0


In [28]:
df_num_views=pd.DataFrame({
    'World cup': df_football.set_index('upload_date')['total_views'],
    'Basketball': df_nba.set_index('upload_date')['total_views'],
    'Olympics': df_olympics.set_index('upload_date')['total_views'],
    'Tennis': df_tennis.set_index('upload_date')['total_views'],
    'Baseball': df_baseball.set_index('upload_date')['total_views'],
})
df_num_views

Unnamed: 0_level_0,World cup,Basketball,Olympics,Tennis,Baseball
upload_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-02,,,7121.0,,
2006-06,,,790.0,,4298.0
2006-08,,,52096.0,,
2006-09,959.0,,15745.0,,
2006-10,14977.0,,154200.0,15626.0,74905.0
...,...,...,...,...,...
2019-06,113568017.0,17500998.0,89227081.0,805954.0,2024594.0
2019-07,147168455.0,12452278.0,98583806.0,8875207.0,3377982.0
2019-08,36322692.0,12292334.0,122070829.0,3847260.0,2160571.0
2019-09,45770924.0,16208561.0,119634162.0,10584393.0,640680.0


In [31]:
def stacked_plot(df, type=''):

    fig = go.Figure()

    for column in df_num_videos.columns.to_list():
        fig.add_trace(
            go.Scatter(
                x = df.index,
                y = df[column],
                name = column
            )
        )
        
    fig.update_layout(
        updatemenus=[go.layout.Updatemenu(
            active=0,
            buttons=list(
                [dict(label = 'All',
                    method = 'update',
                    args = [{'visible': [True, True, True, True, True]},
                            {'title': 'All - '+type,
                            'showlegend':True}]),
                dict(label = 'World cup',
                    method = 'update',
                    args = [{'visible': [True, False, False, False, False]}, # the index of True aligns with the indices of plot traces
                            {'title': 'World cup - '+type,
                            'showlegend':True}]),
                dict(label = 'Basketball',
                    method = 'update',
                    args = [{'visible': [False, True, False, False, False]},
                            {'title': 'Basketball - '+type,
                            'showlegend':True}]),
                dict(label = 'Olympics',
                    method = 'update',
                    args = [{'visible': [False, False, True, False, False]},
                            {'title': 'Olympics - '+type,
                            'showlegend':True}]),
                dict(label = 'Tennis',
                    method = 'update',
                    args = [{'visible': [False, False, False, True, False]},
                            {'title': 'Tennis - '+type,
                            'showlegend':True}]),
                dict(label = 'Baseball',
                    method = 'update',
                    args = [{'visible': [False, False, False, False, True]},
                            {'title': 'Baseball - '+type,
                            'showlegend':True}]),
                ])
            )
        ])

    fig.show()

In [30]:
stacked_plot(df_num_videos, type='Number of videos')

In [32]:
stacked_plot(df_num_views, type='Total number of views')