In [1]:
import pandas as pd
import numpy as np
import os

# copied from dataloader because of path issues in notebook
def load_raw_data(filename, n_first_rows, data_dir='./../../data'):
    """
    Load the first rows of a tab-separated data file into a DataFrame.

    The 'datetime' column is converted with pd.to_datetime before the frame
    is returned, so callers do not need to convert it again.

    Parameters:
    filename (str): the name of the file to load, located inside data_dir
    n_first_rows (int): the number of rows to load (passed to read_csv as nrows)
    data_dir (str): directory that contains the file; defaults to
        './../../data', which is resolved against the current working directory
    Return:
    df (pd.DataFrame): the first n_first_rows of the file
    """
    # compression='infer' lets pandas choose the codec from the file extension
    df_timeseries = pd.read_csv(
        f'{data_dir}/{filename}',
        sep='\t',
        compression='infer',
        nrows=n_first_rows,
    )
    df_timeseries['datetime'] = pd.to_datetime(df_timeseries['datetime'])

    return df_timeseries

In [2]:
# Load the time series data. load_raw_data already converts the 'datetime'
# column with pd.to_datetime, so no second conversion pass is done here.
N_FIRST_ROWS = 18_872_500  # number of rows to read from the TSV file
df_timeseries = load_raw_data('df_timeseries_en.tsv', N_FIRST_ROWS)

In [3]:
# Compute the week index from the first date in the dataset

# Get the first date in the dataset
first_date = df_timeseries['datetime'].min()
print(first_date)

# Get last date in the dataset
last_date = df_timeseries['datetime'].max()
print(last_date)

# Compute the week index: whole days elapsed since first_date, floor-divided
# by 7. Done as one vectorized operation through the .dt accessor rather than
# calling a Python lambda once per row.
df_timeseries['week_index'] = (df_timeseries['datetime'] - first_date).dt.days // 7

2015-01-05 00:00:00
2019-09-30 01:00:00


In [4]:
# Order the rows chronologically and renumber the index starting from 0,
# then show the first 50 rows.
df_timeseries = df_timeseries.sort_values(by='datetime').reset_index(drop=True)
print(df_timeseries.head(50))

                     channel            category   datetime         views  \
0   UC1Ru5ZqoiF1cFwQSuFuv-KA              Gaming 2015-01-05  3.111235e+06   
1   UCnYwVM9uyLGzTFfAkWqiUdQ              Gaming 2015-01-05  2.580014e+07   
2   UCqzju-_WMKsgNx8R3QwupQQ       Howto & Style 2015-01-12  3.307442e+08   
3   UC-IhaHBONjT0JjdOKqC21zQ  Film and Animation 2015-01-12  1.355811e+06   
4   UCe8GMzGI_8fBfQYgOe3tBKA       Entertainment 2015-01-12  7.106681e+07   
5   UCnYwVM9uyLGzTFfAkWqiUdQ              Gaming 2015-01-12  2.615005e+07   
6   UC1Ru5ZqoiF1cFwQSuFuv-KA              Gaming 2015-01-12  3.135948e+06   
7   UCjKOy4vngYbZMSXJbMmJoUQ              Gaming 2015-01-12  4.749135e+07   
8   UCe8GMzGI_8fBfQYgOe3tBKA       Entertainment 2015-01-19  7.202908e+07   
9   UCzqQPiNga359uISQxuKO1Fg       Howto & Style 2015-01-19  2.141333e+02   
10  UC-IhaHBONjT0JjdOKqC21zQ  Film and Animation 2015-01-19  1.363066e+06   
11  UC1Ru5ZqoiF1cFwQSuFuv-KA              Gaming 2015-01-19  3.139996e+06   

In [5]:
# Show the datetime -> week_index mapping for the first 50 rows.
# (The previous bare groupby('week_index') call discarded its result and had
# no effect on the frame, so it was dropped.)
print(df_timeseries[['datetime', 'week_index']].head(50))


     datetime  week_index
0  2015-01-05           0
1  2015-01-05           0
2  2015-01-12           1
3  2015-01-12           1
4  2015-01-12           1
5  2015-01-12           1
6  2015-01-12           1
7  2015-01-12           1
8  2015-01-19           2
9  2015-01-19           2
10 2015-01-19           2
11 2015-01-19           2
12 2015-01-19           2
13 2015-01-19           2
14 2015-01-19           2
15 2015-01-26           3
16 2015-01-26           3
17 2015-01-26           3
18 2015-01-26           3
19 2015-01-26           3
20 2015-01-26           3
21 2015-01-26           3
22 2015-02-02           4
23 2015-02-02           4
24 2015-02-02           4
25 2015-02-02           4
26 2015-02-02           4
27 2015-02-02           4
28 2015-02-02           4
29 2015-02-09           5
30 2015-02-09           5
31 2015-02-09           5
32 2015-02-09           5
33 2015-02-09           5
34 2015-02-09           5
35 2015-02-09           5
36 2015-02-16           6
37 2015-02-1

In [6]:
# Select every row whose week_index value occurs more than once in the frame
# (keep=False marks all occurrences, not only the repeats).
# NOTE(review): because subset is ['week_index'] alone, rows belonging to
# different channels that share a week are flagged too - confirm whether
# ['channel', 'week_index'] was the intended subset.
repeated_week_mask = df_timeseries.duplicated(subset=['week_index'], keep=False)
duplicate_weeks = df_timeseries[repeated_week_mask]

# Print the last 50 flagged rows, restricted to the
# 'channel', 'datetime' and 'week_index' columns
columns_to_show = ['channel', 'datetime', 'week_index']
print(duplicate_weeks[columns_to_show].tail(50))

# TODO: Remove the duplicate rows from the dataset ? keep only 00:00:00 rows ?

                           channel            datetime  week_index
18872449  UC5J6SlEEnj28Zf1QbLNJ1ow 2019-09-30 01:00:00         247
18872450  UCQGswVycjt4v4zrvHyK6TCQ 2019-09-30 01:00:00         247
18872451  UCPnWeY_BurW8m3Q0q7v_g9Q 2019-09-30 01:00:00         247
18872452  UC2UJvNq5CbL2eEAReEDR_Bw 2019-09-30 01:00:00         247
18872453  UCBh2MtapL0Qbcby_v_hyEXA 2019-09-30 01:00:00         247
18872454  UC3plrhJApuJ_uapz_AMf7tw 2019-09-30 01:00:00         247
18872455  UCMUjCxhdYzl_yGhJp8yq4nw 2019-09-30 01:00:00         247
18872456  UCPKlmMK0Ybl8eSlctwKER1Q 2019-09-30 01:00:00         247
18872457  UCtlWNQjJr5vRGPiJZJBa4lw 2019-09-30 01:00:00         247
18872458  UCMxLVx3E2jU7RjjrUsiD93w 2019-09-30 01:00:00         247
18872459  UC3qiN2e6haEIuCTurf1xDDg 2019-09-30 01:00:00         247
18872460  UC0X9PT1B66UDpKsmGz83_Pw 2019-09-30 01:00:00         247
18872461  UCBPoTSAoYaRlgEXQZcc6Fiw 2019-09-30 01:00:00         247
18872462  UCr2SvXhFky8VSeykkEUa18g 2019-09-30 01:00:00        

In [9]:
# Group the rows by channel, then print all rows of one specific channel
df_timeseries_grouped_by_channel = df_timeseries.groupby('channel')
channel_rows = df_timeseries_grouped_by_channel.get_group('UCXhkGgooXHDNwgJXmoTSN7g')
print(channel_rows)

                           channel category            datetime         views  \
23545     UCXhkGgooXHDNwgJXmoTSN7g    Music 2016-03-21 00:00:00  1.407062e+07   
36754     UCXhkGgooXHDNwgJXmoTSN7g    Music 2016-03-28 01:00:00  1.412790e+07   
42796     UCXhkGgooXHDNwgJXmoTSN7g    Music 2016-04-04 01:00:00  1.423378e+07   
54336     UCXhkGgooXHDNwgJXmoTSN7g    Music 2016-04-11 01:00:00  1.430530e+07   
75198     UCXhkGgooXHDNwgJXmoTSN7g    Music 2016-04-18 01:00:00  1.434464e+07   
...                            ...      ...                 ...           ...   
18248633  UCXhkGgooXHDNwgJXmoTSN7g    Music 2019-08-26 01:00:00  2.695777e+07   
18357755  UCXhkGgooXHDNwgJXmoTSN7g    Music 2019-09-02 01:00:00  2.700862e+07   
18515497  UCXhkGgooXHDNwgJXmoTSN7g    Music 2019-09-09 01:00:00  2.706119e+07   
18642950  UCXhkGgooXHDNwgJXmoTSN7g    Music 2019-09-16 01:00:00  2.711444e+07   
18755130  UCXhkGgooXHDNwgJXmoTSN7g    Music 2019-09-23 01:00:00  2.716740e+07   

            delta_views    

In [8]:
# Check whether the given channel id occurs anywhere in the 'channel' column
channel_id = 'UCWwWOFsW68TqXE-HZLC3WIA'
exists = bool((df_timeseries['channel'] == channel_id).any())
print(exists)

True
