In [93]:
import pandas as pd
import numpy as np
import os

# copied from dataloader because of path issues in notebook
def load_raw_data(filename, n_first_rows, data_dir='./../../data'):
    """
    Load the first rows of a tab-separated data file into a DataFrame.

    The file is read with pandas (compression inferred from the file
    extension) and its 'datetime' column is converted to datetime values.

    Parameters:
    filename (str): the name of the file to load, relative to data_dir
    n_first_rows (int): the number of rows to load
    data_dir (str): the directory containing the file; the default is the
        location the notebook originally hard-coded
    Return:
    df (pd.DataFrame): the first n_first_rows of the file, with 'datetime'
        parsed
    Raises:
    KeyError: if the file has no 'datetime' column
    """

    # Build the path from its parts so the data location can be overridden
    # without editing the function body.
    file_path = os.path.join(data_dir, filename)

    df_timeseries = pd.read_csv(file_path, sep='\t', compression='infer', nrows=n_first_rows)
    df_timeseries['datetime'] = pd.to_datetime(df_timeseries['datetime'])

    return df_timeseries

In [94]:
# Load the first 1000 rows of the time series data.
# load_raw_data already converts the 'datetime' column to datetime values,
# so no second conversion is needed here.
df_timeseries = load_raw_data('df_timeseries_en.tsv', 1000)

In [95]:
# Compute each row's week index relative to the first date in the dataset

# Earliest timestamp in the dataset; week 0 starts here
first_date = df_timeseries['datetime'].min()
print(first_date)

# Whole days elapsed since first_date, floor-divided into 7-day weeks.
# Vectorised timedelta arithmetic replaces the per-row Python lambda and
# yields the same integer week numbers.
df_timeseries['week_index'] = (df_timeseries['datetime'] - first_date).dt.days // 7

2016-03-21 00:00:00


In [96]:
# Order the rows chronologically and renumber the index from 0, then show
# the first 50 timestamps next to their week index.
df_timeseries = (
    df_timeseries
    .sort_values(by='datetime')
    .reset_index(drop=True)
)
print(df_timeseries[['datetime', 'week_index']].head(50))

              datetime  week_index
0  2016-03-21 00:00:00           0
1  2016-03-28 01:00:00           1
2  2016-04-04 01:00:00           2
3  2016-04-11 01:00:00           3
4  2016-04-18 01:00:00           4
5  2016-04-25 01:00:00           5
6  2016-05-02 01:00:00           6
7  2016-05-09 01:00:00           7
8  2016-05-16 01:00:00           8
9  2016-05-23 01:00:00           9
10 2016-05-30 01:00:00          10
11 2016-06-06 00:00:00          11
12 2016-06-06 01:00:00          11
13 2016-06-13 00:00:00          12
14 2016-06-13 01:00:00          12
15 2016-06-20 00:00:00          13
16 2016-06-20 01:00:00          13
17 2016-06-27 00:00:00          14
18 2016-06-27 01:00:00          14
19 2016-07-04 00:00:00          15
20 2016-07-04 01:00:00          15
21 2016-07-11 00:00:00          16
22 2016-07-11 01:00:00          16
23 2016-07-18 00:00:00          17
24 2016-07-18 01:00:00          17
25 2016-07-25 00:00:00          18
26 2016-07-25 01:00:00          18
27 2016-08-01 00:00:

In [88]:
# groupby() returns a new GroupBy object and leaves df_timeseries untouched,
# so keep a reference to the result instead of discarding it.
timeseries_by_week = df_timeseries.groupby('week_index')
print(df_timeseries[['datetime', 'week_index']].head(50))


              datetime  week_index
0  2016-03-21 00:00:00           0
1  2016-03-28 01:00:00           1
2  2016-04-04 01:00:00           2
3  2016-04-11 01:00:00           3
4  2016-04-18 01:00:00           4
5  2016-04-25 01:00:00           5
6  2016-05-02 01:00:00           6
7  2016-05-09 01:00:00           7
8  2016-05-16 01:00:00           8
9  2016-05-23 01:00:00           9
10 2016-05-30 01:00:00          10
11 2016-06-06 00:00:00          11
12 2016-06-06 01:00:00          11
13 2016-06-13 00:00:00          12
14 2016-06-13 01:00:00          12
15 2016-06-20 00:00:00          13
16 2016-06-20 01:00:00          13
17 2016-06-27 00:00:00          14
18 2016-06-27 01:00:00          14
19 2016-07-04 00:00:00          15
20 2016-07-04 01:00:00          15
21 2016-07-11 00:00:00          16
22 2016-07-11 01:00:00          16
23 2016-07-18 00:00:00          17
24 2016-07-18 01:00:00          17
25 2016-07-25 00:00:00          18
26 2016-07-25 01:00:00          18
27 2016-08-01 00:00:

In [100]:
# Select every row whose week index occurs more than once
# (keep=False flags all members of a repeated group, not just the later ones)
shares_week_index = df_timeseries['week_index'].duplicated(keep=False)
duplicate_weeks = df_timeseries.loc[shares_week_index]

# Show only the 'datetime' and 'week_index' columns of those rows
print(duplicate_weeks[['datetime', 'week_index']])

# TODO: Remove the duplicate rows from the dataset ? keep only 00:00:00 rows ?

               datetime  week_index
11  2016-06-06 00:00:00          11
12  2016-06-06 01:00:00          11
13  2016-06-13 00:00:00          12
14  2016-06-13 01:00:00          12
15  2016-06-20 00:00:00          13
..                  ...         ...
995 2019-09-23 00:00:00         183
996 2019-09-23 00:00:00         183
997 2019-09-23 00:00:00         183
998 2019-09-23 01:00:00         183
999 2019-09-23 01:00:00         183

[989 rows x 2 columns]
