In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import axes
import matplotlib.dates as mdates
from matplotlib import dates
from matplotlib import ticker
from scipy.optimize import curve_fit
from scipy import stats
import seaborn as sns
from ThymeBoost import ThymeBoost as tb
import random
from datetime import datetime, timedelta
import math
from tqdm import tqdm
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [4]:
DIR = "data/"
PATH_CHANNELS_DST = DIR + "df_channels_en.tsv"
PATH_TIME_SERIES_DST = DIR + "df_timeseries_en.tsv"
PATH_COMMENTS_AUTHOR_DST = DIR + "num_comments_authors.tsv.gz"
PATH_COMMENTS_DST = DIR + "num_comments.tsv.gz"
PATH_METADATA_DST = DIR + "yt_metadata_en.jsonl.gz"
PATH_METADATA_HELPER = DIR + "yt_metadata_helper.feather"

In [5]:
timeseries = pd.read_csv(PATH_TIME_SERIES_DST, sep = '\t')
timeseries['datetime']= pd.to_datetime(timeseries['datetime'])
timeseries['delta_views'] = timeseries['delta_views'].round(decimals = 3)
timeseries['subs'] = timeseries['subs'].round(decimals = 3)
timeseries['delta_subs'] = timeseries['delta_subs'].round(decimals = 3)
timeseries.head()

Unnamed: 0,channel,category,datetime,views,delta_views,subs,delta_subs,videos,delta_videos,activity
0,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-03,202494.6,0.0,650.222,0.0,5,0,3
1,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-10,394085.7,191591.111,1046.0,395.778,6,1,1
2,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-17,835393.8,441308.083,1501.5,455.5,6,0,1
3,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-24,1104577.0,269183.25,1750.0,248.5,6,0,0
4,UCBJuEqXfXTdcPSbGO9qqn1g,Film and Animation,2017-07-31,1284406.0,179828.6,2008.3,258.3,6,0,0


In [6]:
timeseries_filtered = timeseries[timeseries.datetime > pd.to_datetime("2017-1-1")]
print("We know have {} datapoints".format(len(timeseries_filtered)))

We know have 16732615 datapoints


In [7]:
channel_timeseries = pd.pivot_table(timeseries_filtered, values='delta_views', 
                                index=['datetime'], 
                                columns=['channel'])
print("Without uniformization, we have {} rows.".format(len(channel_timeseries)))
channel_timeseries.head()

Without uniformization, we have 288 rows.


channel,UC--24Q3_ZQeFmgJE-Um5QZQ,UC--2EbuIuBOGgGqPiJxi3rg,UC--3c8RqSfAqYBdDjIG3UNA,UC--3ozenTzry39-xMqAPQlw,UC--6E6EpybocdaNlPhvK_9A,UC--70ql_IxJmhmqXqrkJrWQ,UC--7oyGW0N7fMf164-ZXljQ,UC--94OGFAzYDOJb0eXgMjew,UC--BMyA2X4a9PGAo3lTuopg,UC--EwQJeJ6SKlJdswehQH4g,...,UCzzmGsmy8cBxmqljbVG_e7Q,UCzzoJY_ln_StRVdrRX1_ftg,UCzzqd0yx7h1PuC3KJVqeJgw,UCzzsKg4jPGBL05t2w3HfHBA,UCzzt-UNlRHbEFY3rEpFpuXw,UCzztaOtjmaZoiLvWLagq7bg,UCzzyt9Z70MBX9RQAFVSNlpQ,UCzzzPMh9fZHxc7gxPRB2LHQ,UCzzzUN8yvD2LRAnY-lhzyLQ,UCzzzZ3-icktxbC3j7hkWqRw
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01 23:00:00,2731.406,27667.057,,78790.542,45266.219,58624.651,,,230144.542,3141.141,...,1723.201,4365.896,,17874.796,14840.513,,,11938.087,8823.58,57680.059
2017-01-02 00:00:00,,,,,,,,,,,...,,,389.792,,,,,,,
2017-01-08 23:00:00,4190.594,32073.417,,83433.521,58383.031,261201.391,,,211393.042,3132.073,...,2196.625,2537.089,,21798.781,15995.781,,,16928.234,11129.797,68525.88
2017-01-09 00:00:00,,,,,,,,,,,...,,,413.875,,,,,,,
2017-01-15 23:00:00,8811.448,60385.818,,176154.271,105656.375,436874.953,,,410550.208,5420.714,...,4063.5,11802.484,,44051.906,24833.911,,,23935.083,17847.375,141159.036


In [8]:
# Project the timestamp to the correct day
uniform_datetime = timeseries_filtered.copy(deep=True)
uniform_datetime.datetime = uniform_datetime.datetime - timedelta(hours = 2)
# Drop the time to only keep the day
uniform_datetime.datetime = pd.to_datetime(uniform_datetime.datetime).dt.date
uniform_channel_timeseries = pd.pivot_table(uniform_datetime, values='delta_views', 
                                index=['datetime'], 
                                columns=['channel'])
print("With uniformization, we have {} rows.".format(len(channel_timeseries)))
uniform_channel_timeseries.head()

With uniformization, we have 288 rows.


channel,UC--24Q3_ZQeFmgJE-Um5QZQ,UC--2EbuIuBOGgGqPiJxi3rg,UC--3c8RqSfAqYBdDjIG3UNA,UC--3ozenTzry39-xMqAPQlw,UC--6E6EpybocdaNlPhvK_9A,UC--70ql_IxJmhmqXqrkJrWQ,UC--7oyGW0N7fMf164-ZXljQ,UC--94OGFAzYDOJb0eXgMjew,UC--BMyA2X4a9PGAo3lTuopg,UC--EwQJeJ6SKlJdswehQH4g,...,UCzzmGsmy8cBxmqljbVG_e7Q,UCzzoJY_ln_StRVdrRX1_ftg,UCzzqd0yx7h1PuC3KJVqeJgw,UCzzsKg4jPGBL05t2w3HfHBA,UCzzt-UNlRHbEFY3rEpFpuXw,UCzztaOtjmaZoiLvWLagq7bg,UCzzyt9Z70MBX9RQAFVSNlpQ,UCzzzPMh9fZHxc7gxPRB2LHQ,UCzzzUN8yvD2LRAnY-lhzyLQ,UCzzzZ3-icktxbC3j7hkWqRw
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,2731.406,27667.057,,78790.542,45266.219,58624.651,,,230144.542,3141.141,...,1723.201,4365.896,389.792,17874.796,14840.513,,,11938.087,8823.58,57680.059
2017-01-08,4190.594,32073.417,,83433.521,58383.031,261201.391,,,211393.042,3132.073,...,2196.625,2537.089,413.875,21798.781,15995.781,,,16928.234,11129.797,68525.88
2017-01-15,8811.448,60385.818,,176154.271,105656.375,436874.953,,,410550.208,5420.714,...,4063.5,11802.484,1117.375,44051.906,24833.911,,,23935.083,17847.375,141159.036
2017-01-22,4207.583,31155.448,,170651.932,118148.547,237834.172,,,351185.609,4501.188,...,1467.172,18982.25,0.0,40054.531,20701.083,,,28294.255,8019.599,89166.245
2017-01-29,0.0,0.0,,0.0,0.0,0.0,,,0.0,0.0,...,0.0,0.0,97.125,0.0,0.0,,,0.0,1184.635,0.0


In [13]:
channel_timeseries = uniform_channel_timeseries.dropna(thresh=len(uniform_channel_timeseries) - 1, axis=1)
channel_timeseries

channel,UC--24Q3_ZQeFmgJE-Um5QZQ,UC--2EbuIuBOGgGqPiJxi3rg,UC--3ozenTzry39-xMqAPQlw,UC--6E6EpybocdaNlPhvK_9A,UC--70ql_IxJmhmqXqrkJrWQ,UC--BMyA2X4a9PGAo3lTuopg,UC--EwQJeJ6SKlJdswehQH4g,UC--LAVm36WiCmqQbekV17qQ,UC--MxpGXJ3LVD8KvlNzRlcA,UC--SaadDxRXz729rtbQqyBA,...,UCzzfu83LhPMMuhtDVA75rfw,UCzzgEOi0xR9SuflkXmTQv6g,UCzzmGsmy8cBxmqljbVG_e7Q,UCzzoJY_ln_StRVdrRX1_ftg,UCzzqd0yx7h1PuC3KJVqeJgw,UCzzsKg4jPGBL05t2w3HfHBA,UCzzt-UNlRHbEFY3rEpFpuXw,UCzzzPMh9fZHxc7gxPRB2LHQ,UCzzzUN8yvD2LRAnY-lhzyLQ,UCzzzZ3-icktxbC3j7hkWqRw
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,2731.406,27667.057,78790.542,45266.219,58624.651,230144.542,3141.141,405090.474,152664.714,218213.260,...,107774.234,50608.250,1723.201,4365.896,389.792,17874.796,14840.513,11938.087,8823.580,57680.059
2017-01-08,4190.594,32073.417,83433.521,58383.031,261201.391,211393.042,3132.073,513466.229,128403.047,166920.260,...,165696.859,66100.979,2196.625,2537.089,413.875,21798.781,15995.781,16928.234,11129.797,68525.880
2017-01-15,8811.448,60385.818,176154.271,105656.375,436874.953,410550.208,5420.714,804861.375,267081.979,180527.938,...,303092.307,269860.010,4063.500,11802.484,1117.375,44051.906,24833.911,23935.083,17847.375,141159.036
2017-01-22,4207.583,31155.448,170651.932,118148.547,237834.172,351185.609,4501.188,517145.125,268256.380,173103.469,...,50091.531,135643.083,1467.172,18982.250,0.000,40054.531,20701.083,28294.255,8019.599,89166.245
2017-01-29,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,108857.875,0.000,0.000,97.125,0.000,0.000,0.000,1184.635,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-09-01,12050.294,16285.500,15080.625,37692.100,114632.875,237070.750,142106.750,896308.000,741449.000,6857.195,...,290734.818,14579.444,41196.345,142957.769,7220.889,9817.895,7240.125,121074.100,1959046.500,97305.600
2019-09-08,11990.252,15189.125,14170.625,36827.850,104873.125,227507.750,174397.875,721268.625,734633.375,6700.474,...,161183.750,12273.333,30090.455,128864.538,7211.586,8243.053,6965.250,37997.750,1927067.333,77068.255
2019-09-15,11630.000,20895.125,14706.125,37179.917,116137.875,228404.750,14665.875,754182.375,723529.625,6700.474,...,305132.750,10735.667,52301.900,118294.615,5658.000,8243.053,7610.750,73245.850,1939274.833,71936.345
2019-09-22,11797.286,22312.375,15116.625,29436.722,130651.875,228743.375,9969.750,1548685.500,650013.875,8029.700,...,259474.250,17552.333,44120.600,122725.962,5663.429,54022.500,8852.000,161366.100,1944157.833,67538.800


In [74]:
num_start_timestamp = 30
X = channel_timeseries[:num_start_timestamp]
y = channel_timeseries[num_start_timestamp:]

In [75]:
test_size = int(0.2 * len(X.columns))
X_train = X.iloc[:,test_size:]
X_test = X.iloc[:, :test_size]
y_train = y.iloc[:, test_size:]
y_test = y.iloc[:, :test_size]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [76]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [77]:
xg_reg.fit(X_train, y_train)

XGBoostError: [17:46:16] /Users/runner/miniforge3/conda-bld/xgboost-split_1667849653518/work/src/data/data.cc:455: Check failed: this->labels.Size() % this->num_row_ == 0 (24 vs. 0) : Incorrect size for labels.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000143c35fc4 dmlc::LogMessageFatal::~LogMessageFatal() + 116
  [bt] (1) 2   libxgboost.dylib                    0x0000000143cd33dd xgboost::MetaInfo::SetInfoFromHost(xgboost::GenericParameter const&, xgboost::StringView, xgboost::Json) + 4301
  [bt] (2) 3   libxgboost.dylib                    0x0000000143cd21df xgboost::MetaInfo::SetInfo(xgboost::GenericParameter const&, xgboost::StringView, xgboost::StringView) + 159
  [bt] (3) 4   libxgboost.dylib                    0x0000000143c4d2c9 XGDMatrixSetInfoFromInterface + 249
  [bt] (4) 5   libffi.7.dylib                      0x000000010d803ead ffi_call_unix64 + 85
  [bt] (5) 6   ???                                 0x000000030a3a0040 0x0 + 13056475200

