In [85]:
import pandas as pd
from datetime import datetime

df = pd.read_pickle("data/videos.pkl")
df.iloc[31000]

title                        تلميع وتنظيف الجلد الطبيعي المدو SNN333
view_timeseries    [(3715, 2024-11-20 21:51:24.383555+00:00), (37...
thumbnail_url       https://i.ytimg.com/vi/rvwsf8MtL8k/mqdefault.jpg
video_id                                                 rvwsf8MtL8k
subscribers                                                   326000
channel_title                                          I Build Daily
category                                                          22
blacklisted                                                      NaN
duration                                                       182.0
Name: 31000, dtype: object

## Time Series Data
Each timeseries is an array consisting of tuples with a view and time sample. We add delta days for each time sample for easier analysis.

In [86]:
df["view_timeseries"][:5]

0    [(447, 2024-09-30 09:09:14.072230+00:00), (450...
1    [(547, 2024-09-30 09:09:15.012316+00:00), (776...
2    [(5724, 2024-09-30 09:09:18.678616+00:00), (85...
3    [(811, 2024-09-30 09:09:20.322871+00:00), (143...
4    [(326, 2024-09-30 09:09:20.442491+00:00), (370...
Name: view_timeseries, dtype: object

In [87]:
def to_delta_days(timeseries):
    new_timeseries = []
    for day in timeseries:
        delta_days = day[1].date() - timeseries[0][1].date()
        new_timeseries.append((day[0], delta_days.days))
    return new_timeseries

df["view_timeseries"] = df["view_timeseries"].apply(to_delta_days)

There are a few gaps in the time data, which we fill out by averaging the neighbours.

In [88]:
def average_neighbours(timeseries):
    for i, sample in enumerate(timeseries):
        if i + 1 >= len(timeseries):
            if sample[1] != i:
                return
            break
        if timeseries[i + 1][1] != timeseries[i][1] + 1:
            avg_neighbours = int((timeseries[i][0] + timeseries[i + 1][0]) / 2)
            timeseries.insert(i + 1, (avg_neighbours, i + 1))
    return timeseries

df["view_timeseries"] = df["view_timeseries"].apply(average_neighbours)

We only keep the first 30 days and discard videos with insufficient time samples. We also drop blacklisted videos and non-english videos.

In [89]:
drop_list = []
trim_length = 20

def trim_timeseries(timeseries):
    new_ts = [x for x in timeseries if x[1] <= trim_length - 1]
    return new_ts

df["view_timeseries"] = df["view_timeseries"].apply(trim_timeseries)

row_drops = 0

# drop rows with less than 30 timestamps
for i, timeseries in enumerate(df["view_timeseries"].to_list()):
    if len(timeseries) < trim_length:
        drop_list.append(i)
        row_drops += 1

#filter non-english videos
from lingua import LanguageDetectorBuilder, Language
import pandas as pd

detector = LanguageDetectorBuilder.from_all_languages().build()

lang_drops = 0 
for i, title in enumerate(df["title"].to_list()):
    print("Processing ", i)
    conf = detector.compute_language_confidence_values(title)
    if conf[0].language != Language.ENGLISH:
        drop_list.append(i)
        lang_drops += 1

print("lang drops ", lang_drops)
print("row drops ", row_drops)

#remove duplicates from drop list
#drop_list = list(dict.fromkeys(drop_list))

#drop rows
df_trimmed = df.drop(df.index[drop_list])
print(f"Length after initial drop {len(df_trimmed)}")
df_trimmed = df_trimmed[df_trimmed["blacklisted"].isnull()]
print(f"Length after blacklisted {len(df_trimmed)}")
df_trimmed = df_trimmed[df_trimmed["duration"] > 60]
print(f"Length after duration {len(df_trimmed)}")

print(f"Length before trim {len(df)}")
print(f"Length after trim {len(df_trimmed)}")

df_trimmed[:5]

Processing  0
Processing  1
Processing  2
Processing  3
Processing  4
Processing  5
Processing  6
Processing  7
Processing  8
Processing  9
Processing  10
Processing  11
Processing  12
Processing  13
Processing  14
Processing  15
Processing  16
Processing  17
Processing  18
Processing  19
Processing  20
Processing  21
Processing  22
Processing  23
Processing  24
Processing  25
Processing  26
Processing  27
Processing  28
Processing  29
Processing  30
Processing  31
Processing  32
Processing  33
Processing  34
Processing  35
Processing  36
Processing  37
Processing  38
Processing  39
Processing  40
Processing  41
Processing  42
Processing  43
Processing  44
Processing  45
Processing  46
Processing  47
Processing  48
Processing  49
Processing  50
Processing  51
Processing  52
Processing  53
Processing  54
Processing  55
Processing  56
Processing  57
Processing  58
Processing  59
Processing  60
Processing  61
Processing  62
Processing  63
Processing  64
Processing  65
Processing  66
Proce

Unnamed: 0,title,view_timeseries,thumbnail_url,video_id,subscribers,channel_title,category,blacklisted,duration
1,Mario Sports Superstars Football: Baby Mario &...,"[(547, 0), (776, 1), (828, 2), (853, 3), (879,...",https://i.ytimg.com/vi/35gfRMPwxkc/mqdefault.jpg,35gfRMPwxkc,92400,VMGAMING,20,,3118.0
2,WORST BREAKOUT EVER?! What to do when your ski...,"[(5724, 0), (8555, 1), (9631, 2), (10401, 3), ...",https://i.ytimg.com/vi/4fklg-BTY4g/mqdefault.jpg,4fklg-BTY4g,389000,Soo Beauty 수뷰티,26,,1395.0
4,Andrew Fox | IDF & War Crimes | Israel-Hamas W...,"[(326, 0), (370, 1), (372, 2), (379, 3), (386,...",https://i.ytimg.com/vi/lmC1JnlYDQw/mqdefault.jpg,lmC1JnlYDQw,48200,Shana Meyerson YOGAthletica,25,,4621.0
8,Ondo Finance vs Sabai Protocol | Sabai Token |...,"[(1849, 0), (2350, 1), (2482, 2), (2533, 3), (...",https://i.ytimg.com/vi/gBojwpJBE7c/mqdefault.jpg,gBojwpJBE7c,196000,Global Rashid,27,,516.0
41,Theo Ogimaa Favell Traditional Vs Woodland Spe...,"[(153, 0), (200, 1), (205, 2), (212, 3), (220,...",https://i.ytimg.com/vi/GFHqYCmL92w/mqdefault.jpg,GFHqYCmL92w,95400,PowWows.com,19,,1912.0


In [90]:
# validation test – make sure data is correctly formatted
err_count = 0
for timeseries in df_trimmed["view_timeseries"].to_list():
    if len(timeseries) != trim_length:
        print(f"Invalid length of timeseries: {len(timeseries)}")
        err_count += 1
    for i, sample in enumerate(timeseries):
        if i != sample[1]:
            print(f"Error at {i}.\nValue: {sample[1]}\nLength of timeseries: {len(timeseries)}")
            err_count += 1
print(f"Errors: {err_count}")

Errors: 0


Let's convert category integers to their corresponding string values.

In [91]:
# convert category to numbers
def convert_category_id(cat):
    cats = ['', 'Film & Animation', 'Autos & Vehicles', '', '', '', '', '', '', '', 'Music', '', '', '', '', 'Pets & Animals', '', 'Sports', 'Short Movies', 'Travel & Events', 'Gaming', 'Videoblogging', 'People & Blogs', 'Comedy', 'Entertainment', 'News & Politics', 'Howto & Style', 'Education', 'Science & Technology', 'Nonprofits & Activism', 'Movies', 'Anime/Animation', 'Action/Adventure', 'Classics', 'Comedy', 'Documentary', 'Drama', 'Family', 'Foreign', 'Horror', 'Sci-Fi/Fantasy', 'Thriller', 'Shorts', 'Shows', 'Trailers']
    return cats[int(cat)]

df_trimmed["category"] = df_trimmed["category"].apply(convert_category_id)

We'll also add a standardized view component to each time series.

In [92]:
import numpy as np

scaled_views = []

def standardize(timeseries):
    views = [x[0] for x in timeseries]
    days = [x[1] for x in timeseries]
    mean = np.sum(views) / len(views)
    sd = np.std(views)
    if sd == 0:
        sd = 1
    scaled_views = [(x - mean)/sd for x in views]
    return [(views[i], days[i], scaled_views[i]) for i, x in enumerate(timeseries)]

df_trimmed["view_timeseries"] = df_trimmed["view_timeseries"].apply(standardize)
df_trimmed["view_timeseries"]

1        [(547, 0, -3.3210900488693595), (776, 1, -1.41...
2        [(5724, 0, -2.8782804774410082), (8555, 1, -1....
4        [(326, 0, -0.732712712487337), (370, 1, -0.601...
8        [(1849, 0, -3.726489165399963), (2350, 1, -1.3...
41       [(153, 0, -3.4079659402564997), (200, 1, -1.34...
                               ...                        
58040    [(340, 0, -3.297029702970298), (351, 1, -2.207...
58041    [(1586, 0, -2.07707151142496), (1628, 1, -2.06...
58042    [(164428, 0, -2.6447223135664637), (234821, 1,...
58045    [(2520, 0, -1.945072273686186), (3244, 1, -1.8...
58047    [(7138, 0, -2.566822822710075), (8281, 1, -2.0...
Name: view_timeseries, Length: 28018, dtype: object

We convert time series data into a numpy array with 2 dimensions: Video and day. Finally, we save it to a .npy file.

In [93]:

ts_views_trimmed = []
ts_views_trimmed_std = []
ts_data = df_trimmed["view_timeseries"].to_list()

#drop delta day index
for timeseries in ts_data:
    new_ts = []
    new_ts_std = []
    for sample in timeseries:
        new_ts.append(sample[0])
        new_ts_std.append(sample[2])
    ts_views_trimmed.append(new_ts)
    ts_views_trimmed_std.append(new_ts_std)

ts_data = np.array(ts_views_trimmed)
ts_data_std = np.array(ts_views_trimmed_std)
print(ts_data.shape)

np.save("data/timeseries.npy", ts_data)
np.save("data/timeseries_std.npy", ts_data_std)

(28018, 20)


Now, we unpack the time series samples and place them in wide format, so we later can analyse each time stamp seperately.

In [95]:
day_list = [[] for x in range(trim_length)] #will contain a list for each day

def assemble_day_list(timeseries):
    for i, day in enumerate(timeseries):
        day_list[i].append((day[0], day[2]))

df_trimmed["view_timeseries"].apply(assemble_day_list)

df_trimmed_wide = pd.DataFrame(df_trimmed)
df_trimmed_wide_std = pd.DataFrame(df_trimmed)

for i, day in enumerate(day_list):
    print([x[0] for x in day])
    df_trimmed_wide[f"day{i}"] = [x[0] for x in day]
    df_trimmed_wide_std[f"day{i}"] = [x[1] for x in day]
    

[547, 5724, 326, 1849, 153, 2621, 98, 93, 3420, 5834, 1530, 5695, 153, 620, 4802, 31000, 3963, 467, 465, 6840, 38, 6646, 449, 413, 175, 2621, 1, 1173, 765, 5852, 6232, 26, 74531, 121, 3474, 189, 1133, 139906, 68, 122, 130, 108, 209, 44409, 363, 21864, 129, 194, 59601, 92, 31, 82, 219, 4548, 1751, 95052, 1139, 2525, 40125, 169, 355, 745, 1655, 1280, 1501, 1185, 506, 631, 1034, 845, 788, 459, 1177, 934, 407, 589, 1205, 913, 1913, 23893, 28785, 13379, 1718, 78, 36388, 54, 1921, 50465, 58, 140, 22270, 36148, 20436, 8861, 1984, 2274, 87286, 1572, 1050, 943, 6006, 10680, 12158, 9635, 4477, 5709, 2845, 51230, 8323, 365, 66, 306, 1443, 1321, 605, 6386, 57010, 96, 1342, 122119, 6560, 3040, 276, 1962, 2722, 1745, 1161, 1931, 1150, 1743, 3307, 1313, 3868, 1591, 1308, 3779, 3059, 4095, 1895, 1873, 4009, 1518, 3159, 1204, 659, 1542, 683, 374, 662, 660, 2206, 603, 1439, 1513, 536, 1217, 513, 2240, 90240, 36575, 13097, 206760, 2537, 2511, 4805, 232, 340, 1497, 771, 1517, 917, 176966, 257, 6809, 4982,

We'll also make a long version.

In [96]:
df_trimmed_long_std = pd.melt(df_trimmed_wide_std.drop("view_timeseries", axis="columns"), 
                  id_vars=['title', 'thumbnail_url', 'video_id', 'subscribers', 'channel_title', 'category', 'blacklisted', 'duration'], 
                  var_name='day', 
                  value_name='views')

# convert days to int
def day_to_int(str):
    return int(str[3:])

df_trimmed_long_std["day"] = df_trimmed_long_std["day"].apply(day_to_int)

df_trimmed_long_std[:5]

Unnamed: 0,title,thumbnail_url,video_id,subscribers,channel_title,category,blacklisted,duration,day,views
0,Mario Sports Superstars Football: Baby Mario &...,https://i.ytimg.com/vi/35gfRMPwxkc/mqdefault.jpg,35gfRMPwxkc,92400,VMGAMING,Gaming,,3118.0,0,-3.32109
1,WORST BREAKOUT EVER?! What to do when your ski...,https://i.ytimg.com/vi/4fklg-BTY4g/mqdefault.jpg,4fklg-BTY4g,389000,Soo Beauty 수뷰티,Howto & Style,,1395.0,0,-2.87828
2,Andrew Fox | IDF & War Crimes | Israel-Hamas W...,https://i.ytimg.com/vi/lmC1JnlYDQw/mqdefault.jpg,lmC1JnlYDQw,48200,Shana Meyerson YOGAthletica,News & Politics,,4621.0,0,-0.732713
3,Ondo Finance vs Sabai Protocol | Sabai Token |...,https://i.ytimg.com/vi/gBojwpJBE7c/mqdefault.jpg,gBojwpJBE7c,196000,Global Rashid,Education,,516.0,0,-3.726489
4,Theo Ogimaa Favell Traditional Vs Woodland Spe...,https://i.ytimg.com/vi/GFHqYCmL92w/mqdefault.jpg,GFHqYCmL92w,95400,PowWows.com,Travel & Events,,1912.0,0,-3.407966


Finally, we save the processed dataframe to both csv and pickle.

In [97]:
df_trimmed_wide.to_csv("data/videos_processed_wide.csv")
df_trimmed_wide.to_pickle("data/videos_processed_wide.pkl")

df_trimmed_long_std.to_pickle("data/video_processed_long.pkl")
df_trimmed_long_std.to_csv("data/video_processed_long.csv")