In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the gzip-compressed, tab-separated time series and parse `datetime` as timestamps.
timeseries_df = pd.read_csv(
    "data/df_timeseries_en.tsv.gz",
    sep='\t',
    compression="gzip",
    parse_dates=["datetime"],
)
timeseries_df.head(2)

In [None]:
# Load the gzip-compressed, tab-separated channel table.
channel_df = pd.read_csv(
    "data/df_channels_en.tsv.gz",
    sep='\t',
    compression="gzip",
)
channel_df.head(2)

### Compute cumulative views and recompute view deltas per channel

In [None]:
# Group the time series by channel once; this grouped object is reused by later cells.
ts_grouped_by_channel = timeseries_df.groupby('channel')
views_by_channel = ts_grouped_by_channel["views"]

# Running total of `views` within each channel.
timeseries_df["cumviews"] = views_by_channel.cumsum()

# Row-to-row change of `views` within each channel. The deltas are recomputed here
# because negative delta views are not included in the original dataset.
# The first row of every channel has no predecessor, so its delta is set to 0.
views_delta = views_by_channel.diff()
timeseries_df["delta_views"] = views_delta.fillna(0)

## Visualization of representative channels

#### Overnight success followed by downfall

In [None]:
current_channel = 'UCj-R_ePoJvWGiLOD6aDgMSg'

# Three stacked panels (one per metric) sharing the same x axis.
fig, axs = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
axs_flat = axs.ravel()

# Select the rows of the chosen channel once and reuse them for every panel.
channel_ts = timeseries_df[timeseries_df["channel"] == current_channel]
for ax, column in zip(axs_flat, ["cumviews", "views", "delta_views"]):
    channel_ts.plot(x="datetime", y=column, ax=ax)
    ax.legend(loc='upper left')
plt.tight_layout()

**TODO:** Explain why this channel is a good example of this pattern, what distinguishes it from the other examples, and which features we are looking for.

#### Overnight success followed by stability

In [None]:
current_channel = 'UCa4hfBXGDC_TxUHTEbCdyng'

# Three stacked panels (one per metric) sharing the same x axis.
fig, axs = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
axs_flat = axs.ravel()

# Select the rows of the chosen channel once and reuse them for every panel.
channel_ts = timeseries_df[timeseries_df["channel"] == current_channel]
for ax, column in zip(axs_flat, ["cumviews", "views", "delta_views"]):
    channel_ts.plot(x="datetime", y=column, ax=ax)
    ax.legend(loc='upper left')
plt.tight_layout()

**TODO:** Explain why this channel is a good example of this pattern, what distinguishes it from the other examples, and which features we are looking for.

# Population division

### Step 1: Calculate meaningful metrics

##### Metric 1: Views variability

In [None]:
# Per-channel extremes of `views` (both Series are indexed by channel id).
max_views = ts_grouped_by_channel['views'].max()
min_views = ts_grouped_by_channel['views'].min()

# Relative spread of views: (max - min) / max.
# A channel whose maximum is 0 yields NaN here (0 / 0).
variability = (max_views - min_views) / max_views

# NOTE: the metric is intentionally NOT assigned to `channel_df` here.
# `variability` is indexed by channel id while `channel_df` has a positional index,
# so a direct column assignment would align on the index and produce an all-NaN
# column; that column would also clash with the later merge on "channel".
variability.head(2)

##### Metric 2: Channel growth

In [None]:
# First and last `views` value of every channel, in the row order of `timeseries_df`.
# NOTE(review): this assumes the rows of each channel are in chronological order - confirm.
begin = ts_grouped_by_channel['views'].first()
end = ts_grouped_by_channel['views'].last()

# Relative change between the first and the last observation.
# A channel whose first value is 0 yields +/-inf (or NaN when the last value is 0 too).
growth = (end - begin) / begin

# NOTE: the metric is intentionally NOT assigned to `channel_df` here.
# `growth` is indexed by channel id while `channel_df` has a positional index, so a
# direct column assignment would produce an all-NaN column and make the later merge
# on "channel" emit `growth_x` / `growth_y` instead of a usable `growth` column.
growth.head(2)

##### Metric 3: Views growth steepness

In [None]:
# Largest single-step increase of views per channel (indexed by channel id).
delta_max = ts_grouped_by_channel["delta_views"].max()

# Largest step relative to the channel's maximum views.
# `max_views` comes from the "Views variability" cell above.
growth_steepness = delta_max / max_views

# NOTE: the metric is intentionally NOT assigned to `channel_df` here.
# `growth_steepness` is indexed by channel id while `channel_df` has a positional
# index, so a direct column assignment would produce an all-NaN column that also
# clashes with the later merge on "channel".
growth_steepness.head(2)

In [None]:
# Gather the three per-channel metrics (all indexed by channel id) into one table,
# then turn the channel index into a regular "channel" column.
metric_series = {
    "growth_steepness": growth_steepness,
    "growth": growth,
    "variability": variability,
}
metrics = pd.DataFrame(metric_series)
metrics = metrics.reset_index()
metrics.head(2)

In [None]:
# Attach the metrics to the channel table by channel id.
# Any metric column already present on `channel_df` (from an earlier assignment or
# from a previous run of this cell) is dropped first: otherwise the merge would
# rename the overlapping columns to `<name>_x` / `<name>_y`, and later lookups such
# as `channel_df["growth"]` would fail.
metric_columns = [column for column in metrics.columns if column != "channel"]
channel_df = (
    channel_df
    .drop(columns=metric_columns, errors="ignore")
    .merge(metrics, on="channel")
)
channel_df.head(2)

### Step 2: Divide population using the metrics

In [None]:
growth_bound = 0.2

channel_growth = channel_df["growth"]

# `lose_fame`: channels whose growth lies strictly between -growth_bound and +growth_bound
lose_fame = channel_df[channel_growth.abs() < growth_bound]

# `keep_fame`: channels whose growth is strictly greater than growth_bound
# (channels that lost fame are not of interest)
keep_fame = channel_df[channel_growth > growth_bound]

#### Initial exploratory data analysis
We added the metrics for further filtering, but we still need to quantify the thresholds.  
Ideally, we want to select channels with both a high variability and a high growth steepness, so ordering the channels by those metrics and selecting the top 10% should be a good starting point for an initial exploratory data analysis.

### Step 3: Further, and more accurate, filtering

In [None]:
import scipy.integrate as integrate

def growth_rate_per_channel(df_timeseries, sliding_window=5):
    """Locate the most sudden increase in views within one channel's time series.

    A window of ``sliding_window`` consecutive rows is slid over ``views``. For each
    window whose last value is not below its first, the straight line joining the
    first and last value is compared with the actual values. When the line lies
    above the data (positive Simpson integral of ``line - window``), the window is
    scored as ``simpson(|line - window|) / simpson(line)``.

    Parameters
    ----------
    df_timeseries : pandas.DataFrame
        Rows of a single channel with at least the columns ``views`` and
        ``datetime``. Must contain at least one row.
    sliding_window : int, default 5
        Number of consecutive rows per window; must be at least 2.

    Returns
    -------
    pandas.Series
        Two positional entries: the ``datetime`` of the first row of the
        best-scoring window, and that window's score. When no window qualifies,
        the first ``datetime`` of the series and a score of 0 are returned.

    Raises
    ------
    ValueError
        If ``sliding_window`` is smaller than 2.
    """
    if sliding_window < 2:
        raise ValueError("sliding_window must be at least 2")

    views = df_timeseries['views']
    datetimes = df_timeseries['datetime']

    times = []
    integrals = []
    # "+ 1" so that the final full window (the one ending on the last row) is evaluated too
    for i in range(len(views) - sliding_window + 1):
        window = views.iloc[i:i + sliding_window]
        b, e = window.iloc[0], window.iloc[-1]

        # only windows that do not end lower than they start are of interest
        if e < b:
            continue

        # straight line from the first to the last value of the window
        line = np.array([(e - b) * j / (sliding_window - 1) + b for j in range(sliding_window)])

        # since we are measuring the sudden increase in views as an integral of the difference between the linear increase and the actual increase,
        # we have to make sure that the linear increase is above the actual increase at least at the beginning of the window
        if window.iloc[1] > line[1]:
            continue

        gap = line - window
        if integrate.simpson(gap) > 0:
            # FIXME: add importance of the absolute value difference of values instead of only the area
            integrals.append(integrate.simpson(np.abs(gap)) / integrate.simpson(line))
            times.append(datetimes.iloc[i])

    # fallback entry, appended last so that any qualifying window with an equal score wins the argmax
    integrals.append(0)
    times.append(datetimes.iloc[0])

    _max = np.argmax(integrals)
    return pd.Series([times[_max], integrals[_max]])

In [None]:
# calculate the sudden increase in views for each channel
# labels for the two positional values returned by growth_rate_per_channel
sudden_growth_columns = {0: 'sudden_growth_date', 1: 'sudden_growth_index'}

# restrict the time series to the channels of each population
keep_fame_ts = timeseries_df[timeseries_df["channel"].isin(keep_fame["channel"])]
lose_fame_ts = timeseries_df[timeseries_df["channel"].isin(lose_fame["channel"])]

# calculate the sudden increase in views for each channel of each population
keep_fame_sudden_growth = (
    keep_fame_ts
    .groupby('channel')
    .apply(growth_rate_per_channel)
    .rename(columns=sudden_growth_columns)
)
lose_fame_sudden_growth = (
    lose_fame_ts
    .groupby('channel')
    .apply(growth_rate_per_channel)
    .rename(columns=sudden_growth_columns)
)

#### Initial exploratory data analysis
As above, we want to compare the channels by growth steepness, so the populations should be narrowed further by filtering on the newly calculated sudden-growth metric.