In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Folder that holds the YouNiverse dataset files.
# Raw string: "\D" and "\Y" are not valid escape sequences, so a plain literal
# only works by accident and triggers a warning on recent Python versions.
data_path = r"D:\Datasets\YouNiverse"

In [None]:
# Load the channel table and the time-series table from tab-separated files.
# Raw strings keep the leading backslash of the file names literal instead of
# relying on Python leaving the unrecognised escape "\d" untouched (which warns).
df_channels = pd.read_csv(data_path + r"\df_channels_en.tsv.gz", sep="\t")
df_timeseries = pd.read_csv(data_path + r"\df_timeseries_en.tsv.gz", sep="\t")


In [None]:
# Display the channel table (last expression of the cell, so the notebook renders it).
df_channels

In [None]:
# List the column names of the channel table.
df_channels.columns

In [None]:
# Display the time-series table (last expression of the cell, so the notebook renders it).
df_timeseries

In [None]:
# List the column names of the time-series table.
df_timeseries.columns

In [None]:
# Plotly Express qualitative "Set2" colour sequence, stored under a short name.
# NOTE(review): not referenced by any other cell visible in this notebook.
color_palette = px.colors.qualitative.Set2

In [None]:
# Visualization 1: line graphs of the quarterly number of channels, videos and views.
# NOTE: despite the names `month_year` / `monthly_stats`, rows are bucketed by
# calendar quarter (period frequency 'Q'). The names are kept unchanged so any
# other cell that refers to them keeps working.
df_timeseries['datetime'] = pd.to_datetime(df_timeseries['datetime'])
df_timeseries['month_year'] = df_timeseries['datetime'].dt.to_period('Q')

# One row per quarter: number of distinct channels plus the sums of the
# `videos` and `views` columns.
# NOTE(review): the table also has `delta_videos` / `delta_views` columns, which
# suggests `videos` / `views` may be running totals; if so, summing them within
# a quarter over-counts -- confirm which columns should be aggregated.
monthly_stats = df_timeseries.groupby('month_year').agg({
    'channel': 'nunique',
    'videos': 'sum',
    'views': 'sum'
}).reset_index()

# String form of the quarterly periods, shared by all three traces as x values.
quarter_labels = monthly_stats['month_year'].astype(str)

# One subplot per metric, side by side.
fig1 = make_subplots(rows=1, cols=3, subplot_titles=('Channels', 'Videos', 'Views'))

# (column in monthly_stats, trace name), in left-to-right subplot order.
metric_traces = [('channel', 'Channels'), ('videos', 'Videos'), ('views', 'Views')]
for col_idx, (metric_column, trace_name) in enumerate(metric_traces, start=1):
    fig1.add_trace(
        go.Scatter(x=quarter_labels, y=monthly_stats[metric_column], mode='lines', name=trace_name),
        row=1, col=col_idx,
    )

# Updating layout
fig1.update_layout(title='Quarterly Growth of Gaming Channels, Videos, and Viewership on YouTube', showlegend=False)
# The grid has a single row, so a row/col selector such as row=3 matches no axis
# and the title would silently not be applied. Without a selector every x-axis
# gets the label, which names the actual bucket size (quarters).
fig1.update_xaxes(title_text='Quarter')
fig1.update_yaxes(title_text='Count')

# The figure is written to a standalone HTML page; call fig1.show() to render it inline instead.
fig1.write_html("quarterly_growth_gaming.html")

In [None]:
# Visualization 2: pie chart of channels per category and bar chart of total
# subscribers per category.
# Assumption carried over from the original analysis: 'category_cc' stands in
# for demographics and 'subscribers_cc' for engagement.
demographics = df_channels['category_cc'].value_counts()
engagement = df_channels.groupby('category_cc')['subscribers_cc'].sum()

# Pool of qualitative colours to draw from.
base_palette = px.colors.qualitative.Set2 + px.colors.qualitative.Pastel1 + px.colors.qualitative.Dark2

# Exactly one colour per category: the first N colours of the pool, wrapping
# around when there are more categories than colours so the per-category lookup
# below can never index past the end of the palette.
n_categories = len(demographics.index)
extended_color_palette = [base_palette[i % len(base_palette)] for i in range(n_categories)]

# Category -> colour, shared by both charts so a category looks the same in each.
colors = dict(zip(demographics.index, extended_color_palette))

# Pie charts need a 'domain' subplot, bar charts a cartesian one.
fig2 = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'bar'}]])

fig2.add_trace(
    go.Pie(
        labels=demographics.index,
        values=demographics.values,
        name='Demographics',
        marker=dict(colors=[colors[label] for label in demographics.index])
    ),
    1, 1
)

fig2.add_trace(
    go.Bar(
        x=engagement.index,
        y=engagement.values,
        name='Engagement',
        marker=dict(color=[colors[label] for label in engagement.index])
    ),
    1, 2
)

fig2.update_layout(title='Demographic Distribution and Engagement in YouTube Gaming')

# The figure is written to a standalone HTML page; call fig2.show() to render it inline instead.
fig2.write_html("demographics_engagement_gaming.html")

In [None]:
# Visualization 3: subscriber distribution per category, drawn as violins with
# an embedded box plot and every individual row shown as a point.
violin_options = dict(
    y='subscribers_cc',
    x='category_cc',
    box=True,
    points="all",
    title='Subscribers Distribution Across Categories (Violin Plot)',
)
fig3 = px.violin(df_channels, **violin_options)

# Written to a standalone HTML page; call fig3.show() to render it inline instead.
fig3.write_html("subscribers_distribution_gaming.html")

Columns of `df_channels`: `category_cc`, `join_date`, `channel`, `name_cc`, `subscribers_cc`, `videos_cc`, `subscriber_rank_sb`, `weights`

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

# Goal: predict a channel's subscriber count from its category and a few
# numeric channel statistics.
categorical_features = ['category_cc']
numerical_features = ['videos_cc', 'weights', 'subscriber_rank_sb']

# One-hot encode the categorical columns. The encoded frame reuses df_channels'
# index so the column-wise concat below pairs up the right rows even when that
# index is not a plain 0..n-1 range.
one_hot_encoder = OneHotEncoder()
category_encoded = one_hot_encoder.fit_transform(df_channels[categorical_features]).toarray()
category_encoded_df = pd.DataFrame(
    category_encoded,
    columns=one_hot_encoder.get_feature_names_out(categorical_features),
    index=df_channels.index,
)

# Feature matrix (numeric columns + one-hot columns) and target.
X = pd.concat([df_channels[numerical_features], category_encoded_df], axis=1)
y = df_channels['subscribers_cc']

# Hold out 20% of the rows for evaluation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# A small forest (10 trees) keeps training time short.
rf_model = RandomForestRegressor(n_estimators=10, random_state=42)

# Cap the training set at 100,000 rows (arbitrary threshold) to bound training time.
if len(X_train) > 100000:
    X_train_sample = X_train.sample(n=100000, random_state=42)
    # Pick the targets that belong to the sampled rows.
    y_train_sample = y_train.loc[X_train_sample.index]
else:
    X_train_sample = X_train
    y_train_sample = y_train

# Train on the (possibly down-sampled) training set
rf_model.fit(X_train_sample, y_train_sample)

# Predict the number of subscribers on the held-out test set
y_pred = rf_model.predict(X_test)

# Evaluate the model's performance.
# (No figure is drawn in this cell, so nothing is shown here: calling
# `fig.show()` at this point fails on a fresh kernel because `fig` is only
# created by a later cell.)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# 5x2 grid of axes holding every panel of the overview figure
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(20, 30))

# Histogram of subscribers
sns.histplot(df_channels['subscribers_cc'], bins=3000, kde=False, ax=axes[0, 0])
axes[0, 0].set_title('Distribution of Subscribers')

# Boxplot of subscribers across different categories
sns.boxplot(x='subscribers_cc', y='category_cc', data=df_channels, ax=axes[0, 1])
axes[0, 1].set_title('Subscribers Distribution Across Categories')

# Scatter plot of subscribers vs. videos
sns.scatterplot(x='videos_cc', y='subscribers_cc', data=df_channels, ax=axes[1, 0])
axes[1, 0].set_title('Subscribers vs. Videos')

# Correlation heatmap. Only numeric columns are correlated: df_channels also
# holds non-numeric columns (e.g. the 'channel' ids), which DataFrame.corr()
# refuses to convert on recent pandas versions.
corr = df_channels.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', ax=axes[1, 1])
axes[1, 1].set_title('Correlation Heatmap')

# Pairplot for numerical features (sampled due to large size).
# pairplot takes no `ax` here: seaborn draws it on a figure of its own,
# separate from the 5x2 grid.
sns.pairplot(df_channels.sample(min(500, len(df_channels)), random_state=42), diag_kind='kde')

# Average number of videos and subscribers in each category
category_stats = df_channels.groupby('category_cc').agg({'videos_cc':'mean', 'subscribers_cc':'mean'}).reset_index()
sns.barplot(x='videos_cc', y='category_cc', data=category_stats, ax=axes[2, 0])
axes[2, 0].set_title('Average Number of Videos per Category')

sns.barplot(x='subscribers_cc', y='category_cc', data=category_stats, ax=axes[2, 1])
axes[2, 1].set_title('Average Number of Subscribers per Category')

# Build one fixed colour per category so both pie charts colour a category identically.
# Missing categories are dropped and the remaining names are normalised to strings.
filtered_categories = df_channels['category_cc'].dropna().astype(str)

# Sorted list of unique category names
unique_categories = sorted(filtered_categories.unique())

# A distinct colour for each category
colors = sns.color_palette('Paired', len(unique_categories))

# Category name (as a string) -> colour
color_map = {category: color for category, color in zip(unique_categories, colors)}

# Pie chart for proportion of subscribers in each category.
# Lookups go through str(...) because the keys of color_map are strings.
category_subscribers = df_channels.groupby('category_cc')['subscribers_cc'].sum()
axes[3, 0].pie(category_subscribers, labels=category_subscribers.index, autopct='%1.1f%%', colors=[color_map[str(cat)] for cat in category_subscribers.index])
axes[3, 0].set_title('Proportion of Subscribers in Each Category')

# Pie chart for proportion of channels in each category
category_counts = df_channels['category_cc'].value_counts()
axes[3, 1].pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', colors=[color_map[str(cat)] for cat in category_counts.index])
axes[3, 1].set_title('Proportion of Channels in Each Category')

# Violin plot for subscribers across different categories
sns.violinplot(x='subscribers_cc', y='category_cc', data=df_channels, ax=axes[4, 0])
axes[4, 0].set_title('Subscribers Distribution Across Categories (Violin Plot)')

# Stacked bar chart for distribution of videos and subscribers in each category
category_grouped = df_channels.groupby('category_cc').agg({'videos_cc':'sum', 'subscribers_cc':'sum'})
category_grouped.plot(kind='bar', stacked=True, ax=axes[4, 1])
axes[4, 1].set_title('Distribution of Videos and Subscribers in Each Category')

# Tighten the layout of the grid figure explicitly: the pairplot created a
# newer figure, so pyplot's "current figure" is no longer the grid.
fig.tight_layout()
plt.show()


At first glance, the `subscribers` vs. `videos` scatter plot appears to follow a power-law relationship. Let's check whether it does.

In [None]:
from sklearn.linear_model import LinearRegression

# Subscriber count at the 95th percentile: the cut-off for the "upper bound" subset.
percentile_95 = np.percentile(df_channels['subscribers_cc'], 95)

# Keep only the channels at or above that cut-off.
upper_bound_data = df_channels.loc[df_channels['subscribers_cc'] >= percentile_95]

# Move to log space (log(x + 1)): a power law subscribers = a * videos^k shows
# up there as a straight line with slope k and intercept log(a).
log_videos_upper = np.log(upper_bound_data['videos_cc'] + 1)
log_subscribers_upper = np.log(upper_bound_data['subscribers_cc'] + 1)

# sklearn wants 2-D inputs: a single column with one row per channel.
X_upper = log_videos_upper.to_numpy().reshape(-1, 1)
y_upper = log_subscribers_upper.to_numpy().reshape(-1, 1)

# Ordinary least-squares line through the log-log points.
model_upper = LinearRegression().fit(X_upper, y_upper)

# Slope -> exponent k; intercept -> log(a), converted back to a with exp.
k_upper = model_upper.coef_[0, 0]
log_a_upper = model_upper.intercept_[0]
a_upper = np.exp(log_a_upper)

# R-squared of the fit on the same points it was trained on.
r_squared_upper = model_upper.score(X_upper, y_upper)

# Cell output: fitted parameters, goodness of fit, and the cut-off that was used.
a_upper, k_upper, r_squared_upper, percentile_95

- The exponent $k$ for the upper bound is approximately 0.0617.

The $R^2$ value for the upper-bound model is approximately 0.0191, which is even lower than that of the model fitted to the entire dataset. This lower $R^2$ indicates that the power-law model explains an even smaller share of the variance among the channels in the upper bound of the data.

Alternatively, plotting the data on a log-log scale shows that the points do not fall along a straight line, which is what a power law would produce.
![log-logGraph](images/graph1.png)

In [None]:
# List the time-series column names again, as a reference for the cells that follow.
df_timeseries.columns

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Rescale the 'subs' column of df_timeseries to the range [-1, 1] and store the
# result in a new 'scaled_subs' column. The scaler is fitted on all rows at once.
scaler = MinMaxScaler(feature_range=(-1, 1))
subs_column = df_timeseries['subs'].to_numpy().reshape(-1, 1)  # single feature, 2-D for the scaler
df_timeseries['scaled_subs'] = scaler.fit_transform(subs_column)

# Sliding-window helper that turns one series into (input window, next value) pairs for the LSTM
def create_sequences(input_data, time_steps):
    """Pair every window of ``time_steps`` consecutive values with the value after it.

    Args:
        input_data: Sliceable sequence of values (the caller passes a list).
        time_steps: Number of consecutive values in each input window.

    Returns:
        A list of ``(window, label)`` tuples in order of position, where
        ``window`` is ``input_data[i:i + time_steps]`` and ``label`` is the
        one-element slice that immediately follows it. The list is empty when
        ``input_data`` has ``time_steps`` or fewer elements.
    """
    window_count = len(input_data) - time_steps
    return [
        (
            input_data[start:start + time_steps],
            input_data[start + time_steps:start + time_steps + 1],
        )
        for start in range(window_count)
    ]

# Window length: five consecutive observations are used to predict the one that follows.
time_steps = 5

# Order the rows by channel and, within a channel, by time.
df_timeseries_sorted = df_timeseries.sort_values(by=['channel', 'datetime'])

# Build the windows channel by channel, so no window mixes values of two
# channels, then pool them into one flat list of (window, label) pairs.
per_channel_sequences = (
    df_timeseries_sorted
    .groupby('channel')['scaled_subs']
    .apply(lambda channel_subs: create_sequences(channel_subs.tolist(), time_steps))
)
sequences = []
for channel_sequences in per_channel_sequences:
    sequences.extend(channel_sequences)

# Separate inputs from targets and give them the shapes the model consumes:
# X -> (n_samples, time_steps, 1), y -> (n_samples, 1).
X, y = zip(*sequences)
X = np.array(X).reshape(-1, time_steps, 1)
y = np.array(y).reshape(-1, 1)

# float32 tensors for PyTorch
X_tensor = torch.FloatTensor(X)
y_tensor = torch.FloatTensor(y)

# Batches of 64 samples, served in stored order (no shuffling).
dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(dataset, batch_size=64, shuffle=False)

# Sequence model: one LSTM layer feeding a linear output layer
class LSTM(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_layer_size: Size of the LSTM hidden state.
        output_size: Number of values predicted per sequence.
    """

    def __init__(self, input_size=1, hidden_layer_size=100, output_size=1):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        # batch_first=True: inputs are laid out as (batch, time, features).
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_layer_size, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_layer_size, out_features=output_size)

    def forward(self, input_seq):
        """Return one prediction per sequence, shape ``(batch, output_size)``."""
        hidden_states, _ = self.lstm(input_seq)
        # Only the hidden state of the final time step feeds the output layer.
        final_step = hidden_states[:, -1, :]
        return self.linear(final_step)

# Model, objective (mean squared error) and optimiser (Adam, learning rate 0.001)
model = LSTM()
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train for a fixed number of passes over the loader, recording every batch loss.
epochs = 5
loss_history = []
for epoch in range(epochs):
    model.train()
    for seq, labels in train_loader:
        optimizer.zero_grad()                        # clear gradients left from the previous batch
        y_pred = model(seq)
        single_loss = loss_function(y_pred, labels)
        single_loss.backward()
        optimizer.step()
        loss_history.append(single_loss.item())

    # Reports the loss of the epoch's final batch (not an epoch average).
    print(f'Epoch {epoch+1}/{epochs}, Loss: {single_loss.item():.4f}')

    # TODO: evaluate on a validation set here (none is defined in this notebook yet).

# Per-batch training loss over all epochs
plt.figure(figsize=(10, 5))
plt.plot(loss_history, label='Training loss')
plt.title('Training Loss Curve')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Include evaluation and other components as needed


### Aspect 1: Predicting Channel Growth Trajectories
**Research Question:**
Can we predict the future growth of a channel in terms of subscribers and views based on past performance and content category?

**Code Interpretation:**
You would use time series analysis and machine learning to forecast growth. By applying models like ARIMA for univariate forecasting or LSTM networks for multivariate time series, you can predict future metrics.

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# Forecast the subscriber count of a single example channel.
# df_timeseries keeps 'datetime' as an ordinary column and nothing in this
# notebook establishes that its rows are stored chronologically, so the
# channel's rows are sorted by time first: ARIMA treats the values as
# consecutive observations.
channel_id = 'UC0UeVA9YHpOEr_Ng442xiRw'
channel_data = df_timeseries[df_timeseries['channel'] == channel_id].sort_values('datetime')
if channel_data.empty:
    # Fail with a clear message instead of an obscure error from the model fit.
    raise ValueError(f"No time-series rows found for channel {channel_id!r}")

# Replace the row labels inherited from df_timeseries (arbitrary, non-contiguous
# integers) with a clean 0..n-1 index for the model.
subs_series = channel_data['subs'].reset_index(drop=True)

# ARIMA model for 'subs' with order (p=5, d=1, q=0)
model_subs = ARIMA(subs_series, order=(5, 1, 0))
model_subs_fit = model_subs.fit()

# Forecast the next 10 steps
forecast_subs = model_subs_fit.forecast(steps=10)

# To evaluate, hold out the last observations as `test_data` and compare, e.g.:
# mse = mean_squared_error(test_data, forecast_subs)


### Aspect 2: Content Strategy Optimization
**Research Question:**
What content strategies correlate with higher engagement and growth in different categories?

**Code Interpretation:**
You would perform statistical analysis and machine learning to find patterns and correlations in content strategies across different categories. Regression models or decision trees can reveal what features (like video frequency, length, time of posting) are most predictive of higher engagement.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Attach each channel's category to its time-series rows by joining on 'channel'.
# The category column of df_channels is named 'category_cc' (as used by every
# other cell); selecting a 'category' column raises a KeyError.
merged_data = pd.merge(df_timeseries, df_channels[['channel', 'category_cc']], on='channel')

merged_data.shape

In [None]:
# RandomForest to determine feature importances for 'delta_subs'
X = merged_data[['views', 'delta_views', 'videos', 'delta_videos', 'activity']]  # Add more features as needed
y = merged_data['delta_subs']

# Fixed seed so the fitted forest, and therefore the importances, are
# reproducible; 42 matches the seed used by the other models in this notebook.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)

# Feature importances can give insight into which aspects are most predictive of subscriber changes
importances = model.feature_importances_

# Cell output: importances labelled by feature name, largest first.
pd.Series(importances, index=X.columns, name='importance').sort_values(ascending=False)


### Aspect 3: Community Health and Lifecycle Analysis
**Research Question:**
How does the lifecycle stage of a channel (e.g., growing, mature, declining) affect community health metrics like engagement and sentiment?

**Code Interpretation:**
You would use clustering to identify lifecycle stages and then analyze how these stages correlate with engagement metrics. Sentiment analysis on video titles/descriptions can be paired with engagement metrics to assess community health.

In [None]:
from sklearn.cluster import KMeans
from textblob import TextBlob

# KMeans clustering to identify lifecycle stages based on growth metrics
lifecycle_features = df_channels[['subscribers_cc', 'videos_cc']]  # Add other relevant features

# Three clusters, assuming three lifecycle stages. `random_state` makes the
# cluster assignment reproducible across runs, and `n_init` is set explicitly
# because its default differs between scikit-learn versions.
# NOTE(review): the two features are clustered on their raw scales; consider
# standardising them first if their magnitudes differ a lot.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df_channels['lifecycle_stage'] = kmeans.fit_predict(lifecycle_features)

# Sentiment analysis on video titles (requires text data)
# df_timeseries['title_sentiment'] = df_timeseries['title'].apply(lambda title: TextBlob(title).sentiment.polarity)

# Analysis of sentiment vs engagement
# merged_data = pd.merge(df_timeseries, df_channels[['channel', 'lifecycle_stage']], on='channel')
# lifecycle_sentiment = merged_data.groupby('lifecycle_stage')['title_sentiment'].mean()
