In [None]:
!python --version

In [None]:
!pip install keras-tuner --upgrade
!pip install catboost
!pip install lightgbm
!pip install shap

In [None]:
# Mount Google Drive at /content/drive; all data, checkpoints and plots used
# below live under /content/drive/MyDrive/tiktok_model.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt
import pickle
from PIL import Image
import io
import math
import tensorflow as tf
import six
from collections import defaultdict
from tensorflow import keras
from tensorflow.keras import Input, layers, Model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import scipy.stats as stats
import keras_tuner as kt
from catboost import CatBoostRegressor
import lightgbm as lgb
import shap
from sklearn.model_selection import train_test_split


In [None]:
%matplotlib inline

In [None]:
plt.rcParams['figure.dpi'] = 400

In [None]:
print('Notebook run using keras:', keras.__version__)
print('Notebook run using tensorflow:', tf.__version__)

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
#helper functions
def adjusted_r2(y_test, y_pred, n_predictors):
    """Return the adjusted R^2 of ``y_pred`` against the targets ``y_test``.

    The plain R^2 (``r2_score``) is penalised for the number of predictors:
    ``1 - (1 - R^2) * (n - 1) / (n - n_predictors - 1)`` with ``n = len(y_test)``.

    Args:
        y_test: ground-truth target values.
        y_pred: predicted values, aligned with ``y_test``.
        n_predictors: number of predictors the model used.
    """
    n_samples = len(y_test)
    unexplained = 1 - r2_score(y_test, y_pred)
    return 1 - unexplained * (n_samples - 1) / (n_samples - n_predictors - 1)


def mean_absolute_percentage_error(y_pred, y_true):
    """Return the mean absolute percentage error, in percent.

    Note the argument order: predictions come FIRST and the ground truth
    second. Each absolute error is divided by the corresponding ground-truth
    value, so the two arguments are not interchangeable.

    Args:
        y_pred: predicted values (array-like).
        y_true: ground-truth values (array-like), used as the denominator.
    """
    truth = np.array(y_true)
    estimate = np.array(y_pred)
    relative_error = np.abs((truth - estimate) / truth)
    return np.mean(relative_error) * 100

In [None]:
def _parse_function(example_proto):
    """Decode one serialized example into a ``(features, label)`` pair.

    Args:
        example_proto: a serialized example holding the keys ``image``,
            ``emb``, ``user`` and ``label``.

    Returns:
        A tuple ``(features, label)`` where ``features`` is a dict with the
        decoded and rescaled ``image``, the 128-d ``emb`` vector and the
        ``user`` string, and ``label`` is the float target.
    """
    schema = {
        'image': tf.io.FixedLenFeature((), tf.string),  # JPEG-encoded face image
        'emb': tf.io.FixedLenFeature((128,), tf.float32),  # embedding from face_recognition
        'user': tf.io.FixedLenFeature((), tf.string),  # username
        'label': tf.io.FixedLenFeature((), tf.float32),  # target label (engagement rate)
    }
    record = tf.io.parse_single_example(example_proto, schema)

    # Decode to RGB, resize with padding to the 128x128 model input and
    # rescale the pixel values from [0, 255] to [0, 1].
    pixels = tf.io.decode_jpeg(record['image'], channels=3)
    pixels = tf.image.resize_with_pad(pixels, 128, 128)
    pixels = tf.cast(pixels, tf.float32) / 255.0

    features = {'image': pixels, 'emb': record['emb'], 'user': record['user']}
    return features, record['label']

In [None]:
def _load_face_dataset(tfrecord_path, training=False, batch_size=32, shuffle_buffer=10000):
    """Build a batched dataset of parsed face examples from one TFRecord file.

    Args:
        tfrecord_path: path of the TFRecord file to read.
        training: when True the examples are shuffled and the last partial
            batch is dropped. When False the file order is kept and every
            example is emitted, so no sample is silently excluded from the
            validation or test metrics.
        batch_size: number of examples per batch.
        shuffle_buffer: size of the shuffle buffer (only used for training).

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, label)`` batches as
        produced by ``_parse_function``.
    """
    dataset = tf.data.TFRecordDataset(tfrecord_path)
    dataset = dataset.map(_parse_function, num_parallel_calls=tf.data.AUTOTUNE)
    if training:
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.batch(batch_size, drop_remainder=training)


# Training dataset, 32 faces per channel, random-shuffled.
ds_train = _load_face_dataset('/content/drive/MyDrive/tiktok_model/train_32face.tfrecord', training=True)

# Validation dataset, 32 faces per channel, used as validation data during tuning and training.
ds_val = _load_face_dataset('/content/drive/MyDrive/tiktok_model/validation_32face.tfrecord')

# Test dataset, 32 faces per channel, used for the final ensembled prediction.
ds_test_multiple = _load_face_dataset('/content/drive/MyDrive/tiktok_model/test_32face.tfrecord')

# Test dataset, 1 face per channel.
ds_test = _load_face_dataset('/content/drive/MyDrive/tiktok_model/test_1_32face.tfrecord')

In [None]:
# Prepare a directory to store all the faces.
checkpoint_dir = "/content/drive/MyDrive/tiktok_model/face_plots"
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

In [None]:
for e in ds_test.take(1):
    images, user, labels = e[0]['image'].numpy(), e[0]['user'].numpy(), e[1].numpy()
    fig = plt.figure(figsize=(5, 5))
    for i, img in enumerate(images[:16]):
        fig.add_subplot(4, 4, i + 1)
        plt.imshow(img)
        plt.axis('off')
    plt.tight_layout(pad=0.2)
    plt.savefig("/content/drive/MyDrive/tiktok_model/face_plots/example_faces_32", dpi=400, bbox_inches="tight")
    plt.show()

In [None]:
class TikTokRegressionModel(tf.keras.Model):
    """Two-branch regression network that predicts the engagement rate.

    A frozen ImageNet backbone (VGG16 or ResNet50V2, global-average pooled)
    followed by dropout encodes the 128x128 RGB face image. A small dense
    layer encodes the 128-d face embedding. Both encodings are concatenated
    and passed through ``n_layers`` ReLU dense layers and a single ReLU
    output unit.

    Args:
        channel_multiplier: unused; accepted for backward compatibility.
        dropout_rate: dropout rate applied to the pooled backbone features.
        units: width of each hidden dense layer.
        backbone: ``"vgg16"`` or ``"resnet50"``.
        n_layers: number of hidden dense layers in front of the output layer.

    Raises:
        ValueError: if ``backbone`` is not one of the supported names.

    NOTE(review): the input pipeline of this notebook scales pixels to
    [0, 1]; neither backbone's own ``preprocess_input`` is applied here.
    Confirm that this is intended.
    """

    def __init__(self, channel_multiplier=None, dropout_rate=0.3, units=32, backbone="resnet50", n_layers=2):
        if backbone not in ("vgg16", "resnet50"):
            # An unknown name used to leave `self.base` undefined and only
            # failed later, inside `call`.
            raise ValueError(
                "Unsupported backbone %r; expected 'vgg16' or 'resnet50'." % (backbone,))

        super().__init__()

        self.backbone = backbone
        self.dropout_rate = dropout_rate
        self.units = units
        self.n_layers = n_layers

        # Image branch: frozen pretrained feature extractor + dropout.
        # The attribute names (`vgg`, `resnet`, `base`, `emb`, `dnn`) are kept
        # as they were so that the tracked layer structure does not change.
        if self.backbone == "vgg16":
            self.vgg = tf.keras.applications.VGG16(weights='imagenet',
                                                   include_top=False,
                                                   input_shape=(128, 128, 3),
                                                   pooling='avg')
            self.vgg.trainable = False

            self.base = tf.keras.models.Sequential()
            self.base.add(self.vgg)
            self.base.add(tf.keras.layers.Dropout(dropout_rate))
        else:  # "resnet50"
            self.resnet = tf.keras.applications.ResNet50V2(weights='imagenet',
                                                           include_top=False,
                                                           input_shape=(128, 128, 3),
                                                           pooling='avg')
            self.resnet.trainable = False

            self.base = tf.keras.models.Sequential()
            self.base.add(self.resnet)
            self.base.add(tf.keras.layers.Dropout(dropout_rate))

        # Face vector-encoding MLP
        self.emb = tf.keras.models.Sequential()
        self.emb.add(tf.keras.layers.Dense(32))
        self.emb.add(tf.keras.layers.Activation('relu'))

        # Fully connected layers
        self.dnn = tf.keras.models.Sequential()
        for _ in range(n_layers):
            layer = tf.keras.models.Sequential()
            layer.add(tf.keras.layers.Dense(units))
            layer.add(tf.keras.layers.Activation('relu'))
            self.dnn.add(layer)

        # Regression output layer (ReLU keeps the prediction non-negative)
        self.dnn.add(tf.keras.layers.Dense(1))
        self.dnn.add(tf.keras.layers.Activation('relu'))

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: mapping with the keys ``'image'`` and ``'emb'``; any other
                key (the dataset also carries ``'user'``) is ignored.
            training: forwarded to the sub-networks so that dropout is only
                active during training.

        Returns:
            The output of the final dense unit for every example.
        """
        image_features = self.base(inputs['image'], training=training)
        embedding_features = self.emb(inputs['emb'], training=training)

        # Concatenate both encodings and run the fully connected head.
        merged = tf.concat([image_features, embedding_features], axis=-1)
        return self.dnn(merged, training=training)

    def model(self):
        """Return a functional ``Model`` wrapper of this network (used for plotting)."""
        img = Input(shape=(128, 128, 3))  # RGB face image
        emb = Input(shape=(128,))  # face embedding vector

        # `call` looks its inputs up by key, so it must be given a mapping;
        # passing a plain list raises a TypeError.
        return Model(inputs=[img, emb], outputs=self.call({'image': img, 'emb': emb}))

## Hyperparameter Tuning

In [None]:
def build_model(hp):
    """Build and compile a ``TikTokRegressionModel`` for one tuner trial.

    Args:
        hp: the hyperparameter container of the trial; the values for
            ``units``, ``dropout_rate``, ``backbone``, ``layers`` and
            ``learning_rate`` are drawn from it.

    Returns:
        The compiled model (MSE loss; RMSE, MAE and MAPE metrics).
    """
    model = TikTokRegressionModel(
        units=hp.Choice('units', values=[32, 64]),
        dropout_rate=hp.Choice('dropout_rate', values=[0.0, 0.2, 0.3]),
        backbone=hp.Choice('backbone', values=['vgg16', 'resnet50']),
        n_layers=hp.Choice('layers', values=[1, 2]),
    )

    # Tune the learning rate for the optimizer
    learning_rate = hp.Choice('learning_rate', values=[0.001, 0.0005, 0.0001, 0.00005])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    tracked_metrics = [
        tf.keras.metrics.RootMeanSquaredError(),
        tf.keras.metrics.MeanAbsoluteError(),
        tf.keras.metrics.MeanAbsolutePercentageError(),
    ]
    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.MeanSquaredError(),
                  metrics=tracked_metrics)
    return model

In [None]:
# Prepare the directory in which the tuner stores its trials.
tuner_dir = "/content/drive/MyDrive/tiktok_model/tuner"

if not os.path.exists(tuner_dir):
    os.makedirs(tuner_dir)

# Hyperband search over `build_model`, ranked by validation loss.
# overwrite=True: results of an earlier run in the same project are not reused.
hyperband_settings = dict(
    objective='val_loss',
    max_epochs=40,
    factor=3,
    overwrite=True,
    directory=tuner_dir,
    max_consecutive_failed_trials=10,
    project_name='engagment_prediction_regression_nn',
)
tuner = kt.Hyperband(build_model, **hyperband_settings)

In [None]:
#early stopping
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [None]:
#perform hyperparmeter search
tuner.search(x=ds_train, validation_data=ds_val, epochs=40, callbacks=[stop_early])

# Train Model with optimal hyperparameters

In [None]:
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
# Print the selected values; printing the container itself only shows its object repr.
print(best_hps.values)

In [None]:
# Prepare a directory to store all the checkpoints.
checkpoint_dir = "/content/drive/MyDrive/tiktok_model/ckpt/resnet"
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
  
# Prepare a directory to store all the logs.
log_dir = "/content/drive/MyDrive/tiktok_model/logs/resnet"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
  
# Prepare a directory to store plots.
plot_dir = "/content/drive/MyDrive/tiktok_model/plots/resnet"
if not os.path.exists(plot_dir):
    os.makedirs(plot_dir)

In [None]:
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
        filepath="/content/drive/MyDrive/tiktok_model/ckpt/resnet/model",
        #save_freq="epoch",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_loss",
        verbose=1
    )

In [None]:
#init model with optimal hyperparameters
model = TikTokRegressionModel(units=32, dropout_rate=0.3, backbone="resnet50", n_layers=2)

In [None]:
#plot model structure
tf.keras.utils.plot_model(
    model.model(),
    to_file="/content/drive/MyDrive/tiktok_model/own.png",
    show_shapes=True,
    show_dtype=True,
    show_layer_names=False,
    rankdir="TB",
    expand_nested=True,
    dpi=200,
    layer_range=None,
    show_layer_activations=True,
)

In [None]:
# Compile with optimal learning rate
model.compile(loss=tf.keras.losses.MeanSquaredError(),
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),
              metrics=[tf.keras.metrics.RootMeanSquaredError(), 
                       tf.keras.metrics.MeanAbsoluteError(), 
                       tf.keras.metrics.MeanAbsolutePercentageError()])

In [None]:
initial_epochs = 40

In [None]:
 history = model.fit(ds_train, epochs=initial_epochs, callbacks=model_checkpoint_callback, validation_data=ds_val)

In [None]:
loss = history.history['loss']
val_loss = history.history['val_loss']

In [None]:
best_epoch = val_loss.index(min(val_loss)) + 1 
print('Best epoch: %d' % (best_epoch))

In [None]:
#plot model loss
plt.figure(figsize=(14, 5))
plt.plot(loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.ylabel('Loss [MSE]')
plt.xlabel('Epoch')
plt.legend(loc='upper right')
plt.savefig("/content/drive/MyDrive/tiktok_model/plots/train_val_loss_32_resnet", dpi=400, bbox_inches="tight")
plt.show()

# Evaluation

In [None]:
# Simple evaluation (1 face)
preds = model.predict(ds_test)[:, 0]
labels = []
users = []
images = []
# Second pass over the test set to collect targets, usernames and images.
for e in ds_test:
    labels.extend(e[1].numpy())
    users.extend(e[0]['user'].numpy())
    images.extend(e[0]['image'].numpy())
labels = np.array(labels)

print("MAE: ", mean_absolute_error(labels, preds))
# The notebook's MAPE helper takes the predictions first, then the ground truth.
print("MAPE: ",mean_absolute_percentage_error(preds, labels))
print("MSE: ",mean_squared_error(labels, preds))
# RMSE as the square root of the MSE; this avoids the `squared=` keyword,
# which is deprecated since scikit-learn 1.4.
print("RMSE: ",np.sqrt(mean_squared_error(labels, preds)))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, labels)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, labels)
print("Spearman :", res, p_value)

#adjusted r2
n_predictors = 2
print("Adj. R2: ",adjusted_r2(labels, preds, n_predictors))

In [None]:
# Multiple faces evaluation: average the predictions of all faces of a user.
# `users` and `labels` come from the single-face evaluation cell above.

preds_num = defaultdict(float)  # sum of predictions per user
preds_den = defaultdict(float)  # number of predictions per user
preds_all = model.predict(ds_test_multiple)[:, 0]
users_all = []
for e in ds_test_multiple:
    users_all.extend(e[0]['user'].numpy())
for p, u in zip(preds_all, users_all):
    u = six.ensure_str(u)
    preds_num[u] += p
    preds_den[u] += 1.0


# One averaged prediction per user, in the order of `users` (and therefore of `labels`).
preds_multiple= np.array([float(preds_num[six.ensure_str(u)]) / float(preds_den[six.ensure_str(u)]) for u in users])

print("MAE: ", mean_absolute_error(labels, preds_multiple))
# The notebook's MAPE helper takes the predictions first, then the ground truth.
print("MAPE: ",mean_absolute_percentage_error(preds_multiple, labels))
print("MSE: ",mean_squared_error(labels, preds_multiple))
# RMSE as the square root of the MSE; this avoids the `squared=` keyword,
# which is deprecated since scikit-learn 1.4.
print("RMSE: ",np.sqrt(mean_squared_error(labels, preds_multiple)))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds_multiple, labels)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds_multiple, labels)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 2
print("Adj. R2: ", adjusted_r2(labels, preds_multiple, n_predictors))

In [None]:
# Scatter of the averaged predictions (y) against the ground truth (x).
plt.figure(figsize=(8, 8))
plt.scatter(labels, preds_multiple, alpha=0.3)

# Dashed identity line (prediction == ground truth) as a reference.
identity = np.arange(0, 15, 1)
plt.plot(identity, identity, '--k')

# The ground truth is on the x axis and the predictions on the y axis.
plt.xlabel('Ground Truth')
plt.ylabel('Predictions')

In [None]:
# jointplot creates its own figure, so its size is set through `height`
# (a separately created plt.figure would only add an empty figure).
grid = sns.jointplot(x=labels, y=preds_multiple, alpha=0.3, height=8)
grid.set_axis_labels('Ground Truth [%]', 'Predictions [%]')

In [None]:
# Distribution of the signed prediction error (prediction minus ground truth)
error = preds_multiple - labels
plt.figure(figsize=(10, 5))
plt.hist(error, bins=30)
# The plotted error is signed, so the label does not call it "absolute".
plt.xlabel('Prediction Error [User Engagement Rate]')
_ = plt.ylabel('Count')


## Results Face Features


In [None]:
#quicksort predictions
preds_sorted = np.argsort(preds)

In [None]:
# Highest engagement predictions
fig = plt.figure(figsize=(5, 6))
fig.tight_layout(pad=0.2)
for f, i in enumerate(reversed(preds_sorted[-20:])):
    img = images[i]
    fig.add_subplot(5, 4, f + 1)                           
    plt.imshow(img)
    plt.axis('off') 
plt.tight_layout(pad=0.2)
plt.show()

In [None]:
# Lowest engagement predictions
fig = plt.figure(figsize=(5, 6))
fig.tight_layout(pad=0.2)
for f, i in enumerate(reversed(preds_sorted[:16])):
    img = images[i]
    fig.add_subplot(5, 4, f + 1)                            
    plt.imshow(img)
    plt.axis('off')
    
plt.tight_layout(pad=0.2)
plt.show()

# Language Regression

In [None]:
#import dataframes
df_final = pd.read_csv("/content/drive/MyDrive/tiktok_model/df_tabular.csv")

In [None]:
df_final = df_final.groupby('language_cat').filter(lambda x : len(x)>2)

In [None]:
X, y = df_final['language_cat'], df_final["average_engagement_impressions"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X, test_size=0.1, random_state=1)

In [None]:
X_train_split, X_eval_split, y_train_split, y_eval_split = train_test_split(X_train, y_train, stratify=X_train, test_size=0.2, random_state=42)

CatBoost

In [None]:
catmodel = CatBoostRegressor()

In [None]:
catmodel.fit(pd.DataFrame(X_train_split), y_train_split, eval_set=(pd.DataFrame(X_eval_split), y_eval_split), use_best_model=True, verbose=False, cat_features=['language_cat'])

In [None]:
preds = catmodel.predict(pd.DataFrame(X_test))

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 1
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

LightGBM

In [None]:
model = lgb.LGBMRegressor()

In [None]:
model.fit(pd.DataFrame(X_train_split), y_train_split, eval_set=(pd.DataFrame(X_eval_split), y_eval_split), categorical_feature=['language_cat'])

In [None]:
preds = model.predict(pd.DataFrame(X_test))

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 1
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

# Channel age

In [None]:
#import dataframes
df_final = pd.read_csv("/content/drive/MyDrive/tiktok_model/df_tabular.csv")

In [None]:
X, y = df_final['channel_age'], df_final["average_engagement_impressions"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.1, random_state=1)

In [None]:
X_train_split, X_eval_split, y_train_split, y_eval_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

CatBoost

In [None]:
catmodel_age = CatBoostRegressor()

In [None]:
catmodel_age.fit(pd.DataFrame(X_train_split), y_train_split, eval_set=(pd.DataFrame(X_eval_split), y_eval_split), use_best_model=True, verbose=False)

In [None]:
preds = catmodel_age.predict(pd.DataFrame(X_test))

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 1
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

LightGBM

In [None]:
model = lgb.LGBMRegressor()

In [None]:
model.fit(pd.DataFrame(X_train_split), y_train_split, eval_set=(pd.DataFrame(X_eval_split), y_eval_split), verbose=False)

In [None]:
preds = model.predict(pd.DataFrame(X_test))

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 1
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

# Channel Topic

In [None]:
#import dataframes
df_final = pd.read_csv("/content/drive/MyDrive/tiktok_model/df_tabular.csv")

In [None]:
X, y = df_final['topic'], df_final["average_engagement_impressions"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X, test_size=0.1, random_state=1)

In [None]:
X_train_split, X_eval_split, y_train_split, y_eval_split = train_test_split(X_train, y_train, stratify=X_train, test_size=0.2, random_state=42)

CatBoost

In [None]:
catmodel_topic = CatBoostRegressor()

In [None]:
catmodel_topic.fit(pd.DataFrame(X_train_split), y_train_split, eval_set=(pd.DataFrame(X_eval_split), y_eval_split), use_best_model=True, verbose=False, cat_features=['topic'])

In [None]:
preds = catmodel_topic.predict(pd.DataFrame(X_test))

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 1
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

LightGBM

In [None]:
model = lgb.LGBMRegressor()

In [None]:
model.fit(pd.DataFrame(X_train_split), y_train_split, eval_set=(pd.DataFrame(X_eval_split), y_eval_split), verbose=False, categorical_feature=['topic'])

In [None]:
preds = model.predict(pd.DataFrame(X_test))

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 1
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

# Post frequency

In [None]:
#import dataframes
df_final = pd.read_csv("/content/drive/MyDrive/tiktok_model/df_tabular.csv")

In [None]:
X, y = df_final['post_frequency'], df_final["average_engagement_impressions"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.1, random_state=1)

In [None]:
X_train_split, X_eval_split, y_train_split, y_eval_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

CatBoost

In [None]:
catmodel_frequency = CatBoostRegressor()

In [None]:
catmodel_frequency.fit(pd.DataFrame(X_train_split), y_train_split, eval_set=(pd.DataFrame(X_eval_split), y_eval_split), use_best_model=True, verbose=False)

In [None]:
preds = catmodel_frequency.predict(pd.DataFrame(X_test))

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 1
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

LightGBM

In [None]:
model = lgb.LGBMRegressor()

In [None]:
model.fit(pd.DataFrame(X_train_split), y_train_split, eval_set=(pd.DataFrame(X_eval_split), y_eval_split), verbose=False)

In [None]:
preds = model.predict(pd.DataFrame(X_test))

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 1
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

# Verification Status

In [None]:
#import dataframes
df_final = pd.read_csv("/content/drive/MyDrive/tiktok_model/df_tabular.csv")

In [None]:
X, y = df_final['verified'], df_final["average_engagement_impressions"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.1, random_state=1)

In [None]:
X_train_split, X_eval_split, y_train_split, y_eval_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

CatBoost

In [None]:
catmodel_verified = CatBoostRegressor()

In [None]:
catmodel_verified.fit(pd.DataFrame(X_train_split), y_train_split, eval_set=(pd.DataFrame(X_eval_split), y_eval_split), use_best_model=True, verbose=False, cat_features=['verified'])

In [None]:
preds = catmodel_verified.predict(pd.DataFrame(X_test))

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 1
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

LightGBM

In [None]:
model = lgb.LGBMRegressor()

In [None]:
model.fit(pd.DataFrame(X_train_split), y_train_split, eval_set=(pd.DataFrame(X_eval_split), y_eval_split), verbose=False, categorical_feature=['verified'])

In [None]:
preds = model.predict(pd.DataFrame(X_test))

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 1
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

# Combined CatBoost Model

In [None]:
#import dataframes
df_final = pd.read_csv("/content/drive/MyDrive/tiktok_model/df_tabular.csv")

In [None]:
df_final = df_final.groupby('language_cat').filter(lambda x : len(x)>3)

In [None]:
#df_final['language_cat']= df_final["language_cat"].astype(str)

In [None]:
X, y = df_final[['language_cat', 'topic', 'verified', 'post_frequency', 'channel_age']], df_final["average_engagement_impressions"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['language_cat'], test_size=0.1, random_state=1)

In [None]:
X_train_split, X_eval_split, y_train_split, y_eval_split = train_test_split(X_train, y_train, stratify=X_train['language_cat'], test_size=0.2, random_state=42)

CatBoost

In [None]:
cat_ensembled = CatBoostRegressor()

In [None]:
cat_ensembled.fit(pd.DataFrame(X_train_split), y_train_split,  eval_set=(pd.DataFrame(X_eval_split), y_eval_split), verbose=False, use_best_model=True, cat_features=['language_cat', 'topic', 'verified'])

In [None]:
preds = cat_ensembled.predict(X_test)

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 5
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

LightGBM

In [None]:
model = lgb.LGBMRegressor()

In [None]:
model.fit(pd.DataFrame(X_train_split), y_train_split, eval_set=(pd.DataFrame(X_eval_split), y_eval_split), verbose=False, categorical_feature=['language_cat', 'topic', 'verified'])

In [None]:
preds = model.predict(pd.DataFrame(X_test))

In [None]:
print("MAE: ", mean_absolute_error(y_test, preds))
print("MAPE: ", mean_absolute_percentage_error(y_test, preds))
print("MSE: ", mean_squared_error(y_test, preds))
print("RMSE: ", mean_squared_error(y_test, preds, squared=False))

#calculate pearson correlation
print("Pearson :", np.corrcoef(preds, y_test)[0][1])

#Spearman correlation
res, p_value = stats.spearmanr(preds, y_test)
print("Spearman :", res, p_value)


#adjusted r2
n_predictors = 5
print("Adj. R2: ",adjusted_r2(y_test, preds, n_predictors))

In [None]:
sns.jointplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel('Ground Truth [%]')
plt.ylabel('Predictions [%]')

# SHAP Analysis

In [None]:
explainer = shap.Explainer(cat_ensembled)
shap_values = explainer(pd.DataFrame(X_test))

In [None]:
# create a dependence scatter plot to show the effect of a single feature across the whole dataset
fig,ax = plt.subplots(figsize=(12,7))
shap.plots.scatter(shap_values[:,"verified"],  alpha=0.3, ax=ax)

In [None]:
# create a dependence scatter plot to show the effect of a single feature across the whole dataset
fig,ax = plt.subplots(figsize=(12,7))
shap.plots.scatter(shap_values[:,"channel_age"],  alpha=0.3, ax=ax)

In [None]:
# create a dependence scatter plot to show the effect of a single feature across the whole dataset
fig,ax = plt.subplots(figsize=(12,7))

shap.plots.scatter(shap_values[:,"post_frequency"],  alpha=0.3, ax=ax, xmax=0.2)

In [None]:
df = pd.DataFrame({'topic': shap_values.data[:,1].ravel(), 'shap_value': shap_values.values[:,1].ravel()})

In [None]:
df = df.groupby('topic').mean().sort_values(by='shap_value', ascending=False)

In [None]:
print(df.to_string())

In [None]:
df = pd.DataFrame({'language_cat': shap_values.data[:,0].ravel(), 'shap_value': shap_values.values[:,0].ravel()})

In [None]:
df = df.groupby('language_cat').mean().sort_values(by='shap_value', ascending=False)

In [None]:
df2 = df_final.drop_duplicates(subset=['language_cat'])

In [None]:
df = pd.merge(df, df2[['language_cat', 'language']], on='language_cat', how='left')

In [None]:
# Lookup table from language code to lower-case English language name.
# It is used below to replace the codes in the `language` column of the
# per-language SHAP table with readable names.
LANGUAGES = {
    'af': 'afrikaans',
    'sq': 'albanian',
    'am': 'amharic',
    'ar': 'arabic',
    'hy': 'armenian',
    'az': 'azerbaijani',
    'eu': 'basque',
    'be': 'belarusian',
    'bn': 'bengali',
    'bs': 'bosnian',
    'bg': 'bulgarian',
    'ca': 'catalan',
    'ceb': 'cebuano',
    'ny': 'chichewa',
    'zh-cn': 'chinese (simplified)',
    'zh-tw': 'chinese (traditional)',
    'co': 'corsican',
    'hr': 'croatian',
    'cs': 'czech',
    'da': 'danish',
    'nl': 'dutch',
    'en': 'english',
    'eo': 'esperanto',
    'et': 'estonian',
    'tl': 'filipino',
    'fi': 'finnish',
    'fr': 'french',
    'fy': 'frisian',
    'gl': 'galician',
    'ka': 'georgian',
    'de': 'german',
    'el': 'greek',
    'gu': 'gujarati',
    'ht': 'haitian creole',
    'ha': 'hausa',
    'haw': 'hawaiian',
    'iw': 'hebrew',
    'he': 'hebrew',
    'hi': 'hindi',
    'hmn': 'hmong',
    'hu': 'hungarian',
    'is': 'icelandic',
    'ig': 'igbo',
    'id': 'indonesian',
    'ga': 'irish',
    'it': 'italian',
    'ja': 'japanese',
    'jw': 'javanese',
    'kn': 'kannada',
    'kk': 'kazakh',
    'km': 'khmer',
    'ko': 'korean',
    'ku': 'kurdish (kurmanji)',
    'ky': 'kyrgyz',
    'lo': 'lao',
    'la': 'latin',
    'lv': 'latvian',
    'lt': 'lithuanian',
    'lb': 'luxembourgish',
    'mk': 'macedonian',
    'mg': 'malagasy',
    'ms': 'malay',
    'ml': 'malayalam',
    'mt': 'maltese',
    'mi': 'maori',
    'mr': 'marathi',
    'mn': 'mongolian',
    'my': 'myanmar (burmese)',
    'ne': 'nepali',
    'no': 'norwegian',
    'or': 'odia',
    'ps': 'pashto',
    'fa': 'persian',
    'pl': 'polish',
    'pt': 'portuguese',
    'pa': 'punjabi',
    'ro': 'romanian',
    'ru': 'russian',
    'sm': 'samoan',
    'gd': 'scots gaelic',
    'sr': 'serbian',
    'st': 'sesotho',
    'sn': 'shona',
    'sd': 'sindhi',
    'si': 'sinhala',
    'sk': 'slovak',
    'sl': 'slovenian',
    'so': 'somali',
    'es': 'spanish',
    'su': 'sundanese',
    'sw': 'swahili',
    'sv': 'swedish',
    'tg': 'tajik',
    'ta': 'tamil',
    'te': 'telugu',
    'th': 'thai',
    'tr': 'turkish',
    'uk': 'ukrainian',
    'ur': 'urdu',
    'ug': 'uyghur',
    'uz': 'uzbek',
    'vi': 'vietnamese',
    'cy': 'welsh',
    'xh': 'xhosa',
    'yi': 'yiddish',
    'yo': 'yoruba',
    'zu': 'zulu',
}

In [None]:
df.language = df.language.map(LANGUAGES)

In [None]:
print(df.to_string())

In [None]:
# Beeswarm plot of the SHAP values of the combined CatBoost model
shap.plots.beeswarm(shap_values)

In [None]:
# Bar plot of the same SHAP values
shap.plots.bar(shap_values)

In [None]:
# visualize the first prediction's explanation

shap.plots.waterfall(shap_values[0])