In [None]:
# Add word2vec to python path.
# The membership check keeps the cell idempotent: re-running it does not keep
# appending duplicate entries to sys.path.
import sys

word2vecPath = "external/word2vec"
if word2vecPath not in sys.path:
    sys.path.append(word2vecPath)

In [None]:
# Notebook configuration, grouped as: device, data / pretrained artifacts,
# optimisation, and output / checkpointing.
config = dict(
    # Request CUDA; the device cell falls back to CPU when it is unavailable.
    use_cuda=True,

    data_dir="data/",
    # Number of rows to keep; a non-positive value keeps the whole dataset.
    data_size=-1,
    pre_trained_vocab_path="models/skipgram/vocab.pt",
    pre_train_embedding_path="models/skipgram/best_val_model_4.27.pt",

    learning_rate=0.00025,
    num_epochs=64,

    model_dir="models/title-predictor/",
    # A checkpoint is written every `checkpoint_freq` epochs.
    checkpoint_freq=4,
)

In [None]:
import copy
import os
import time

# Data Processing.
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# PyTorch.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer

# Word2Vec.
from external.word2vec.utils.helper import (
    load_vocab,
)

# Title Predictor.
from title_predictor_model import TitlePredictor

# Plotting.
import matplotlib.pyplot as plt

In [None]:
# Use the "Solarize_Light2" matplotlib style sheet for every figure in this notebook.
plt.style.use("Solarize_Light2")

In [None]:
# Select the compute device: CUDA only when it is both requested in the config
# and actually available on this machine; otherwise the CPU.
wantsCuda = config["use_cuda"] and torch.cuda.is_available()
device = torch.device("cuda" if wantsCuda else "cpu")
device

In [None]:
# Make sure the output directory for checkpoints and artifacts exists.
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(config["model_dir"], exist_ok=True)

# Load Data

In [None]:
# Read the video table from the configured data directory instead of a
# hard-coded path, so changing config["data_dir"] actually takes effect.
videoDf = pd.read_csv(os.path.join(config["data_dir"], "usvideos_rm_dup.csv"))
# Drop only the rows that miss a title or a view count (the two columns the
# cells in this notebook read). A bare dropna() would also throw away rows
# whose only missing value sits in some other column.
videoDf = videoDf.dropna(subset=["title", "views"])
videoDf.head()

In [None]:
# Dataset of YouTube video titles paired with their view counts.
class YouTubeTitleViewDataset(Dataset):
    """Map-style dataset that yields ``(title, views)`` pairs from a DataFrame.

    The rows are shuffled once at construction time and, optionally, only the
    first ``size`` shuffled rows are kept.
    """

    def __init__(self, df: pd.DataFrame, size = -1, seed = None):
        """Shuffle ``df`` and optionally keep a random subset of it.

        Args:
            df: Frame with at least a ``title`` and a ``views`` column.
            size: Number of rows to keep after shuffling; any value <= 0
                keeps every row.
            seed: Optional ``random_state`` for the shuffle. ``None`` (the
                default) gives a different order on every construction.

        Raises:
            KeyError: If ``df`` lacks a ``title`` or ``views`` column.
        """
        missing = [col for col in ("title", "views") if col not in df.columns]
        if missing:
            raise KeyError(f"DataFrame is missing required column(s): {missing}")
        # sample(frac=1) returns a shuffled copy; the caller's frame is untouched.
        shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
        self.titleDf = shuffled.iloc[:size] if size > 0 else shuffled

    def __len__(self):
        """Return the number of (title, views) samples."""
        return len(self.titleDf)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as ``(str(title), views)``.

        Positional (``iloc``) access is used so the lookup does not depend on
        the frame's index labels; an out-of-range ``idx`` raises IndexError.
        """
        title = self.titleDf["title"].iloc[idx]
        view = self.titleDf["views"].iloc[idx]
        return str(title), view

    def __iter__(self):
        """Yield every sample in stored (shuffled) order."""
        for i in range(len(self)):
            yield self[i]

In [None]:
# Create the shuffled title/view dataset. A non-positive config["data_size"]
# (the default is -1) keeps every row of videoDf.
titleDataset = YouTubeTitleViewDataset(videoDf, size = config["data_size"])
len(titleDataset)

In [None]:
# Split the samples into training (80%) and validation (20%) sets.
# NOTE(review): no random_state is passed, so the split changes on every run.
trainSet, valSet = train_test_split(titleDataset, test_size=0.2)
len(trainSet), len(valSet)

In [None]:
# Peek at the first (title, views) sample of the shuffled dataset.
titleDataset[0]

# Load Vocab and Tokenizer

In [None]:
# Load the pre-trained vocabulary named in the config.
if not config["pre_trained_vocab_path"]:
    # The following cells all need `vocab`; stop here with a clear message
    # rather than failing later with a NameError.
    raise ValueError("No vocab path provided.")
vocab: Vocab = load_vocab(config["pre_trained_vocab_path"])
vocabSize = len(vocab.get_stoi())
print(f"Pretrained vocab size: {vocabSize}")

In [None]:
# Add SOS and EOS tokens.
# Only append tokens that are not present yet: appending an existing token
# raises, which would make this cell fail when it is run a second time.
for specialToken in ("<SOS>", "<EOS>"):
    if specialToken not in vocab:
        vocab.append_token(specialToken)
vocabSize = len(vocab.get_stoi())
print(f"Vocab size: {vocabSize}")

In [None]:
# Get torchtext's "basic_english" tokenizer; it is used below to split raw
# titles into tokens before the vocabulary lookup.
tokenizer = get_tokenizer("basic_english", language="en")
tokenizer

In [None]:
# Sanity check: push one sample through the tokenize -> ids pipeline.
sentence, view = titleDataset[0]
print(sentence)
print(view)
# Tokenize the title and wrap it in the sentence boundary markers.
tokenizedSentence: list = ["<SOS>", *tokenizer(sentence), "<EOS>"]
print(tokenizedSentence)
# Look up the vocabulary id of every token.
print(vocab(tokenizedSentence))

# Model

In [None]:
# Load the pretrained embedding matrix.
# map_location="cpu" lets a checkpoint that was saved from a GPU run be read
# on a machine without CUDA (the device cell allows a CPU fallback). The
# embedding layer is passed to the model, which is moved to `device` later.
embeddingWeight = torch.load(
    config["pre_train_embedding_path"], map_location="cpu"
)["embeddings.weight"]
print(f"Embedding weight shape: {embeddingWeight.shape}")
# Create an embedding layer sized for the extended vocabulary (pretrained
# tokens plus the <SOS>/<EOS> tokens appended above).
embedding = nn.Embedding(vocabSize, embeddingWeight.shape[1])
print(f"Embedding layer shape: {embedding.weight.shape}")
# Copy the pretrained rows into the front of the layer; rows for the newly
# appended tokens keep nn.Embedding's default initialisation.
numPretrained = embeddingWeight.shape[0]
with torch.no_grad():
    embedding.weight[:numPretrained].copy_(embeddingWeight)
print(f"Pretrained embedding loaded. Embedding layer shape: {embedding.weight.shape}")

# Training

In [None]:
# Build the title predictor around the pretrained embedding layer and place
# it on the selected device in one expression.
model = TitlePredictor(
    vocabSize,
    embeddingSize=embeddingWeight.shape[1],
    hiddenSize=256,
    numLayers=2,
    dropout=0.5,
    embedding=embedding,
).to(device)

In [None]:
# Setup loss function and optimizer.
# Mean squared error; the training loop applies it to the natural log of the
# view count, not to the raw count.
lossFunction = nn.MSELoss()
# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])

In [None]:
def encodeTitle(title):
    """Encode a raw title as a [sentence length, 1] tensor of token ids on `device`.

    The title is tokenized, wrapped in <SOS>/<EOS> and mapped through `vocab`.
    """
    tokens = ["<SOS>", *tokenizer(title), "<EOS>"]
    return torch.tensor(vocab(tokens)).to(device).unsqueeze(1)


def logViewTarget(view):
    """Return the natural log of a view count as a float32 [1, 1] tensor on `device`."""
    return torch.log(torch.tensor([view]).to(device)).float().unsqueeze(0)


trainLosses = []
valLosses = []
startTime = time.time()
bestValLoss = float("inf")
bestValModel = None

# Train the model, one title per optimisation step.
for epoch in range(config["num_epochs"]):
    print(f"Epoch: {epoch+1}/{config['num_epochs']}")
    trainLossVal = 0
    valLossVal = 0

    # Training pass: switch dropout (and any other train-only layers) on.
    model.train()
    for i, (title, view) in enumerate(trainSet):
        # Forward pass.
        prediction = model(encodeTitle(title))
        # Compute loss against log(views).
        trainLoss = lossFunction(prediction, logViewTarget(view))
        # Backward pass.
        optimizer.zero_grad()
        trainLoss.backward()
        # Update parameters.
        optimizer.step()
        # Accumulate the epoch's training loss.
        trainLossVal += trainLoss.item()

        print(f"Sentence: {i+1}/{len(trainSet)}", end="\r")

    # Validation pass: eval mode disables dropout so the validation loss is
    # deterministic and measures the full network.
    model.eval()
    with torch.no_grad():
        for title, view in valSet:
            prediction = model(encodeTitle(title))
            # Same float32 target as in training, so both losses are comparable.
            valLoss = lossFunction(prediction, logViewTarget(view))
            valLossVal += valLoss.item()

    avgTrainLoss = trainLossVal/len(trainSet)
    avgValLoss = valLossVal/len(valSet)

    # Print loss.
    print(f"Epoch: {epoch+1}/{config['num_epochs']}, Train Loss: {avgTrainLoss:.4f}, Val Loss: {avgValLoss:.4f}")

    trainLosses.append(avgTrainLoss)
    valLosses.append(avgValLoss)

    # Keep the model with the lowest validation loss.
    if avgValLoss < bestValLoss:
        bestValLoss = avgValLoss
        # Snapshot the weights with a deep copy. Assigning `model` itself would
        # only alias the live object, which later epochs keep updating, so the
        # "best" model would silently become the last one.
        bestValModel = copy.deepcopy(model)
        print("Best model saved.")

    # Save a periodic checkpoint with the weights and the constructor
    # arguments needed to rebuild the model.
    if (epoch+1) % config["checkpoint_freq"] == 0:
        checkpoint = {
            "model": model.state_dict(),
            "vocabSize": model.vocabSize,
            "embeddingSize": model.embeddingSize,
            "hiddenSize": model.hiddenSize,
            "numLayers": model.numLayers,
            "dropout": model.dropoutPr,
        }
        torch.save(checkpoint, os.path.join(config["model_dir"], f"checkpoint_{epoch+1}.pt"))
        print(f"Checkpoint saved at epoch {epoch+1}.")

    # Report timing, extrapolating the remaining time from the average epoch.
    currTime = time.time()
    elapsedTime = currTime - startTime
    print(f"Elapsed time: {elapsedTime/60:.2f} minutes.")
    print(f"Average time per epoch: {elapsedTime/(epoch+1)/60:.2f} minutes.")
    print(f"Estimated time remaining: {(config['num_epochs'] - epoch - 1)*elapsedTime/(epoch+1)/60:.2f} minutes.")
        

In [None]:
# Write `bestValModel` to disk: its weights plus the constructor arguments
# needed to rebuild it. The file name carries the best validation loss.
bestCheckpoint = {
    "model": bestValModel.state_dict(),
    "vocabSize": bestValModel.vocabSize,
    "embeddingSize": bestValModel.embeddingSize,
    "hiddenSize": bestValModel.hiddenSize,
    "numLayers": bestValModel.numLayers,
    "dropout": bestValModel.dropoutPr,
}
torch.save(bestCheckpoint, f"{config['model_dir']}/best_val_{bestValLoss:.2f}.pt")
# Store the vocabulary (including the appended <SOS>/<EOS> tokens) next to it.
torch.save(vocab, f"{config['model_dir']}/vocab.pt")
# Store the per-epoch loss history as JSON.
lossHistory = {
    "trainLosses": trainLosses,
    "valLosses": valLosses,
}
with open(f"{config['model_dir']}/loss.json", "w") as f:
    json.dump(lossHistory, f)

# Plot the Loss

In [None]:
# Plot the per-epoch average training and validation loss.
# The explicit figure/axes interface is used, with axis labels and a title so
# the figure can be read on its own.
lossFig, lossAx = plt.subplots()
lossAx.plot(trainLosses, label="Train Loss")
lossAx.plot(valLosses, label="Val Loss")
lossAx.set_xlabel("Epoch")
lossAx.set_ylabel("MSE on log(views)")
lossAx.set_title("Training and validation loss")
lossAx.legend()

In [None]:
# Sort the data by view count, lowest first, so that rows picked at regular
# positions below span the whole range of view counts.
# NOTE: this rebinds videoDf; titleDataset holds its own shuffled copy of the
# rows and is not affected.
videoDf = videoDf.sort_values(by="views", ascending=True)
videoDf.head()

In [None]:
# Get (up to) 100 evenly spaced row positions across the sorted frame.
# np.linspace always returns exactly `numSamples` positions from the first to
# the last row. The previous range(0, n, n // 100) raised for frames with
# fewer than 100 rows (step 0) and returned more than 100 positions whenever
# n was not a multiple of 100.
numSamples = min(100, len(videoDf))
indices = np.linspace(0, len(videoDf) - 1, num=numSamples).round().astype(int).tolist()
# Get the title and view.
titles = videoDf["title"].iloc[indices].tolist()
views = videoDf["views"].iloc[indices].tolist()
len(titles), len(views)

In [None]:
# Predict the views for the sampled titles.
predictedViews = []
# Inference mode: disables dropout so predictions are deterministic and use
# the full network.
model.eval()
with torch.no_grad():
    for i, title in enumerate(titles):
        # Tokenize the title (coerced to str, as the dataset does) and wrap it
        # in the sentence boundary markers.
        tokenizedSentence: list = tokenizer(str(title))
        tokenizedSentence.insert(0, "<SOS>")
        tokenizedSentence.append("<EOS>")
        # Convert the tokens to ids and build the tensor once; re-wrapping an
        # existing tensor in torch.tensor() is redundant and emits a warning.
        tokenizedSentence = torch.tensor(vocab(tokenizedSentence)).to(device)
        # Reshape to [sentence length, 1]
        tokenizedSentence = tokenizedSentence.unsqueeze(1)
        # Forward pass.
        prediction = model(tokenizedSentence)
        # The model is trained on log(views); exponentiate to get a view count.
        predictedView = torch.exp(prediction).item()
        predictedViews.append(predictedView)
        print(f"Sentence: {i+1}/{len(titles)}", end="\r")
len(predictedViews)

In [None]:
# Spot-check one sampled video: its title, actual views and predicted views.
titles[20], views[20], predictedViews[20]

In [None]:
# Convert views and predicted views to log scale (natural log) for plotting.
# NOTE(review): `logVies` looks like a typo for "logViews"; the name is left
# unchanged because the plotting cell below reads it.
logVies = np.log(views)
logPredictedViews = np.log(predictedViews)

In [None]:
# Plot the predicted views and actual views (both on a natural-log scale).
# The figure gets its own name: unpacking into `plt` would replace the pyplot
# module with a Figure object and break every later use of `plt`, including a
# re-run of this cell.
fig, ax = plt.subplots()
ax.plot(logVies, label="Actual Views")
ax.plot(logPredictedViews, label="Predicted Views")
ax.set_xlabel("Sampled video (sorted by actual views)")
ax.set_ylabel("log(views)")
ax.legend()