In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from google.colab import files
import io
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from PIL import Image
import time
import tqdm
import os
import matplotlib.pyplot as plt
import pickle

In [None]:
class YoutubeDataset(Dataset):
    """Dataset of per-video feature vectors paired with log10 view counts.

    Each feature row is the flattened image looked up by ``video_id``, followed
    by optional title features (still a placeholder) and the two metadata
    columns ``period_day`` and ``subscriber_count``.

    Args:
        data: DataFrame with ``video_id``, ``period_day``, ``subscriber_count``
            and ``view_count`` columns.
        image_data: Mapping from video id to an array exposing ``.flatten()``.
            All images are presumably the same size so the rows stack into a
            2-D array -- TODO confirm against how the pickle is produced.
    """

    def __init__(self, data, image_data):
        # One flattened image per row, in the same order as `data`.
        images = np.array(
            [image_data[video_id].flatten() for video_id in data['video_id']]
        )  # (N, HWC)
        titles = None  # here (title features are not implemented yet)
        metadata = data[['period_day', 'subscriber_count']].to_numpy()  # (N, 2)

        # Skip feature groups that are still placeholders: np.concatenate
        # raises on None. Casting to float32 up front matches the dtype that
        # __getitem__ returns and avoids materialising a float64 copy.
        feature_parts = [
            np.asarray(part, dtype=np.float32)
            for part in (images, titles, metadata)
            if part is not None
        ]
        self.x = np.concatenate(feature_parts, axis=1)  # (N, HWC [+ titles] + 2)

        # NOTE(review): a view_count of 0 yields -inf here -- confirm the data
        # never contains zero views.
        self.y = np.log10(data['view_count'].to_numpy())

        print(self.x.shape) # Testing

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for sample ``idx`` as float32 tensors.

        The target is a 0-dim tensor, so the default collate function stacks a
        batch of targets into shape ``(batch,)``.
        """
        x = torch.as_tensor(self.x[idx], dtype=torch.float32)
        # torch.FloatTensor(<numpy scalar>) does not build a tensor from the
        # value; torch.tensor does.
        y = torch.tensor(self.y[idx], dtype=torch.float32)
        return x, y

In [None]:
class Model(nn.Module):
    """Regression network skeleton with training, checkpointing and plotting helpers.

    The layer definitions and the forward pass are still placeholders (marked
    ``# here``); until they are filled in, ``forward`` returns its input
    unchanged and the module owns no parameters.
    """

    # Checkpoint locations, relative to the current working directory.
    BEST_MODEL_PATH = "./best_model/best_model.pt"
    CHECKPOINT_TEMPLATE = "./model/epoch{}_train{:.4f}_valid{:.4f}.pt"

    def __init__(self):
        super(Model, self).__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Per-epoch loss histories; created here so plot() works before training.
        self.train_loss = []
        self.valid_loss = []
        # here
    
    def forward(self, x):
        """Forward pass (placeholder: currently returns ``x`` unchanged)."""
        # here
        return x

    @staticmethod
    def _match_target_shape(output, target):
        """Reshape ``output`` to ``target``'s shape when only the layout differs.

        Prevents MSELoss from silently broadcasting e.g. (B, 1) against (B,)
        into a (B, B) difference matrix.
        """
        if output.shape != target.shape and output.numel() == target.numel():
            return output.reshape(target.shape)
        return output

    def train_(self, epochs, lr, train_loader, valid_loader, save_every):
        """Train with Adam + MSE, validating and checkpointing after every epoch.

        Args:
            epochs: Number of passes over ``train_loader``.
            lr: Learning rate for the Adam optimizer.
            train_loader: Iterable of ``(data, target)`` training batches.
            valid_loader: Iterable of ``(data, target)`` validation batches.
            save_every: Write a periodic checkpoint every ``save_every`` epochs;
                a falsy value (0 / None) disables periodic checkpoints.

        Side effects:
            Fills ``self.train_loss`` / ``self.valid_loss`` with per-epoch MSE,
            writes the best-validation weights to ``BEST_MODEL_PATH`` and
            periodic / final weights under ``./model``.
        """
        # Parameters must live on the same device as the batches moved below.
        self.to(self.device)

        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

        self.train_loss = []
        self.valid_loss = []

        # Lower MSE is better, so start from +inf and keep the minimum.
        best_mse = float("inf")
        best_epoch = -1

        # torch.save does not create missing directories.
        os.makedirs(os.path.dirname(self.BEST_MODEL_PATH), exist_ok=True)
        os.makedirs(os.path.dirname(self.CHECKPOINT_TEMPLATE), exist_ok=True)

        train_start = time.time()

        print("Model will be trained on {}\n".format(self.device))

        epoch = 0  # stays 0 when epochs < 1, which disables the final save
        epoch_loss = valid_mse = float("nan")

        for epoch in range(1, epochs + 1):
            self.train()
            print("[Epoch {:3d} / {}]".format(epoch, epochs))

            epoch_start = time.time()
            epoch_loss = 0.0

            #training
            for data, target in tqdm.tqdm(train_loader, desc="Training"):
                data, target = data.to(self.device), target.to(self.device)
                self.optimizer.zero_grad()
                output = self._match_target_shape(self(data), target)

                loss = self.criterion(output, target)
                loss.backward()
                self.optimizer.step()

                epoch_loss += loss.item()

            epoch_end = time.time()
            m, s = divmod(epoch_end - epoch_start, 60)

            # max(..., 1) avoids dividing by zero on an empty loader.
            epoch_loss /= max(len(train_loader), 1)
            self.train_loss.append(epoch_loss)

            #validation
            # predict() returns NumPy arrays (and handles eval()/no_grad()), so
            # the MSE is computed with NumPy; ravel() aligns (N, 1) with (N,).
            true_y, pred_y = self.predict(valid_loader)
            valid_mse = float(np.mean((np.ravel(pred_y) - np.ravel(true_y)) ** 2))
            self.valid_loss.append(valid_mse)

            print("Train MSE = {:.4f} | Valid MSE = {:.4f}".format(epoch_loss, valid_mse))
            print(f"Train Time: {m:.0f}m {s:.0f}s\n")

            if valid_mse < best_mse:
                print("=> Best Model Updated : Epoch = {}, Valid MSE = {:.4f}\n".format(epoch, valid_mse))
                best_mse = valid_mse
                best_epoch = epoch
                torch.save(self.state_dict(), self.BEST_MODEL_PATH)
            else:
                print()

            if save_every and (epoch % save_every) == 0:
                torch.save(self.state_dict(), self.CHECKPOINT_TEMPLATE.format(epoch, epoch_loss, valid_mse))

        m, s = divmod(time.time() - train_start, 60)
        print("\nTraining Finished...!!")
        print("\nBest Valid MSE : %.2f at epoch %d" % (best_mse, best_epoch))
        print(f"Total Time: {m:.0f}m {s:.0f}s\nModel was trained on {self.device}!")

        # Always keep the weights of the last epoch that actually ran.
        if epoch:
            torch.save(self.state_dict(), self.CHECKPOINT_TEMPLATE.format(epoch, epoch_loss, valid_mse))
    
    def restore(self):
        """Load the best-validation weights written by ``train_``."""
        with open(self.BEST_MODEL_PATH, "rb") as f:
            # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
            state_dict = torch.load(f, map_location=self.device)
        self.load_state_dict(state_dict)

    def predict(self, dataloader):
        """Run the model over ``dataloader`` in eval mode without gradients.

        Returns:
            ``(true_y, pred_y)`` as NumPy arrays: the targets concatenated over
            batches and squeezed, and the raw model outputs concatenated over
            batches.
        """
        self.to(self.device)  # no-op when the model is already there
        with torch.no_grad():
            self.eval()
            true_y = []
            pred_y = []
            for batch_x, batch_y in dataloader:
                pred = self(batch_x.to(self.device))
                true_y.append(batch_y.cpu().numpy())
                pred_y.append(pred.cpu().numpy())
            true_y = np.concatenate(true_y, axis=0).squeeze()
            pred_y = np.concatenate(pred_y, axis=0)
        return true_y, pred_y #numpy array

    def plot(self):
        """Plot per-epoch train/validation MSE, save it to graph.png and show it."""
        # Only loss histories are recorded by train_ (there are no accuracy metrics).
        epochs_axis = np.arange(1, len(self.train_loss) + 1)
        plt.plot(epochs_axis, np.array(self.train_loss), "b", label="Train MSE")
        plt.plot(epochs_axis[:len(self.valid_loss)], np.array(self.valid_loss), "r", label="Valid MSE")
        plt.xlabel("Epoch")
        plt.ylabel("MSE")
        plt.legend()
        plt.savefig("graph.png")
        plt.show()


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Make the project folder on the mounted Drive the working directory, so the
# relative paths used in this notebook resolve there.
%cd /content/drive/MyDrive/AIP

In [None]:
# Hold out 10% of the labelled CSV for validation; the fixed seed keeps the
# split reproducible across runs.
train_data, valid_data = train_test_split(
    pd.read_csv('./train.csv'),
    test_size=0.1,
    random_state=55,
)
test_data = pd.read_csv('./test.csv')

In [None]:
# Load the pickled image data; YoutubeDataset indexes it by video_id.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from
# a trusted source.
with open('./data.pickle', 'rb') as f:
    image_data = pickle.load(f)

In [None]:
# Training hyperparameters
batch_size = 64  # samples per mini-batch for train_loader
epochs = 10  # number of passes over the training set
lr = 1e-5  # learning rate handed to the Adam optimizer in Model.train_

In [None]:
# Wrap each split in a YoutubeDataset (built in train, valid, test order).
train_dataset, valid_dataset, test_dataset = (
    YoutubeDataset(split, image_data)
    for split in (train_data, valid_data, test_data)
)

# Only the training loader shuffles; validation and test go one sample at a time.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=1)
test_loader = DataLoader(test_dataset, batch_size=1)

In [None]:
# Build the network and train it, writing a periodic checkpoint every 10 epochs.
model = Model()
model.train_(
    epochs=epochs,
    lr=lr,
    train_loader=train_loader,
    valid_loader=valid_loader,
    save_every=10,
)

In [None]:
# Draw the model's training curves; plot() also saves the figure to graph.png.
model.plot()