In [None]:
# IMPORTS
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sqlite3
import torch
import torch.optim as optim
import torch.nn as nn

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.utils.data import TensorDataset, Dataset, DataLoader
!pip install transformers
from transformers import BertModel, BertTokenizer
from transformers import AdamW

In [2]:
# Mount Google Drive at /content/drive so files stored there can be opened by
# path; the google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


In [4]:
# ---------------  TOKENIZATION ----------------
# Load the pretrained tokenizer published under the 'bert-base-cased' name.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [5]:
# --------------- BERT -------------
# Load the pretrained 'bert-base-cased' weights into a BertModel.
model = BertModel.from_pretrained('bert-base-cased')
# Move the model to the selected device; as the last expression of the cell
# this also displays the module structure.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [6]:
# ---------------  EXAMPLE DATA ----------------
# Read a small sample of titles from the SQLite preview database.
db_file = "/content/drive/My Drive/Colab Notebooks/Title2Playlist/title_dataset-PREVIEW.db"
# sqlite3.connect() is the documented way to open a connection.  The
# connection and cursor stay open at module level; this cell does not close
# them.
sql_conn = sqlite3.connect(db_file)
cursor = sql_conn.cursor()

index = 0  # OFFSET: number of rows to skip
lim = 3    # LIMIT: number of rows to fetch
data = cursor.execute("SELECT title FROM titles LIMIT ? OFFSET ?", (lim, index))
# Every fetched row is a 1-tuple; keep only its single column so `titles` is
# always a 1-D array.  (Calling squeeze() on the 2-D array of rows would
# collapse a single-row result into a 0-d array.)
titles = np.asarray([row[0] for row in data.fetchall()])
print(titles)

['Throwbacks' 'Awesome Playlist' 'korean ']


In [7]:
def create_embeddings(input, max_seq_length):
  """Embed each title with BERT and return one embedding per title.

  Relies on the module-level `tokenizer`, `model` and `device`.

  Args:
    input: Titles to embed. A NumPy array of strings, or anything NumPy can
      turn into one (a list of strings, or a single string).
    max_seq_length: Token length every title is padded or truncated to.

  Returns:
    A list with one tensor per title, in input order. Each tensor is the
    second element of the model output for a batch holding that single title
    (BERT's pooled output, per the transformers documentation). The list is
    empty when there are no titles.

  Note:
    The forward passes run with autograd enabled. If the embeddings are only
    used as fixed features, call this function inside `torch.no_grad()` to
    avoid keeping the computation graph alive.
  """
  # atleast_1d turns a single title (str or 0-d array) into a one-item batch.
  titles = np.atleast_1d(input).tolist()
  if not titles:
    # Nothing to embed: skip the tokenizer and the model entirely.
    return []

  tokenized_input = tokenizer(text=titles,
                              add_special_tokens=True,
                              padding='max_length',
                              # Without truncation a title longer than
                              # max_seq_length keeps its full length, the
                              # batch becomes ragged and building the
                              # PyTorch tensors fails.
                              truncation=True,
                              max_length=max_seq_length,
                              return_tensors='pt',  # PyTorch tensors
                              return_attention_mask=True)

  input_ids = tokenized_input['input_ids']
  att_masks = tokenized_input['attention_mask']

  # --------------- BERT OUTPUT ---------------
  output_embeddings = []
  for title_ids, title_mask in zip(input_ids, att_masks):
    # unsqueeze(0) adds the batch dimension: one title per forward pass.
    model_output = model(input_ids=title_ids.unsqueeze(0).to(device),
                         attention_mask=title_mask.unsqueeze(0).to(device))
    output_embeddings.append(model_output[1])

  return output_embeddings

In [8]:
# Embed the sample titles with a sequence length of 30 tokens and show the
# shape of the first returned embedding.
output = create_embeddings(titles, max_seq_length=30)
print(output[0].shape)

torch.Size([1, 768])
