In [1]:
# install bert from huggingface

!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 20.8MB/s eta 0:00:01[K     |▉                               | 20kB 5.5MB/s eta 0:00:01[K     |█▎                              | 30kB 7.3MB/s eta 0:00:01[K     |█▊                              | 40kB 7.3MB/s eta 0:00:01[K     |██▏                             | 51kB 6.5MB/s eta 0:00:01[K     |██▋                             | 61kB 7.0MB/s eta 0:00:01[K     |███                             | 71kB 7.5MB/s eta 0:00:01[K     |███▍                            | 81kB 8.1MB/s eta 0:00:01[K     |███▉                            | 92kB 7.9MB/s eta 0:00:01[K     |████▎                           | 102kB 8.3MB/s eta 0:00:01[K     |████▊                           | 112kB 8.3MB/s eta 0:00:01[K     |█████▏                          | 122kB 8.3M

In [2]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb

In [3]:
# mount to google drive
from google.colab import drive
drive.mount("/content/gdrive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [4]:
# read data
data = pd.read_csv('/content/gdrive/My Drive/personal/movie_train_data.csv', encoding='utf-8')

In [6]:
# convert 'overview' to lower case
data['overview'] = data['overview'].apply(lambda x: x.lower())
data.head()

Unnamed: 0,original_title,overview,one_genre,genre_idx
0,Toy Story,"led by woody, andy's toys live happily in his ...",Comedy,4
1,Grumpier Old Men,a family wedding reignites the ancient feud be...,Comedy,4
2,Waiting to Exhale,"cheated on, mistreated and stepped on, the wom...",Comedy,4
3,Father of the Bride Part II,just when george banks has recovered from his ...,Comedy,4
4,Heat,"obsessive master thief, neil mccauley leads a ...",Action,2


In [7]:
# get model
# used DistilBERT due to limited resources

model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [8]:
# tokenize data
tokenized = data['overview'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

Token indices sequence length is longer than the specified maximum sequence length for this model (551 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (647 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (600 > 512). Running this sequence through the model will result in indexing errors


In [9]:
# pad all tokens with max length 60

def padding(tokens):
    if len(tokens) > 60:
        padded = tokens[:60]
    else:
        pads = [0]*(60-len(tokens))
        padded = tokens.copy()
        padded.extend(pads)
    return padded

In [10]:
# pad tokens and convert to numpy array

padded_tokens = tokenized.apply(lambda x: padding(x))
padded = padded_tokens.tolist()
padded = np.array(padded)
np.array(padded).shape

(39634, 60)

In [11]:
# mask tokens that were padded as 0

attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(39634, 60)

In [12]:
# send model to gpu

device = torch.device("cuda")
model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [13]:
from torch.utils.data import Dataset, DataLoader

class MovieDataset(Dataset):
    def __init__(self, tokens, masks):
        self.tokens = tokens
        self.masks = masks

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        text = self.tokens[idx]
        mask = self.masks[idx]
        return text, mask

In [14]:
# make dataset and dataloader
movie_train_dataset = MovieDataset(padded, attention_mask)
train_loader = DataLoader(movie_train_dataset, batch_size=2, shuffle=False, num_workers=2)

In [15]:
# get embeddings from distilbert

results = []
for text, mask in train_loader:
  input, masks = text.to(device), mask.to(device)
  with torch.no_grad():
    last_hidden_states = model(input, attention_mask=masks)
    results.append(last_hidden_states[0])

In [16]:
# bring embeddings to cpu
# convert tensor embeddings into numpy arrays

embeddings = []
embeddings_cpu = []
for t in results:
  for u in t:
    embeddings.append(u)

for i in embeddings:
  embeddings_cpu.append(i[0,:].cpu().numpy())

In [17]:
# make train data for regresssion model

train_data = pd.DataFrame()
train_data['features'] = embeddings_cpu
train_data['features'] = train_data['features'].apply(lambda x: x.tolist())
train_data['label'] = data['genre_idx']

In [18]:
train_data.head()

Unnamed: 0,features,label
0,"[-0.1980847418308258, 0.1362394094467163, 0.30...",4
1,"[-0.15464085340499878, -0.013341668993234634, ...",4
2,"[0.18121884763240814, 0.2435857057571411, 0.33...",4
3,"[-0.09777358174324036, 0.036291927099227905, 0...",4
4,"[-0.20299874246120453, 0.19659163057804108, -0...",2


In [19]:
# save data
train_data.to_csv('/content/gdrive/My Drive/personal/train_data.csv')