In [1]:
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
import torch
from torch.nn import functional as F
import pandas as pd

In [10]:
model_name = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForMaskedLM.from_pretrained(model_name)

In [11]:
essays = pd.read_csv('swiss.csv')
essays.head()

Unnamed: 0,essay
0,9gag - number of posts by 9gag user why 9gag?...
1,social media spring 2013 project 1 - mining th...
2,social media: individual work the long tail of...
3,"youtube user information analysis march 18, 20..."
4,social medias report on long tail effect intro...


In [12]:
text = essays.values.tolist()
text = list(map(lambda x: x[0], text))

Check if all tokens in the essay texts are accepted by the model. If not, extend the model's vocabulary

In [13]:
# Save initial embeddings to check if changed later
initial_embeddings = model.distilbert.embeddings.word_embeddings

# Save vocabulary to compare with the essay vocabulary
tokenizer.save_vocabulary('vocab.txt')

essay_tokens = []

for t in text:
    tokens = t.split()
    for token in tokens:
        token = token.lower()
        if token not in essay_tokens:
            essay_tokens.append(token)

essay_tokens.sort()
print(f'tokens in essay texts: {len(essay_tokens)}')

missing_tokens = []

with open('vocab.txt', 'r') as vocab:
    model_tokens = vocab.readlines()
    model_tokens = list(map(lambda x: x[:-1], model_tokens))

    for token in essay_tokens:
        if token not in model_tokens:
            missing_tokens.append(token)

print(f"missing_tokens: {len(missing_tokens)}")

tokens in essay texts: 5467
missing_tokens: 2854


In [14]:
def clean_text(text):
    text = text.lower()
    tokens = text.split()
    # Naive approach: just remove whatever token is missing (for now)
    tokens = list(filter(lambda token: True if token not in missing_tokens else False, tokens))
    text = ' '.join(tokens)
    return text

text = list(map(lambda txt: clean_text(txt), text))
text[5]

'project i mining the social web site chosen is is a chinese website for girls and women to share their experiences on provides most kinds of cosmetic appeared in market and cosmetics are divided in as the figure this cosmetic distribution will attract large numbers of customers to search for general as well as specific comments about this approach attracts the site visitors at all steps in finding and sharing comments about their interest for when visitors want to search for women skin care they can also search for specific types as facial or eye the same with sharing their comments on specific traffic from more specific search generally containing more than 3 is commonly referred to as long tail this traffic is highly desirable due to the increasing amount of long tail searches by consumers and high conversion rates commonly associated with these the success of the effective site is typically measured by the increase in daily reach and daily traffic rank trend the following illustrat

In [15]:
text[0]

'- number of posts by user why for this i chose a social media platform launched in 2008 with a similar goal as share images that users all over the world the community of users then vote and can share the the website became very and had for instance 1 billion monthly page views as of december a lot of students use this website as a casual and i felt it would be interesting to have some statistics about the long tail hypothesis is one of those website with a strong feeling of community and explain briefly how it there are three steps to the life of an image on when you an it gets on the vote everybody can vote for the image they to there are a lot of pictures of the vote but if it gets enough the image will be promoted to the once the procedure is and if the image became successful it will go to the most popular the hot which is the of one of the main goal for the people who post there is to get on the hot who could be see as a fantastic reward with almost a mystical feeling for certai

In [16]:
# From the dataset manipulation notebook, the max length is 3241
# Bert expects 512 though
inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs

{'input_ids': tensor([[ 101, 1011, 2193,  ...,    0,    0,    0],
        [ 101, 2591, 2865,  ..., 3643, 1997,  102],
        [ 101, 2591, 3265,  ..., 2003, 3491,  102],
        ...,
        [ 101, 1996, 2146,  ..., 2012, 2107,  102],
        [ 101, 2591, 2865,  ..., 2145, 3961,  102],
        [ 101, 1011, 2622,  ..., 2312, 9785,  102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [17]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [18]:
MASK_PROBABILITY = 0.1

# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)

# create mask array
# NOTE tokens 101 and 102 ar special (CLS and SEP), and 0 is a padding so we don't mask them
mask_arr = (rand < MASK_PROBABILITY) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
mask_arr

tensor([[False,  True,  True,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [19]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

selection[0]

[1,
 2,
 11,
 25,
 38,
 53,
 64,
 94,
 97,
 98,
 100,
 104,
 111,
 121,
 127,
 130,
 141,
 146,
 155,
 171,
 178,
 197,
 198,
 207,
 216,
 223,
 227,
 235,
 250,
 265,
 271,
 273,
 281,
 291,
 299,
 305,
 312,
 314,
 316]

In [20]:
# Apply masks (token 103) where the random number was below the probability
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = tokenizer.mask_token_id

inputs.input_ids

tensor([[ 101,  103,  103,  ...,    0,    0,    0],
        [ 101, 2591,  103,  ..., 3643, 1997,  102],
        [ 101, 2591, 3265,  ..., 2003, 3491,  102],
        ...,
        [ 101, 1996, 2146,  ..., 2012, 2107,  102],
        [ 101,  103, 2865,  ..., 2145, 3961,  102],
        [ 101, 1011, 2622,  ..., 2312, 9785,  102]])

In [21]:
# Create a PyTorch dataset to feed the model
class AESDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return { 
            key: torch.tensor(val[idx]) for key, val in self.encodings.items()
        }

    def __len__(self):
        return len(self.encodings.input_ids)

dataset = AESDataset(inputs)

In [22]:
# Initialize dataloader used during training
BATCH_SIZE = 64
loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [23]:
# Enable CUDA if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)
print('using:', device)

using: cpu


In [24]:
# Activate training mode for the model
from transformers import AdamW # Using Weighted Adam optimizer

LEARNING_RATE = 5e-5

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=LEARNING_RATE)

In [25]:
# Train
from tqdm import tqdm  # tqdm provides a progress bar for training

EPOCHS = 10

for epoch in range(EPOCHS):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  key: torch.tensor(val[idx]) for key, val in self.encodings.items()
  0%|          | 0/1 [03:35<?, ?it/s]