https://medium.com/nlplanet/fine-tuning-distilbert-on-senator-tweets-a6f2425ca50e

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 24.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 29.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [38]:
# distilbert for text classification
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel

In [3]:
# Mount Google Drive at /content/drive (uses the google.colab helper, so this
# cell only works inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [79]:
# Load the tweet CSV from the mounted Drive folder.
# The file is decoded as latin-1; low_memory=False makes pandas infer column
# dtypes from the whole file instead of chunk by chunk.
DATA_PATH = '/content/drive/MyDrive/Twitter/training.1600000.processed.noemoticon.csv'
df = pd.read_csv(DATA_PATH, encoding='latin-1', low_memory=False)

In [80]:
# preview the first rows to check the loaded columns
df.head()

Unnamed: 0,ids,date,user,text,target
0,2265878782,Sun Jun 21 07:29:15 PDT 2009,MeeJong,This morning my daughter asked me if I hate my...,0
1,2205565064,Wed Jun 17 05:00:35 PDT 2009,kathysyahrizal,"I forgot to charge my bb, zzzzz only 35% left",0
2,2002872654,Tue Jun 02 05:22:07 PDT 2009,kgautam,my MBP battery is fluctuating between dead to ...,0
3,1835496893,Mon May 18 06:07:33 PDT 2009,thejanice,i really wish my landlord would call me back. ...,0
4,1573596472,Tue Apr 21 01:55:46 PDT 2009,lucyxechelon,mean blog people,0


In [81]:
# Remap the label value 4 to 1; every other label value is left untouched.
df['target'] = df['target'].replace({4: 1})

In [82]:
# finetune distilbert for text classification
# https://huggingface.co/transformers/model_doc/distilbert.html

# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint;
# the same `tokenizer` object is passed to every tokenisation step below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# attention mask
def attention_masking(tokenized_text):
    """Build a mask for a batch of token-id sequences.

    Every id greater than 0 becomes 1.0 and every other id becomes 0.0.

    Args:
        tokenized_text: iterable of sequences of numeric token ids.

    Returns:
        A list with one list of floats per input sequence.
    """
    return [
        [float(token_id > 0) for token_id in sequence]
        for sequence in tokenized_text
    ]

# tokenize the text
# - .tolist(): hand the tokenizer a plain list of strings
# - padding='max_length': explicit replacement for the deprecated pad_to_max_length=True
# - truncation=True: request truncation to max_length explicitly instead of relying
#   on the tokenizer's implicit fallback (which emits a warning)
tokenized_text = tokenizer.batch_encode_plus(df['text'].tolist(),
                                             add_special_tokens=True,
                                             max_length=128,
                                             padding='max_length',
                                             truncation=True,
                                             return_attention_mask=True,
                                             return_tensors='pt')


In [None]:
# pull the id / mask tensors out of the encoded batch and build the label tensor
# from the dataframe's target column
input_ids, attention_mask = tokenized_text['input_ids'], tokenized_text['attention_mask']
labels = torch.tensor(df['target'].to_numpy())

In [43]:
# split the data into train / val / test
# 20% of all rows are held out as the test set, then 20% of the remainder becomes
# the validation set: roughly 64% train, 16% val, 20% test.
# random_state=42 keeps both splits reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)

In [None]:
# create the dataloaders
class TwitterDataset(Dataset):
    """Dataset that tokenises one tweet per item.

    Args:
        dataframe: pandas DataFrame with a ``text`` and a ``target`` column.
        tokenizer: object exposing ``encode_plus`` (here the DistilBERT tokenizer).
        max_len: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.target
        self.max_len = max_len

    def __len__(self):
        """Number of rows (tweets) in the wrapped dataframe."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the tweet at position ``index``.

        Returns:
            dict with long tensors ``ids``, ``mask`` and ``token_type_ids`` and a
            float scalar tensor ``targets``.
        """
        # Positional lookup (.iloc): the dataframes handed in come out of
        # train_test_split and keep their original, shuffled index labels, so a
        # label lookup with 0..len-1 raises KeyError or returns the wrong row.
        text = str(self.text.iloc[index])
        # collapse every run of whitespace into a single space
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # explicit spelling of the deprecated pad_to_max_length=True
            padding='max_length',
            # request truncation to max_length explicitly
            truncation=True,
            return_token_type_ids=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }
        
# create the dataloaders
def create_data_loader(df, tokenizer, max_len, batch_size):
    """Wrap ``df`` in a TwitterDataset and return a DataLoader over it.

    The loader is built with the given ``batch_size`` and 4 worker processes;
    no shuffle or sampler is configured.
    """
    dataset = TwitterDataset(dataframe=df, tokenizer=tokenizer, max_len=max_len)
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
    return loader
    
# build one loader per split with the same settings (max_len=128, batch_size=16)
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split, tokenizer, 128, 16)
    for split in (train, val, test)
)

# create the model
class DistilBertClass(torch.nn.Module):
    """DistilBERT encoder followed by a two-layer head that emits one logit per example."""

    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return the classifier output.

        The first element of the encoder output is sliced at index 0 along
        dim 1, then passed through Linear -> ReLU -> Dropout -> Linear(768, 1).
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        first_position = encoder_out[0][:, 0]
        features = torch.relu(self.pre_classifier(first_position))
        return self.classifier(self.dropout(features))
    
# instantiate the classifier and place it on the GPU
model = DistilBertClass().to('cuda')

# Adam over every model parameter
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# binary cross-entropy computed on raw logits
loss_fn = nn.BCEWithLogitsLoss().to('cuda')

# train the model
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training pass over ``data_loader`` for a single-logit binary model.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` and
            returning one logit per example, shape ``(batch, 1)``.
        data_loader: iterable of batches with keys ``"ids"``, ``"mask"``, ``"targets"``.
        loss_fn: loss called as ``loss_fn(outputs, targets.unsqueeze(1))``.
        optimizer: optimizer stepped once per batch.
        device: device every batch tensor is moved to.
        scheduler: learning-rate scheduler stepped once per batch, or ``None``
            to train without one.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor
        (correct predictions / ``n_examples``), mean_loss the mean batch loss.
    """
    model = model.train()

    losses = []
    # tensor accumulator so the final .double() also works for an empty loader
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in tqdm(data_loader):
        input_ids = d["ids"].to(device)
        attention_mask = d["mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # With a single logit column, torch.max(outputs, dim=1) always returns
        # index 0. Threshold the logit instead: logit > 0 <=> sigmoid(logit) > 0.5.
        preds = (outputs.squeeze(1) > 0).to(targets.dtype)
        loss = loss_fn(outputs, targets.unsqueeze(1))

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # the scheduler is optional; callers may pass None
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

# evaluate the model
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate a single-logit binary model on ``data_loader`` without gradients.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` and
            returning one logit per example, shape ``(batch, 1)``.
        data_loader: iterable of batches with keys ``"ids"``, ``"mask"``, ``"targets"``.
        loss_fn: loss called as ``loss_fn(outputs, targets.unsqueeze(1))``.
        device: device every batch tensor is moved to.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor
        (correct predictions / ``n_examples``), mean_loss the mean batch loss.
    """
    model = model.eval()

    losses = []
    # tensor accumulator so the final .double() also works for an empty loader
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["ids"].to(device)
            attention_mask = d["mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # With a single logit column, torch.max(outputs, dim=1) always returns
            # index 0. Threshold the logit instead: logit > 0 <=> sigmoid(logit) > 0.5.
            preds = (outputs.squeeze(1) > 0).to(targets.dtype)
            loss = loss_fn(outputs, targets.unsqueeze(1))

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)

# train the model
EPOCHS = 10

history = defaultdict(list)
best_accuracy = 0
best_state_saved = False

# train_epoch() steps the scheduler it is given after each optimizer update.
# Pass a constant-factor schedule (learning rate unchanged) instead of None so
# that call is always valid.
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda step: 1.0)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        'cuda',
        scheduler,
        len(train)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        'cuda',
        len(val)
    )

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # keep the weights of the epoch with the best validation accuracy
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc
        best_state_saved = True

# evaluate the model
# Score the test set with the best-validation checkpoint rather than with
# whatever weights the final epoch left behind.
if best_state_saved:
    model.load_state_dict(torch.load('best_model_state.bin'))

test_acc, _ = eval_model(
    model,
    test_data_loader,
    loss_fn,
    'cuda',
    len(test)
)

In [None]:
# create the dataloader class
class TwitterDataLoader:
    """Iterable wrapper that pairs a TwitterDataset with a batch size.

    Args:
        data: dataframe handed to TwitterDataset.
        tokenizer: tokenizer handed to TwitterDataset.
        max_len: encoded sequence length handed to TwitterDataset.
        batch_size: batch size of the DataLoader created on iteration.
    """

    def __init__(self, data, tokenizer, max_len, batch_size):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size
        # TwitterDataset's first parameter is named `dataframe`; passing it as
        # `data=` raises TypeError.
        self.dataset = TwitterDataset(
            dataframe=self.data,
            tokenizer=self.tokenizer,
            max_len=self.max_len
        )

    def __len__(self):
        # NOTE: this is the number of rows in `data`, not the number of batches
        # produced by __iter__.
        return len(self.data)

    def __iter__(self):
        """Create a fresh DataLoader (4 workers) and yield its batches."""
        self.dataloader = DataLoader(
            self.dataset,
            batch_size=self.batch_size,
            num_workers=4
        )
        yield from self.dataloader

    def get_dataset(self):
        """Return the wrapped TwitterDataset."""
        return self.dataset
    
# create one TwitterDataLoader per split, all with max_len=160 and batch_size=16
train_dataloader, val_dataloader, test_dataloader = (
    TwitterDataLoader(data=split, tokenizer=tokenizer, max_len=160, batch_size=16)
    for split in (train, val, test)
)

In [None]:
# create the model
class TwitterModel(nn.Module):
    """DistilBERT encoder with dropout and a linear layer producing 2 logits."""

    def __init__(self):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.drop = nn.Dropout(p=0.2)
        self.out = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``(batch, 2)`` for the given encoded batch."""
        encoder_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # DistilBertModel returns no pooled output, so unpacking two values from
        # its result fails. Take the first element (the last hidden states) and
        # use the vector at sequence position 0 as the sentence representation.
        pooled_output = encoder_output[0][:, 0]
        output = self.drop(pooled_output)
        return self.out(output)

In [None]:
# create the model and move it to the GPU when one is available (CPU otherwise),
# so its weights sit on the same device the training / evaluation loops send
# every batch to
model = TwitterModel().to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [None]:
# Adam over every parameter of the model, learning rate 2e-5
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [None]:
# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# cross-entropy loss over the model's logits, placed on the selected device
loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)

In [1]:
# create the training loop
def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
    """Run one training pass for a model that emits one logit per class.

    NOTE: this redefines the earlier ``train_epoch`` (the variant that takes a
    ``scheduler`` argument); once this cell runs, the name refers to this version.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of batches. Token ids are read from ``'input_ids'``
            or ``'ids'``, the mask from ``'attention_mask'`` or ``'mask'``, and
            the labels from ``'targets'``.
        loss_fn: loss called as ``loss_fn(outputs, targets)``.
        optimizer: optimizer stepped once per batch.
        device: device every batch tensor is moved to.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor
        (correct predictions / ``n_examples``), mean_loss the mean batch loss.
    """
    model = model.train()

    losses = []
    # tensor accumulator so the final .double() also works for an empty loader
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in tqdm(data_loader):
        # TwitterDataset batches use the keys 'ids' / 'mask'; the original
        # 'input_ids' / 'attention_mask' spelling is accepted as well.
        input_ids = (d['input_ids'] if 'input_ids' in d else d['ids']).to(device)
        attention_mask = (d['attention_mask'] if 'attention_mask' in d else d['mask']).to(device)
        # TwitterDataset yields float targets; class-index losses such as
        # CrossEntropyLoss need integer (long) labels.
        targets = d['targets'].to(device).long()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # predicted class = index of the largest logit
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
# create the evaluation loop
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate a model that emits one logit per class, without gradients.

    NOTE: this redefines the earlier ``eval_model``; once this cell runs, the
    name refers to this version.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of batches. Token ids are read from ``'input_ids'``
            or ``'ids'``, the mask from ``'attention_mask'`` or ``'mask'``, and
            the labels from ``'targets'``.
        loss_fn: loss called as ``loss_fn(outputs, targets)``.
        device: device every batch tensor is moved to.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor
        (correct predictions / ``n_examples``), mean_loss the mean batch loss.
    """
    model = model.eval()

    losses = []
    # tensor accumulator so the final .double() also works for an empty loader
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in tqdm(data_loader):
            # TwitterDataset batches use the keys 'ids' / 'mask'; the original
            # 'input_ids' / 'attention_mask' spelling is accepted as well.
            input_ids = (d['input_ids'] if 'input_ids' in d else d['ids']).to(device)
            attention_mask = (d['attention_mask'] if 'attention_mask' in d else d['mask']).to(device)
            # TwitterDataset yields float targets; class-index losses such as
            # CrossEntropyLoss need integer (long) labels.
            targets = d['targets'].to(device).long()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # predicted class = index of the largest logit
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
# create the training loop
def train(model, train_dataloader, val_dataloader, loss_fn, optimizer, device, n_epochs):
    """Train for ``n_epochs`` epochs, validating after each one.

    After every epoch the model is evaluated on ``val_dataloader``; whenever the
    validation accuracy beats the best seen so far, the model's state dict is
    written to ``best_model_state.bin`` in the working directory.

    Args:
        model: model handed to ``train_epoch`` / ``eval_model``.
        train_dataloader, val_dataloader: loaders exposing ``get_dataset()``;
            the length of that dataset is the accuracy denominator.
        loss_fn, optimizer, device: forwarded to ``train_epoch`` / ``eval_model``.
        n_epochs: number of epochs to run.

    Returns:
        dict of lists with keys ``train_acc``, ``train_loss``, ``val_acc`` and
        ``val_loss``, one entry per epoch; accuracies are Python floats.
    """
    history = defaultdict(list)
    best_accuracy = 0

    # dataset sizes do not change between epochs
    n_train = len(train_dataloader.get_dataset())
    n_val = len(val_dataloader.get_dataset())

    for epoch in range(n_epochs):
        print(f'Epoch {epoch + 1}/{n_epochs}')
        print('-' * 10)

        train_acc, train_loss = train_epoch(
            model,
            train_dataloader,
            loss_fn,
            optimizer,
            device,
            n_train
        )
        # train_epoch returns the accuracy as a 0-dim tensor (on `device`);
        # keep a plain float so it prints and plots cleanly
        train_acc = float(train_acc)

        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(
            model,
            val_dataloader,
            loss_fn,
            device,
            n_val
        )
        val_acc = float(val_acc)

        print(f'Val   loss {val_loss} accuracy {val_acc}')
        print()

        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_loss'].append(val_loss)

        # checkpoint the weights of the best epoch so far
        if val_acc > best_accuracy:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = val_acc

    print(f'Best val accuracy: {best_accuracy}')

    return history

In [None]:
# Train for 5 epochs; `history` collects the per-epoch train/val accuracy and loss.
# The loops move every batch to `device`, so the model has to live on that
# device as well (model.to(device)) for GPU training to work.
history = train(model, train_dataloader, val_dataloader, loss_fn, optimizer, device, n_epochs=5)

In [None]:
def build_rnn(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build a stacked, stateful recurrent Keras model.

    Layers, in order: Embedding -> Dropout -> 3 x (LSTM -> Dropout) -> GRU ->
    Dense(1, sigmoid). Every recurrent layer is stateful and returns the full
    sequence, which is why the Embedding layer fixes the batch size through
    ``batch_input_shape``.

    Args:
        vocab_size: input dimension of the Embedding layer.
        embedding_dim: output dimension of the Embedding layer.
        rnn_units: number of units in each LSTM / GRU layer.
        batch_size: fixed batch size of the model input.

    Returns:
        The (uncompiled) ``tf.keras.Sequential`` model.
    """
    # NOTE(review): the final GRU keeps return_sequences=True, so the sigmoid
    # Dense layer presumably emits one value per time step rather than one per
    # sequence -- confirm this is intended for sentence-level classification.
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        tf.keras.layers.Dropout(0.5),
        # add more layers rnn
        tf.keras.layers.LSTM(rnn_units,
                             return_sequences=True,
                             stateful=True,
                             recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.LSTM(rnn_units,
                             return_sequences=True,
                             stateful=True,
                             recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.LSTM(rnn_units,
                             return_sequences=True,
                             stateful=True,
                             recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.GRU(rnn_units,
                            dropout=0.2, recurrent_dropout=0.2,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    # The original `return` was indented two spaces inside a four-space body,
    # which is an IndentationError; it is now aligned with the rest of the body.
    return model