In [1]:
%%capture
! pip install transformers==3.0.2

In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.model_selection import train_test_split
from IPython.display import clear_output
from tqdm import tqdm

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
#load data from google drive
from google.colab import drive
drive.mount('/content/drive')

#put your path here 
%cd /content/drive/MyDrive/EPFL/ADA/

Mounted at /content/drive
/content/drive/MyDrive/EPFL/ADA


# Loading and preparing data

In [4]:
DATA_PATH = 'data/quotation_topics.csv'
df = pd.read_csv(DATA_PATH)

In [5]:
num_classes = 10
labels = df.urls.value_counts()[:num_classes].index.tolist()

data_qoutations = df[df.urls.isin(labels)]
data_qoutations = data_qoutations.rename(columns={'urls': 'label'})
data_qoutations.reset_index(drop=True, inplace=True)
data_qoutations = data_qoutations.assign(label_num = pd.Series(pd.factorize(data_qoutations['label'])[0]).values)
data_qoutations.head()

Unnamed: 0,label,quotation,label_num
0,opinion,a champion figure skater switching to roller s...,0
1,review,A Pile of Leaves.,1
2,politics,A Senator we can call our own.,2
3,sports,It's crazy. I can't even really explain it rig...,3
4,television,"I've never been on a show that's so immediate,",4


### Itshould have structure:
### ['qoutation', 'label', 'label_num']

In [8]:
print(pd.factorize(data_qoutations['label'])[1].tolist())

['opinion', 'review', 'politics', 'sports', 'television', 'nyregion', 'europe', 'style', 'us', 'business']


# Dataloader

In [6]:
class MultiLabelDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.quotation
        self.targets = self.data.label_num
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        # text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.long) #float
        }

In [7]:
# max_len = data_qoutations.quotation.str.split().str.len().max()
MAX_LEN = 64
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_dataset, test_dataset = train_test_split(data_qoutations, test_size=0.2)

training_set = MultiLabelDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_dataset, tokenizer, MAX_LEN)
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                # 'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                # 'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

# Loading model

In [8]:
distilBERT = DistilBertModel.from_pretrained("distilbert-base-uncased")

for parameter in distilBERT.parameters():
    parameter.requires_grad = False

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [12]:
class DistilBERTClass(torch.nn.Module):
    def __init__(self, distilBERT):
        super(DistilBERTClass, self).__init__()
        # self.embedding = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.embedding = distilBERT
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        output_1 = self.embedding(input_ids=input_ids, attention_mask=attention_mask)[0]
        pooler = output_1[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [13]:
model = DistilBERTClass(distilBERT).to(device)
# model.to(device)

# Training loop

In [14]:
def train_model(epoch):
    model.train()
    for i, data in tqdm(enumerate(training_loader, 0)):
        optimizer.zero_grad()
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        
        outputs = model(ids, mask)
        loss = loss_CE(outputs, targets)
        params = list(model.parameters())
        if i % 100 == 0:
          clear_output(wait=True)
          print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        loss.backward()
        optimizer.step()

In [16]:
EPOCHS = 1
LEARNING_RATE = 1e-04

loss_CE = torch.nn.CrossEntropyLoss(reduction='mean').to(device)
optimizer = torch.optim.Adam(params = model.parameters(), lr = LEARNING_RATE)

for epoch in range(EPOCHS):
    train_model(epoch)

# Evaluation results

In [69]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets']
            outputs = model(ids, mask)
            fin_targets.extend(np.array(targets))
            fin_outputs.extend(np.argmax(outputs.cpu().detach().numpy().tolist(), axis=1))
    return fin_outputs, fin_targets

In [70]:
outputs, targets = validation(testing_loader)
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, labels=np.ndarray(num_classes), average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, labels=np.ndarray(num_classes), average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

555it [28:32,  3.09s/it]

Accuracy Score = 0.39364041269662287
F1 Score (Micro) = 0.050297816015883526
F1 Score (Macro) = 0.04526803441429517



  average, "true nor predicted", 'F-score is', len(true_sum)
