In [None]:
cd /content/drive/MyDrive/tophd/snowman-application-tasks-ay21-22/dataset

/content/drive/MyDrive/tophd/snowman-application-tasks-ay21-22/dataset


In [None]:
import pandas as pd
import numpy as np
# Read both CSV splits from the working directory, decoding them as cp1252.
train, test = (
    pd.read_csv(csv_name, encoding='cp1252')
    for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Install the Hugging Face transformers package into the runtime.
# NOTE(review): the version is not pinned; the recorded output of this cell shows
# transformers 3.5.1 was resolved — consider pinning it for reproducibility.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |▎                               | 10kB 15.1MB/s eta 0:00:01[K     |▌                               | 20kB 20.5MB/s eta 0:00:01[K     |▊                               | 30kB 24.9MB/s eta 0:00:01[K     |█                               | 40kB 23.0MB/s eta 0:00:01[K     |█▎                              | 51kB 16.8MB/s eta 0:00:01[K     |█▌                              | 61kB 18.6MB/s eta 0:00:01[K     |█▊                              | 71kB 13.9MB/s eta 0:00:01[K     |██                              | 81kB 14.1MB/s eta 0:00:01[K     |██▎                             | 92kB 13.4MB/s eta 0:00:01[K     |██▌                             | 102kB 12.4MB/s eta 0:00:01[K     |██▊                             | 112kB 12.4MB/s eta 0:00:01[K     |███                             | 

In [None]:
from transformers import (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)

In [None]:
import re
def text_preprocessing(text):
    """Strip Twitter-style ``@mention`` tokens and normalise whitespace.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The text with every ``@...`` token (running up to the next whitespace
        character or the end of the string) replaced by a space, every run of
        whitespace collapsed to a single space, and leading/trailing
        whitespace removed.
    """
    # Remove '@name'. The `$` alternative also catches a mention that ends the
    # text; a pattern that insists on trailing whitespace would leave it behind.
    text = re.sub(r'@\S*(?:\s|$)', ' ', text)

    # Collapse whitespace runs and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# Run the same cleaning function over the 'Tweets' column of both splits.
for frame in (train, test):
    frame['Tweets'] = frame['Tweets'].apply(text_preprocessing)

In [None]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,Tweets,Label
0,7281,The jokes and puns are flying free in this cam...,none
1,7282,#MKR Lets see who the producers think are goin...,none
2,7283,Praying Jac and Shaz do well! They're my faves...,none
3,7284,RT Pete Evans the Paleo Capitalist has had his...,none
4,7285,If Kat and Andre stay tonight I will stop watc...,none


In [None]:
# Lookup table from label name to integer class id:
# 'none' -> 0, 'racism' -> 1, 'sexism' -> 2
tag2idx = dict(none=0, racism=1, sexism=2)

In [None]:
# Inverse lookup: integer class id back to label name
tag2name = {idx: name for name, idx in tag2idx.items()}

In [None]:
# Display the id -> name mapping as a sanity check.
tag2name

{0: 'none', 1: 'racism', 2: 'sexism'}

In [None]:
import torch
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Number of CUDA devices visible to torch (used to decide on DataParallel)
n_gpu = torch.cuda.device_count()

In [None]:
# Download the xlnet-base-cased tokenizer model file (xlnet-base-cased-spiece.model)
# into the working directory.
!wget https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model

--2020-11-29 16:45:52--  https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.230.181
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.230.181|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 798011 (779K) [binary/octet-stream]
Saving to: ‘xlnet-base-cased-spiece.model’


2020-11-29 16:45:53 (1.77 MB/s) - ‘xlnet-base-cased-spiece.model’ saved [798011/798011]



In [None]:
# Relative path of the tokenizer model file; passed to XLNetTokenizer as `vocab_file`.
vocabulary = './xlnet-base-cased-spiece.model'

In [None]:
# Fixed length of every encoded sequence: longer tweets are truncated and shorter
# ones padded to this size. Per the original note, the model's
# 'max_position_embeddings' is 512.
max_len  = 64 

In [None]:
# Build the tokenizer from the downloaded vocabulary file.
# The checkpoint is a cased model, so input text must not be lower-cased.
tokenizer = XLNetTokenizer(vocab_file=vocabulary,do_lower_case=False)

In [None]:
# Pull the tweet texts and their labels out of the training frame as plain lists.
sentences = train['Tweets'].tolist()
labels = train['Label'].tolist()

In [None]:
# Encoded training set: one list of length max_len per sentence
# (token ids, padding mask, segment ids).
full_input_ids = []
full_input_masks = []
full_segment_ids = []

# Segment-id values for ordinary tokens (A/B), <cls>, <sep> and padding
SEG_ID_A, SEG_ID_B, SEG_ID_CLS, SEG_ID_SEP, SEG_ID_PAD = range(5)

# Vocabulary ids of the special tokens: the first id the tokenizer emits for each
UNK_ID, CLS_ID, SEP_ID, MASK_ID, EOD_ID = (
    tokenizer.encode(special)[0]
    for special in ("<unk>", "<cls>", "<sep>", "<mask>", "<eod>")
)

for i, sentence in enumerate(sentences):
    # Tokenize to ids and keep at most max_len-2 of them, leaving room for <sep>/<cls>.
    # NOTE(review): depending on the tokenizer's defaults, encode() may already append
    # <sep>/<cls>; if so they are duplicated by the explicit append below — confirm.
    tokens_a = tokenizer.encode(sentence)[:max_len - 2]

    # Layout: sentence tokens, then <sep>, then <cls>.
    # The sentence and <sep> share segment A; <cls> gets its own segment id.
    tokens = tokens_a + [SEP_ID, CLS_ID]
    segment_ids = [SEG_ID_A] * (len(tokens_a) + 1) + [SEG_ID_CLS]

    input_ids = tokens

    # Mask convention: 0 marks a real token, 1 marks padding.
    # Only real tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Left-pad all three sequences up to max_len (padding goes at the front)
    delta_len = max(max_len - len(input_ids), 0)
    input_ids = [0] * delta_len + input_ids
    input_mask = [1] * delta_len + input_mask
    segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    for encoded in (input_ids, input_mask, segment_ids):
        assert len(encoded) == max_len

    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)

In [None]:
# Translate each label (via its string form) into its integer class id
tags = list(map(lambda lab: tag2idx[str(lab)], labels))

In [None]:
#split train set to train and val
from sklearn.model_selection import train_test_split
# Hold out 10% of the encoded examples for validation; the fixed seed keeps the
# split repeatable. All four parallel lists are split with the same permutation.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks,
 tr_segs, val_segs) = train_test_split(
    full_input_ids, tags, full_input_masks, full_segment_ids,
    test_size=0.1, random_state=24,
)

In [None]:
# Convert every train/validation list into a torch tensor, rebinding the same names
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks,
 tr_segs, val_segs) = map(
    torch.tensor,
    (tr_inputs, val_inputs,
     tr_tags, val_tags,
     tr_masks, val_masks,
     tr_segs, val_segs),
)

In [None]:
from torch.utils.data import TensorDataset,RandomSampler,DataLoader,SequentialSampler
# Mini-batch size shared by the data loaders
batch_num = 32

# Each example is a tuple (input ids, padding mask, segment ids, label id)
train_data = TensorDataset(tr_inputs, tr_masks, tr_segs, tr_tags)
valid_data = TensorDataset(val_inputs, val_masks, val_segs, val_tags)

# Training batches are drawn in random order; drop_last discards the final,
# smaller batch so every training step sees exactly batch_num examples.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_num,
    sampler=train_sampler,
    drop_last=True,
)

# Validation walks the data in order and keeps every example.
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(
    valid_data,
    batch_size=batch_num,
    sampler=valid_sampler,
)

In [None]:
# Download the pretrained xlnet-base-cased weights into the working directory.
# NOTE(review): the visible code later loads the model by the "xlnet-base-cased"
# identifier rather than from this file, so this download may be unnecessary — confirm.
!wget https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin

--2020-11-29 17:12:57--  https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.39.14
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.39.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 467042463 (445M) [application/octet-stream]
Saving to: ‘xlnet-base-cased-pytorch_model.bin’


2020-11-29 17:13:13 (28.1 MB/s) - ‘xlnet-base-cased-pytorch_model.bin’ saved [467042463/467042463]



In [None]:
# Download the xlnet-base-cased configuration JSON into the working directory.
# NOTE(review): no visible code reads xlnet-base-cased-config.json by this name — confirm it is needed.
!wget https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json

--2020-11-29 17:13:20--  https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.100.109
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.100.109|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 760 [application/json]
Saving to: ‘xlnet-base-cased-config.json’


2020-11-29 17:13:21 (16.9 MB/s) - ‘xlnet-base-cased-config.json’ saved [760/760]



In [None]:
# Load the pretrained XLNet encoder with a fresh sequence-classification head.
# NOTE(review): model_file_address is not referenced by any visible code; the
# weights are resolved from the "xlnet-base-cased" identifier instead.
model_file_address = "./"
# The head is sized to the number of classes in tag2idx.
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased",num_labels=len(tag2idx))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [None]:
# Move the model's parameters onto the selected device
model.to(device)
# With more than one GPU available, wrap the model so batches are split across them
model = torch.nn.DataParallel(model) if n_gpu > 1 else model

In [None]:
# Set epoch and grad max num
import math
# Number of passes over the training data and the gradient-norm clipping ceiling
epochs, max_grad_norm = 5, 1.0
# Total optimisation steps: batches per epoch (rounded up) times the number of epochs
batches_per_epoch = math.ceil(len(tr_inputs) / batch_num)
num_train_optimization_steps = int(batches_per_epoch / 1) * epochs
# True: fine-tune every model parameter; False: only the classification head
FULL_FINETUNING = True

In [None]:
from torch.optim import Adam, AdamW

# Unwrap DataParallel (if it was applied) so the head attributes below are
# looked up on the underlying XLNet model; the parameter tensors are the same objects.
base_model = model.module if hasattr(model, 'module') else model

if FULL_FINETUNING:
    # Fine-tune all layers. Biases and layer-norm weights are exempt from weight decay.
    # XLNet names its normalisation weights '...layer_norm.weight'; 'gamma'/'beta'
    # are kept only for checkpoints that still use those legacy names.
    param_optimizer = list(base_model.named_parameters())
    no_decay = ['bias', 'layer_norm.weight', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        # torch optimizers read the per-group option 'weight_decay'; an unknown key
        # such as 'weight_decay_rate' is stored but never applied.
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    # Only fine-tune the classification head. XLNetForSequenceClassification has no
    # `classifier` attribute; its head is `sequence_summary` followed by `logits_proj`.
    param_optimizer = (list(base_model.sequence_summary.named_parameters())
                       + list(base_model.logits_proj.named_parameters()))
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer],
                                     'weight_decay': 0.0}]
# AdamW applies the decay decoupled from the gradient-based update.
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

In [None]:
from tqdm import tqdm,trange
# Fine-tune the model for `epochs` passes over train_dataloader.
# from_pretrained() hands the model back in evaluation mode, so enable
# training behaviour (dropout) explicitly before optimising.
model.train()
for _ in trange(epochs,desc="Epoch"):
  # Running totals for this epoch; the last epoch's values stay available afterwards
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  for step, batch in enumerate(train_dataloader):
    # add batch to gpu
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_segs,b_labels = batch

    # Start each step from clean gradients (also covers a re-run of this cell)
    optimizer.zero_grad()

    # forward pass; with labels supplied the first output is the loss
    outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
    loss, logits = outputs[:2]
    if n_gpu>1:
      # When multi gpu, each replica returns its own loss: average them
      loss = loss.mean()

    # Everything below must run for every batch, not only in the multi-GPU case.
    # backward pass
    loss.backward()

    # track train loss
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

    # gradient clipping
    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

    # update parameters
    optimizer.step()

  # print mean train loss per epoch (guarded in case the loader yielded no batch)
  print("Train loss: {}".format(tr_loss/max(nb_tr_steps, 1)))

Epoch:  20%|██        | 1/5 [01:01<04:07, 61.83s/it]

Train loss: 0.44571096354888545


Epoch:  40%|████      | 2/5 [02:03<03:05, 61.75s/it]

Train loss: 0.24756628861650826


Epoch:  60%|██████    | 3/5 [03:04<02:03, 61.66s/it]

Train loss: 0.11277336774833707


Epoch:  80%|████████  | 4/5 [04:05<01:01, 61.48s/it]

Train loss: 0.06333471307847907


Epoch: 100%|██████████| 5/5 [05:06<00:00, 61.34s/it]

Train loss: 0.03889558804362928





In [None]:
# Persist the fine-tuned weights, the model configuration and the tokenizer vocabulary
xlnet_out_address = './'
# Using the predefined file names lets `from_pretrained` load this directory later
output_model_file = "./pytorch_model.bin"
output_config_file = "./config.json"
# A DataParallel wrapper keeps the real network under `.module`; save only that
model_to_save = getattr(model, 'module', model)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(xlnet_out_address)

('./spiece.model',)

In [None]:
# Reload the checkpoint saved under xlnet_out_address and place it on the device
model = XLNetForSequenceClassification.from_pretrained(
    xlnet_out_address, num_labels=len(tag2idx)
).to(device)
# Multi-GPU: wrap so batches are split across devices
model = torch.nn.DataParallel(model) if n_gpu > 1 else model
# Switch to evaluation mode; as the cell's last expression this also displays the model
model.eval()

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e

In [None]:
# Batch-level correctness counter
def accuracy(out, labels):
    """Count how many rows of `out` have their arg-max equal to the matching label.

    Args:
        out: 2-D array-like of per-class scores, one row per example.
        labels: Array-like of class ids, one per row of `out`.

    Returns:
        The number of correct predictions (a count, not a ratio).
    """
    predicted_classes = np.argmax(out, axis=1)
    is_correct = predicted_classes == labels
    return np.sum(is_correct)

In [None]:
from sklearn.metrics import classification_report

def predict(dataloader):
  """Evaluate the global `model` on `dataloader` and print a classification report.

  Args:
    dataloader: Iterable of batches ``(input_ids, input_mask, segment_ids, labels)``
      of tensors, as produced by the DataLoaders in this notebook.

  Returns:
    dict with ``eval_loss`` (mean of the per-batch losses) and ``eval_accuracy``
    (fraction of the evaluated examples that were classified correctly).
    Training loss is not included here, so the function no longer depends on
    variables left behind by the training cell.
  """
  # Make sure dropout is disabled regardless of what ran before this call
  model.eval()

  eval_loss, nb_correct = 0.0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  y_true = []
  y_predict = []
  for step, batch in enumerate(dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_segs,b_labels = batch
    with torch.no_grad():
      outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
      tmp_eval_loss, logits = outputs[:2]

    # Get text classification predict result
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Save predicted and real labels for the report
    y_predict.extend(np.argmax(logits, axis=1).tolist())
    y_true.extend(label_ids.tolist())

    # .mean() covers the multi-GPU case where one loss per replica comes back
    eval_loss += tmp_eval_loss.mean().item()
    nb_correct += int(accuracy(logits, label_ids))

    nb_eval_steps += 1
    nb_eval_examples += label_ids.shape[0]

  # Normalise by what was actually evaluated, so the numbers are right for any
  # dataloader (validation or test), not just the validation split.
  eval_loss = eval_loss / max(nb_eval_steps, 1)
  eval_accuracy = nb_correct / max(nb_eval_examples, 1)
  result = {'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy}
  report = classification_report(y_pred=np.array(y_predict),y_true=np.array(y_true))
  print(report)
  return result

In [None]:
# Print the classification report for the held-out validation split.
predict(valid_dataloader)

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       877
           1       0.68      0.75      0.71       139
           2       0.85      0.75      0.80       267

    accuracy                           0.86      1283
   macro avg       0.81      0.80      0.80      1283
weighted avg       0.86      0.86      0.86      1283



In [None]:
# Encode the separate test set in exactly the same way as the training data
test_sentences = test['Tweets'].tolist()
test_labels = test['Label'].tolist()
# Translate each label (via its string form) into its integer class id
test_tags = list(map(lambda lab: tag2idx[str(lab)], test_labels))

# Encoded test set: one list of length max_len per sentence
test_input_ids = []
test_input_masks = []
test_segment_ids = []

# Segment-id values for ordinary tokens (A/B), <cls>, <sep> and padding
SEG_ID_A, SEG_ID_B, SEG_ID_CLS, SEG_ID_SEP, SEG_ID_PAD = range(5)

# Vocabulary ids of the special tokens: the first id the tokenizer emits for each
UNK_ID, CLS_ID, SEP_ID, MASK_ID, EOD_ID = (
    tokenizer.encode(special)[0]
    for special in ("<unk>", "<cls>", "<sep>", "<mask>", "<eod>")
)

for i, sentence in enumerate(test_sentences):
    # Tokenize to ids and keep at most max_len-2 of them, leaving room for <sep>/<cls>.
    # NOTE(review): depending on the tokenizer's defaults, encode() may already append
    # <sep>/<cls>; if so they are duplicated by the explicit append below — confirm.
    tokens_a = tokenizer.encode(sentence)[:max_len - 2]

    # Layout: sentence tokens, then <sep>, then <cls>.
    # The sentence and <sep> share segment A; <cls> gets its own segment id.
    tokens = tokens_a + [SEP_ID, CLS_ID]
    segment_ids = [SEG_ID_A] * (len(tokens_a) + 1) + [SEG_ID_CLS]

    input_ids = tokens

    # Mask convention: 0 marks a real token, 1 marks padding.
    # Only real tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Left-pad all three sequences up to max_len (padding goes at the front)
    delta_len = max(max_len - len(input_ids), 0)
    input_ids = [0] * delta_len + input_ids
    input_mask = [1] * delta_len + input_mask
    segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    for encoded in (input_ids, input_mask, segment_ids):
        assert len(encoded) == max_len

    test_input_ids.append(input_ids)
    test_input_masks.append(input_mask)
    test_segment_ids.append(segment_ids)

In [None]:
# Convert the encoded test lists into torch tensors, rebinding the same names
test_input_ids, test_input_masks, test_segment_ids, test_tags = map(
    torch.tensor,
    (test_input_ids, test_input_masks, test_segment_ids, test_tags),
)

In [None]:
# Bundle the test tensors as (input ids, padding mask, segment ids, label id)
# and iterate them in their original order, batch_num examples at a time.
test_data = TensorDataset(
    test_input_ids,
    test_input_masks,
    test_segment_ids,
    test_tags,
)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_num,
    sampler=test_sampler,
)

In [None]:
# Print the classification report for the separate test set.
predict(test_dataloader)

              precision    recall  f1-score   support

           0       0.85      0.88      0.87      2186
           1       0.97      0.75      0.85       387
           2       0.60      0.61      0.61       633

    accuracy                           0.81      3206
   macro avg       0.81      0.75      0.77      3206
weighted avg       0.82      0.81      0.81      3206

