In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv
/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv


In [3]:
!pip install transformers



# Dataset

In [4]:
train = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv', encoding = 'latin-1')
test = pd.read_csv('/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv', encoding = 'latin-1')

In [5]:
train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [6]:
test.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


# Mengecek Nilai Null

In [8]:
train.isna().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [9]:
train['Location'].fillna('None', inplace = True)
train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [10]:
train.isna().sum()

UserName         0
ScreenName       0
Location         0
TweetAt          0
OriginalTweet    0
Sentiment        0
dtype: int64

# Data Preprocessing

In [11]:
import re 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
", ".join(stopwords.words('english'))
stops = set(stopwords.words('english'))

def clean_tweet(tweet):
    # Remove hyperlinks.
    tweet = re.sub(r'https?://\S+|www\.\S+','',tweet)
    # Remove html
    tweet = re.sub(r'<.*?>','',tweet)
    # Remove numbers
    tweet = re.sub(r'\d+','',tweet)
    # Remove mentions
    tweet = re.sub(r'@\w+','',tweet)
    # Remove punctuation
    tweet = re.sub(r'[^\w\s\d]','',tweet)
    # Remove whitespace
    tweet = re.sub(r'\s+',' ',tweet).strip()
    # Remove stopwords
    tweet = " ".join([word for word in str(tweet).split() if word not in stops])
    
    return tweet

In [13]:
def make_label(sentiment):
    
    label = ''
    if 'Positive' in sentiment: 
        label = 1
    if 'Negative' in sentiment:
        label = -1
    if 'Neutral' in sentiment:
        label = 0
    return label

In [14]:
train['cleanTweet'] = train['OriginalTweet'].apply(lambda x: clean_tweet(x))
test['cleanTweet'] = test['OriginalTweet'].apply(lambda x: clean_tweet(x))

train['Sentiment'] = train['Sentiment'].apply(lambda x: make_label(x))
test['Sentiment'] = test['Sentiment'].apply(lambda x: make_label(x))

In [15]:
from sklearn.preprocessing import LabelEncoder

# Encode the classes for BERT. We'll keep using the 3 labels we made earlier.  
encoder = LabelEncoder()
train['label'] = encoder.fit_transform(train['Sentiment'])
test['label'] = encoder.fit_transform(test['Sentiment'])

In [16]:
train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,cleanTweet,label
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,0,,1
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,1,advice Talk neighbours family exchange phone n...,2
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,1,Coronavirus Australia Woolworths give elderly ...,2
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,1,My food stock one empty PLEASE dont panic THER...,2
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",-1,Me ready go supermarket COVID outbreak Not Im ...,0


# BertTokenizer

In [17]:
import transformers
from transformers import BertTokenizer, BertForSequenceClassification

In [18]:
bert_tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [19]:
train['TokenizeTweet'] = [bert_tokenizer.tokenize(sent) for sent in train.cleanTweet.values]
test['TokenizeTweet'] = [bert_tokenizer.tokenize(sent) for sent in test.cleanTweet.values]

In [20]:
train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,cleanTweet,label,TokenizeTweet
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,0,,1,[]
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,1,advice Talk neighbours family exchange phone n...,2,"[advice, talk, neighbours, family, exchange, p..."
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,1,Coronavirus Australia Woolworths give elderly ...,2,"[corona, ##virus, australia, wool, ##worth, ##..."
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,1,My food stock one empty PLEASE dont panic THER...,2,"[my, food, stock, one, empty, please, don, ##t..."
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",-1,Me ready go supermarket COVID outbreak Not Im ...,0,"[me, ready, go, supermarket, co, ##vid, outbre..."


# Training Validation Split

In [21]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(train['cleanTweet'], train['label'], test_size = 0.2)

# Encoding Data

In [22]:
encoded_data_train = bert_tokenizer.batch_encode_plus(
    x_train, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    max_length=120,
    padding='max_length', 
    truncation=True,
    return_tensors='pt'
)

encoded_data_val = bert_tokenizer.batch_encode_plus(
    x_val, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    max_length=120,
    padding='max_length', 
    truncation=True,
    return_tensors='pt'
)

# Convert Numpy Array Menjadi Tensor

In [23]:
import torch
from torch.utils.data import TensorDataset

In [24]:
train_input_ids = encoded_data_train['input_ids']
train_attention_masks = encoded_data_train['attention_mask']
train_label = torch.tensor(y_train.values)
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_label)

val_input_ids = encoded_data_val['input_ids']
val_attention_masks = encoded_data_val['attention_mask']
val_label = torch.tensor(y_val.values)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_label)

# Membuat Data Loader

In [25]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=128)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=128)

In [26]:
torch.save(val_dataloader,'validation_data_loader')
torch.save(train_dataloader,'train_data_loader')

# Model (BertForSequenceClassification)

In [27]:
model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=3,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [28]:
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm, trange,tnrange,tqdm_notebook

In [29]:
optimizer = AdamW(model.parameters(),lr=3e-5,correct_bias=True)

epochs = 5
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*epochs)

In [30]:
from sklearn.metrics import f1_score

#f1 score
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

In [31]:
#accuracy score
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [32]:
def evaluate(val_dataloader):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(val_dataloader, leave=False, disable=True):

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2]}

        with torch.no_grad():        
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(val_dataloader) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [33]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


# Training

In [35]:
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(train_dataloader, 
                        desc='Epoch {:1d}'.format(epoch), 
                        leave=False, 
                        disable=True)
    
    for batch in progress_bar:
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids': batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                  'labels': batch[2].to(device)}
        
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total +=loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})     
        
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    #print training result
    loss_train_avg = loss_train_total/len(train_dataloader)
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    #evaluate
    val_loss, predictions, true_vals = evaluate(val_dataloader)
    #f1 score
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

  0%|          | 0/5 [05:36<?, ?it/s]


Epoch 1
Training loss: 0.6176868488853292


 20%|██        | 1/5 [06:04<24:16, 364.04s/it]

Validation loss: 0.465267341870528
F1 Score (weighted): 0.840756266554831


 20%|██        | 1/5 [11:39<24:16, 364.04s/it]


Epoch 2
Training loss: 0.38041673000006715


 40%|████      | 2/5 [12:07<18:11, 363.68s/it]

Validation loss: 0.37086758430187516
F1 Score (weighted): 0.8744037012638541


 40%|████      | 2/5 [17:43<18:11, 363.68s/it]


Epoch 3
Training loss: 0.2874199994948021


 60%|██████    | 3/5 [18:10<12:07, 363.55s/it]

Validation loss: 0.35941991874804863
F1 Score (weighted): 0.8823120553717461


 60%|██████    | 3/5 [23:46<12:07, 363.55s/it]


Epoch 4
Training loss: 0.23119026532237844


 80%|████████  | 4/5 [24:14<06:03, 363.64s/it]

Validation loss: 0.381132612320093
F1 Score (weighted): 0.880379893431729


 80%|████████  | 4/5 [29:50<06:03, 363.64s/it]


Epoch 5
Training loss: 0.19603099214822747


100%|██████████| 5/5 [30:18<00:00, 363.66s/it]

Validation loss: 0.38934515233223255
F1 Score (weighted): 0.8827556841030362





In [36]:
encoded_classes = encoder.classes_
predicted_category = [encoded_classes[np.argmax(x)] for x in predictions]
true_category = [encoded_classes[x] for x in true_vals]

In [37]:
x = 0
for i in range(len(true_category)):
    if true_category[i] == predicted_category[i]:
        x += 1
        
print('Accuracy Score = ', x / len(true_category))

Accuracy Score =  0.8832604470359572


In [38]:
print(classification_report(true_category, predicted_category))

              precision    recall  f1-score   support

          -1       0.87      0.90      0.88      3069
           0       0.89      0.79      0.84      1513
           1       0.89      0.91      0.90      3650

    accuracy                           0.88      8232
   macro avg       0.88      0.87      0.87      8232
weighted avg       0.88      0.88      0.88      8232



# Testing

In [39]:
x_test = test['cleanTweet']
y_test = test['label']

encoded_data_test = bert_tokenizer.batch_encode_plus(
    x_test, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    max_length=120,
    padding='max_length', 
    truncation=True,
    return_tensors='pt'
)

test_input_ids = encoded_data_test['input_ids']
test_attention_masks = encoded_data_test['attention_mask']
test_label = torch.tensor(y_test.values)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_label)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=128)

In [40]:
val_loss, predictions, true_vals = evaluate(test_dataloader)
val_f1 = f1_score_func(predictions, true_vals)

In [41]:
encoded_classes = encoder.classes_
predicted_category = [encoded_classes[np.argmax(x)] for x in predictions]
true_category = [encoded_classes[x] for x in true_vals]

x = 0
for i in range(len(true_category)):
    if true_category[i] == predicted_category[i]:
        x += 1
        
print('Accuracy Score = ', x / len(true_category))
print('\n')
print(classification_report(true_category, predicted_category))

Accuracy Score =  0.865192206424434


              precision    recall  f1-score   support

          -1       0.86      0.89      0.87      1633
           0       0.85      0.77      0.81       619
           1       0.88      0.88      0.88      1546

    accuracy                           0.87      3798
   macro avg       0.86      0.85      0.85      3798
weighted avg       0.87      0.87      0.86      3798

