<h3>Import Libraries</h3>

In [30]:
import re
import warnings

import nltk
import numpy as np
import pandas as pd
import spacy
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.special import softmax
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

<h3>Get Data</h3>

In [3]:
# Directory that holds the competition CSV files.
DATA_DIR = '/content'

train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')

In [4]:
# Preview the first rows of the labeled training frame.
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Preview the first rows of the test frame (it has no `target` column yet).
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
# Give the unlabeled test rows an integer placeholder target. A boolean
# placeholder mixed with the integer training labels would not keep the
# combined column as an integer dtype, whereas 0 does.
test_data['target'] = 0

# Stack train and test so the same cleaning steps run on both; the first
# len(train_data) rows of `data` are the training rows.
data = pd.concat([train_data, test_data], sort=False, ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10876 entries, 0 to 10875
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        10876 non-null  int64 
 1   keyword   10789 non-null  object
 2   location  7238 non-null   object
 3   text      10876 non-null  object
 4   target    10876 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 425.0+ KB


<h3>Cleaning Location Feature</h3>

In [7]:
# Tweets without a location get the explicit placeholder 'Unknown'.
missing_location = data['location'].isna()
data.loc[missing_location, 'location'] = 'Unknown'

In [8]:
def get_ner_vals(doc):
    """Return a list of (entity text, entity label) pairs, one per entry of ``doc.ents``."""
    pairs = []
    for ent in doc.ents:
        pairs.append((ent.text, ent.label_))
    return pairs

In [9]:
def change_label_name(label):
    """Collapse a raw NER label into one of five buckets.

    "ORG" and "PERSON" pass through unchanged; "GPE", "LOC" and "FAC" become
    "LOCATION"; any other non-empty label becomes "OTHER"; an empty or falsy
    label becomes "NONE".
    """
    if label in ("ORG", "PERSON"):
        return label
    if label in ("GPE", "LOC", "FAC"):
        return "LOCATION"
    return "OTHER" if label else "NONE"

In [10]:
def get_most_freq_label(ent_labels):
    """Return the most frequent label in ``ent_labels``, or "NONE" when it is empty.

    The winner is the first index entry of ``pd.Series(ent_labels).value_counts()``.
    """
    if len(ent_labels) > 0:
        label_counts = pd.Series(ent_labels).value_counts()
        return label_counts.index[0]
    return "NONE"

In [11]:
nlp = spacy.load('en_core_web_sm')

# run the spaCy pipeline over every location value (one doc per row)
location_texts = data['location'].tolist()
docs = list(nlp.pipe(location_texts, disable=[]))
data['ner_ents'] = list(map(get_ner_vals, docs))


def _normalized_labels(ents):
    """Normalized label of every (text, label) entity pair in ``ents``."""
    return [change_label_name(ent_label) for _ent_text, ent_label in ents]


# keep only the normalized label of each recognized entity
data['ner_labels'] = data['ner_ents'].apply(_normalized_labels)

# tag the whole location string with its most frequent entity label
data['location_ner'] = data['ner_labels'].apply(get_most_freq_label)

In [12]:
# Confirm that the three NER-derived columns were added and contain no nulls.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10876 entries, 0 to 10875
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            10876 non-null  int64 
 1   keyword       10789 non-null  object
 2   location      10876 non-null  object
 3   text          10876 non-null  object
 4   target        10876 non-null  int64 
 5   ner_ents      10876 non-null  object
 6   ner_labels    10876 non-null  object
 7   location_ner  10876 non-null  object
dtypes: int64(2), object(6)
memory usage: 679.9+ KB


In [13]:
# Distribution of the normalized location label across all tweets.
data['location_ner'].value_counts()

Unnamed: 0_level_0,count
location_ner,Unnamed: 1_level_1
NONE,5153
LOCATION,4016
ORG,946
PERSON,507
OTHER,254


<h5>Now, we will add the location to the text to add more context to the tweet.</h5>

In [14]:
def add_location_and_text(text, location, ner_tag):
    """Append ``location`` to ``text`` behind a ', [LOC] ' marker.

    The location is appended only when ``ner_tag`` is 'LOCATION' or 'ORG';
    for any other tag ``text`` is returned unchanged.
    """
    if ner_tag not in ('LOCATION', 'ORG'):
        return text
    return text + ', [LOC] ' + location

In [15]:
# lowercase versions of the tweet text and of the location
data['text_clean'] = data['text'].str.lower()
location_lower = data['location'].str.lower()

# strip every character that is neither a word character nor whitespace
non_word_pattern = re.compile(r'([^\w\s])')
data['location_clean'] = location_lower.map(lambda loc: non_word_pattern.sub('', loc))

# append the location to the tweet when its NER label marks it as usable
data['text + location'] = [
    add_location_and_text(tweet, loc, tag)
    for tweet, loc, tag in zip(data['text_clean'], data['location_clean'], data['location_ner'])
]
data.head()

Unnamed: 0,id,keyword,location,text,target,ner_ents,ner_labels,location_ner,text_clean,location_clean,text + location
0,1,,Unknown,Our Deeds are the Reason of this #earthquake M...,1,[],[],NONE,our deeds are the reason of this #earthquake m...,unknown,our deeds are the reason of this #earthquake m...
1,4,,Unknown,Forest fire near La Ronge Sask. Canada,1,[],[],NONE,forest fire near la ronge sask. canada,unknown,forest fire near la ronge sask. canada
2,5,,Unknown,All residents asked to 'shelter in place' are ...,1,[],[],NONE,all residents asked to 'shelter in place' are ...,unknown,all residents asked to 'shelter in place' are ...
3,6,,Unknown,"13,000 people receive #wildfires evacuation or...",1,[],[],NONE,"13,000 people receive #wildfires evacuation or...",unknown,"13,000 people receive #wildfires evacuation or..."
4,7,,Unknown,Just got sent this photo from Ruby #Alaska as ...,1,[],[],NONE,just got sent this photo from ruby #alaska as ...,unknown,just got sent this photo from ruby #alaska as ...


In [16]:
# Show the combined text of the row labeled 0.
data.loc[0, 'text + location']

'our deeds are the reason of this #earthquake may allah forgive us all'

<h3>Text Preprocessing</h3>

In [17]:
# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')
# English stopword list; used by a later cell to filter words out of the tweets.
en_stopwords = stopwords.words('english')
print(en_stopwords)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Compile the patterns once instead of on every row.
url_pattern = re.compile(r'http\S+|www\.\S+')
non_word_pattern = re.compile(r'([^\w\s])')
# Set membership is O(1); testing against the stopword list is a linear scan per word.
stopword_set = set(en_stopwords)


def _clean_tweet(tweet):
    """Strip links, then special characters, then English stopwords from one tweet."""
    without_links = url_pattern.sub('', tweet)
    # NOTE(review): this also strips the comma and brackets of the ', [LOC] '
    # marker added earlier, leaving the bare word 'LOC' in the text.
    words_only = non_word_pattern.sub('', without_links)
    return ' '.join(word for word in words_only.split() if word not in stopword_set)


# One pass over the column instead of three row-wise DataFrame.apply passes.
data['text_loc_clean'] = data['text + location'].map(_clean_tweet)

# The first len(train_data) rows of `data` are the training rows; the rest are test rows.
n_train = train_data.shape[0]
train_tweets = data['text_loc_clean'].iloc[:n_train].tolist()
train_labels = data['target'].iloc[:n_train].tolist()
test_tweets = data['text_loc_clean'].iloc[n_train:].tolist()

<h5>The tokenizer converts each tweet into input ids drawn from the vocabulary of the pretrained model being used (RoBERTa here). The attention mask marks which positions are real tokens and which are padding (1 for actual tokens, 0 for padding).</h5>

In [19]:
# Hold out 20% of the labeled tweets for validation.
# A fixed random_state makes the split (and the reported validation F1)
# reproducible across "Restart & Run All"; stratify keeps the class balance
# the same in the training and validation parts.
train_X, val_X, train_y, val_y = train_test_split(
    train_tweets,
    train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels,
)

In [20]:
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Options shared by both tokenizer calls: pad, truncate, and return
# PyTorch ("pt") tensors, since the model is trained with PyTorch.
encode_kwargs = dict(padding=True, truncation=True, return_tensors="pt")
train_encodings = tokenizer(train_X, **encode_kwargs)
val_encodings = tokenizer(val_X, **encode_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


<h3>Create Custom Dataset</h3>

<h5>We need a custom Dataset so that data can be fed to the GPU efficiently: the GPU should not have to wait for the CPU to prepare each batch. The CPU works in parallel with the GPU: the GPU handles the computation (matrix multiplication, the forward pass, and backpropagation), while the CPU handles preprocessing the data and loading it into batches for the GPU to use in those computations.</h5>

<h5>Also, each batch contains a certain number of samples, where each sample represents one tweet along with its target label. DataLoader is able to handle all of this.</h5>

In [21]:
class TweetDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their target labels."""

    def __init__(self, encodings, labels):
        # mapping of field name -> indexable values (e.g. input ids, attention mask)
        self.encodings = encodings
        # one label per sample, indexed the same way as the encodings
        self.labels = labels

    def __len__(self):
        """Number of samples in the dataset, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict: every encoding field plus a 'labels' tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [22]:
# Wrap the encodings and labels of each split in a TweetDataset.
train_dataset, val_dataset = (
    TweetDataset(train_encodings, train_y),
    TweetDataset(val_encodings, val_y),
)

<h3>Model Finetuning</h3>

<h5>We need training arguments to tell the Trainer how to train the model.
It's like a recipe for training: batch size, number of epochs, where to save results, etc.</h5>

In [47]:
training_args = TrainingArguments(
    # where checkpoints and results are written
    output_dir="/content/results",
    # optimization settings
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    # evaluate and checkpoint once per epoch; at the end, reload the
    # checkpoint that scored best on the "f1" metric
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # log every 50 steps
    logging_steps=50,
)

In [24]:
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for an evaluation pass.

    ``eval_pred`` unpacks into (logits, labels). The predicted class of each
    row is the argmax of its logits along axis 1.

    Returns a dict with the single key "f1".
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    weighted_f1 = f1_score(labels, predicted_classes, average="weighted")
    return {"f1": weighted_f1}

In [25]:
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [31]:
# Silence all Python warnings for the remainder of the session.
warnings.filterwarnings("ignore")
# Load the pretrained checkpoint with a 2-class classification head; the log
# output reports that the classifier weights are newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
warnings.filterwarnings("ignore")
# Wire the model, training configuration, datasets and metric function together.
# NOTE(review): `model` is created in a separate cell, so re-running only this
# cell keeps training the already fine-tuned weights. The very low epoch-1
# training loss in the recorded output may be a symptom of that; confirm
# with a fresh Restart & Run All.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # NOTE(review): newer transformers releases appear to prefer
    # `processing_class=` over `tokenizer=`; confirm against the installed version.
    tokenizer=tokenizer,          # optional but helps with logging & saving
    compute_metrics=compute_metrics,
)

warnings.filterwarnings("ignore")
# Fine-tune, then score the resulting model on the validation set.
trainer.train()
eval_results = trainer.evaluate()
print("Results: ", eval_results)

Epoch,Training Loss,Validation Loss,F1
1,0.0211,1.358124,0.802079
2,0.0205,1.392254,0.811499
3,0.0201,1.553133,0.793653


Results:  {'eval_loss': 1.3922536373138428, 'eval_f1': 0.8114985778823385, 'eval_runtime': 1.0559, 'eval_samples_per_second': 1442.342, 'eval_steps_per_second': 45.458, 'epoch': 3.0}


In [49]:
# Persist the trainer's current model to disk.
trainer.save_model("/content/roberta_tweets_model")

<h3>Model Testing</h3>

In [36]:
# Tokenize the test tweets with the same options used for the training data.
test_encodings = tokenizer(test_tweets, padding=True, truncation=True, return_tensors='pt')

# The test set has no labels; TweetDataset still needs one per sample, so use zeros.
placeholder_labels = [0] * len(test_tweets)
test_dataset = TweetDataset(test_encodings, placeholder_labels)

In [55]:
# Run the fine-tuned model over the test set and take its raw logits.
predictions = trainer.predict(test_dataset)
logits = predictions.predictions

# Softmax must be taken per row (axis=1). SciPy's default axis=None normalizes
# over the whole array, so each row would not be a per-sample probability
# distribution.
probs = softmax(logits, axis=1)

# Predicted class = most probable class of each row.
predicted_labels = np.argmax(probs, axis=1)
print(predicted_labels)

[1 1 1 ... 1 1 1]


In [56]:
# Submission frame: one row per test tweet, pairing its id with the predicted label.
final_df = pd.DataFrame({"id": test_data["id"], "target": predicted_labels})

In [58]:
# Write the submission file without the DataFrame index column.
final_df.to_csv("submission.csv", index=False)