# Detecting disasters from Twitter data: BERT Supplement
### Ernesto Monroy (CID 01010397)
#### 31st of August 2020
#### Imperial College Business School: MSc in Business Analytics

This notebook complements the Business Analytics Report of the same title.


In [29]:
import pandas as pd
# Show up to 200 characters per column so long tweets are not cut short.
pd.set_option('display.max_colwidth', 200)
# Read the labelled tweets and display the frame as the cell output.
df = pd.read_csv('train.csv')
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1


### Cleaning

In [33]:
import string
import re
url_r = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'

# Patterns are compiled once here rather than on every clean_text call.
_URL_PATTERN = re.compile(url_r)
_HTML_PATTERN = re.compile('<.*?>')
# Only symbol/pictograph blocks are listed. A single wide range such as
# U+24C2..U+1F251 would also cover CJK, Hangul and other scripts and would
# therefore delete ordinary (non-emoji) text.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2"             # circled M
    u"\U00002600-\U000026FF"  # miscellaneous symbols
    u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    u"\U0000FE00-\U0000FE0F"  # variation selectors
    u"\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
    "]+", flags=re.UNICODE)
_MENTION_PATTERN = re.compile(r'@\w+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(in_text):
    """Strip noise from a single tweet.

    The steps run in this order: remove URLs, remove HTML tags, remove
    emoji/pictographs, replace every @mention with the word ``someone``,
    and finally delete all ASCII punctuation.

    Args:
        in_text: The tweet text as a ``str``.

    Returns:
        The cleaned text. Whitespace is left untouched, so removed items can
        leave consecutive spaces behind.
    """
    # Remove URLs
    result = _URL_PATTERN.sub(r'', in_text)
    # Remove html
    result = _HTML_PATTERN.sub(r'', result)
    # Remove emojis
    result = _EMOJI_PATTERN.sub(r'', result)
    # Replace mentions (must happen before '@' is stripped as punctuation)
    result = _MENTION_PATTERN.sub(r'someone', result)
    # Remove punctuation; this also drops '#', so hashtags survive as plain words
    result = result.translate(_PUNCTUATION_TABLE)
    return result


# Lower-case the raw tweets, then clean each one individually.
lowercased_tweets = df['text'].str.lower()
df['clean'] = lowercased_tweets.apply(clean_text)

### Spell Check

In [None]:
from spellchecker import SpellChecker
spell = SpellChecker()

def spell_check(in_tokens):
    """Replace words unknown to the spell checker with its suggested correction.

    Args:
        in_tokens: Either a whitespace-separated string (as stored in
            ``df['clean']``) or an iterable of word tokens.

    Returns:
        A single string with the words joined by one space. A word is kept
        unchanged when it is known, or when the checker offers no correction.
    """
    # Accept both a raw string and an already tokenized sequence.
    words = in_tokens.split() if isinstance(in_tokens, str) else list(in_tokens)
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() can return None when it finds no candidate;
            # fall back to the original word so the join below cannot fail.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

# Keep the spell-corrected text in its own column next to the cleaned text.
cleaned_tweets = df['clean']
df['spell_checked'] = cleaned_tweets.apply(spell_check)

### Tokenizing

At the moment we tokenize the cleaned text directly, without spell checking, because we don't yet know whether the spell check improves the results.

In [34]:
import nltk
# Split each cleaned tweet into a list of word tokens.
df['tokens'] = df['clean'].apply(nltk.word_tokenize)

### POS Tags

In [36]:
# Tag every token list, producing a list of (word, tag) pairs per tweet.
token_lists = df['tokens']
df['pos'] = token_lists.apply(nltk.pos_tag)

### Remove stop words

In [37]:
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

def remove_stops(in_pos):
    """Filter English stop words out of a tagged token list.

    Args:
        in_pos: Sequence of pairs whose first element is the word.

    Returns:
        A new list holding, in the original order, only the pairs whose
        word is not in ``stops``.
    """
    return [pair for pair in in_pos if pair[0] not in stops]

# Overwrite the tagged tokens with their stop-word-free version.
tagged_tokens = df['pos']
df['pos'] = tagged_tokens.apply(remove_stops)

### Lemmatizing

In [38]:
from nltk import WordNetLemmatizer 
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

# Maps the first letter of an NLTK POS tag to the WordNet POS constant
# that the lemmatizer expects.
tag_translate = dict(
    J=wordnet.ADJ,
    N=wordnet.NOUN,
    V=wordnet.VERB,
    R=wordnet.ADV,
)

def lemmatize(in_pos):
    """Lemmatize each word of a tagged token list.

    Args:
        in_pos: Sequence of (word, tag) pairs.

    Returns:
        A list of (lemma, tag) pairs in the same order; the tag is passed
        through unchanged.
    """
    lemma_pairs = []
    for pair in in_pos:
        word, tag = pair[0], pair[1]
        # Tags whose first letter has no mapping are treated as nouns.
        wordnet_pos = tag_translate.get(tag[0], wordnet.NOUN)
        lemma_pairs.append((lemmatizer.lemmatize(word, wordnet_pos), tag))
    return lemma_pairs
        
# Lemmatize the stop-word-free tagged tokens of every tweet.
filtered_tokens = df['pos']
df['lemma'] = filtered_tokens.apply(lemmatize)

### Vectorizing

First, re-join the lemmatized tokens of each tweet into a single text string.

In [39]:
# Keep only the lemma of each (lemma, tag) pair and join them with spaces.
df['final_text'] = df['lemma'].apply(
    lambda pairs: ' '.join(pair[0] for pair in pairs)
)

# Test Train Splitting

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a bag-of-words vocabulary on the final text of every row.
# NOTE(review): this fit uses the whole frame, i.e. it runs before the
# train/validation split below, and `vectorizer` is not referenced again in
# the visible cells — confirm whether it is still needed.
vectorizer = CountVectorizer()
vectorizer.fit(df['final_text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

Because the BERT model is trained in a separate notebook, we fix the random seed so that the train/validation split is identical in both notebooks.

In [41]:
from sklearn.model_selection import train_test_split
# numpy is imported here because the notebook's only other numpy import sits in
# a later cell; without it this cell raises NameError on a fresh kernel.
import numpy as np

# train_test_split is called without random_state, so the shuffle draws from
# numpy's global random generator; seeding that generator fixes the split.
np.random.seed(43)
X_train, X_val, y_train, y_val = train_test_split(df['final_text'], df['target'], test_size=0.2)

# Model Training

In [43]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
def print_scores(y_true, y_predicted):
    """Print the F1 score and then the accuracy, one value per line.

    Args:
        y_true: Ground-truth labels.
        y_predicted: Predicted labels, aligned with ``y_true``.
    """
    for metric in (f1_score, accuracy_score):
        print(metric(y_true, y_predicted))

## BERT

### Get the Model

In [20]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import tensorflow_hub as hub
import tensorflow as tf
import ssl
# Accept unverified certificates
# Accept unverified certificates.
# WARNING: this turns off HTTPS certificate verification for every default
# context created afterwards in this process, not only for the model download.
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Pretrained uncased English BERT (24 layers, hidden size 1024, 16 heads),
# loaded as a trainable layer so its weights are fine-tuned with the classifier.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

# NOTE(review): max_tokens is not assigned in any visible cell. It must be
# defined before this cell runs, and the same value is passed to bert_encode
# as max_len — confirm where it is set.
input_word_ids = Input(shape=(max_tokens,), dtype=tf.int32, name="input_word_ids")
input_mask = Input(shape=(max_tokens,), dtype=tf.int32, name="input_mask")
segment_ids = Input(shape=(max_tokens,), dtype=tf.int32, name="segment_ids")

# The layer returns two outputs; only the per-token sequence output is used.
_, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
# Position 0 of every sequence is the "[CLS]" token added by bert_encode.
clf_output = sequence_output[:, 0, :]
# A single sigmoid unit gives the probability of the positive class.
out = Dense(1, activation='sigmoid')(clf_output)

BERTmodel = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
# `learning_rate` replaces the deprecated `lr` alias of the Adam optimizer.
BERTmodel.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

### Tokenize the data for BERT

In [44]:
import tokenization
import numpy as np
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into the three fixed-length BERT input arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    "[CLS]" ... "[SEP]" and right-padded with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of numpy arrays with one
        row per text. ``masks`` holds 1 for real tokens and 0 for padding;
        ``segment_ids`` is all zeros.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    # Two positions are reserved for the [CLS] and [SEP] markers.
    budget = max_len - 2

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:budget]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        real_count = len(sequence)
        padding = [0] * (max_len - real_count)

        token_rows.append(tokenizer.convert_tokens_to_ids(sequence) + padding)
        mask_rows.append([1] * real_count + padding)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

# Build the tokenizer from the vocabulary file and casing flag that the
# loaded hub module exposes.
bert_assets = bert_layer.resolved_object
tokenizer = tokenization.FullTokenizer(
    bert_assets.vocab_file.asset_path.numpy(),
    bert_assets.do_lower_case.numpy()
)

# Encode the training split first, then the validation split, with the same
# tokenizer and sequence length.
X_train_bert, X_val_bert = (
    bert_encode(split_texts, tokenizer, max_len=max_tokens)
    for split_texts in (X_train, X_val)
)

### Tune BERT

In [45]:
# Fine-tune for 5 epochs in batches of 16; Keras holds out 20% of the
# training arrays for validation during fitting.
fit_options = dict(validation_split=0.2, epochs=5, batch_size=16)
history = BERTmodel.fit(X_train_bert, y_train, **fit_options)

Train on 4872 samples, validate on 1218 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Score BERT

In [47]:
# Predict on the held-out validation split, round the flattened sigmoid
# outputs to 0/1 labels, and print F1 followed by accuracy.
y_predict = BERTmodel.predict(X_val_bert)
y_predict_labels = y_predict.flatten().round(0)
print_scores(y_val, y_predict_labels)

0.8894192521877488
0.9087327642810243
