## Importing data and libraries

In [2]:
from fastai.text.all import *
import pandas as pd
import string
import numpy as np
import re

In [6]:
df_train = pd.read_csv('data/train.csv')

## Data Exploration

In [7]:
df_train.head(20)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1
8,14,,,There's an emergency evacuation happening now in the building across the street,1
9,15,,,I'm afraid that the tornado is coming to our area...,1


In [8]:
df_test=pd.read_csv('data/test.csv')
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTENERS XrWn
3259,10865,,,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it was bombed. Around 20000K still without power
3260,10868,,,Green Line derailment in Chicago http://t.co/UtbXLcBIuY
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3


## Data Wrangling

In [9]:
df_train['keyword'] = df_train['keyword'] + " "
df_train['location'] = df_train['location'] + " "

In [10]:
df_train['keyword'] = df_train['keyword'].fillna("")
df_train['location'] = df_train['location'].fillna("")

In [11]:
df_train['text'] = df_train['keyword'] + " " + df_train['location']  + " " +  df_train['text']

In [12]:
df_train[31:40]

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,ablaze Birmingham @bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C,1
32,49,ablaze,Est. September 2012 - Bristol,ablaze Est. September 2012 - Bristol We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw,0
33,50,ablaze,AFRICA,ablaze AFRICA #AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi,1
34,52,ablaze,"Philadelphia, PA","ablaze Philadelphia, PA Crying out for more! Set me ablaze",0
35,53,ablaze,"London, UK","ablaze London, UK On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE http://t.co/qqsmshaJ3N",0
36,54,ablaze,Pretoria,ablaze Pretoria @PhDSquares #mufc they've built so much hype around new acquisitions but I doubt they will set the EPL ablaze this season.,0
37,55,ablaze,World Wide!!,ablaze World Wide!! INEC Office in Abia Set Ablaze - http://t.co/3ImaomknnA,1
38,56,ablaze,,ablaze Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende... http://t.co/wDUEaj8Q4J,1
39,57,ablaze,Paranaque City,ablaze Paranaque City Ablaze for you Lord :D,0


Here we look into: 
- Reducing characters to lowercase
- Removing mentions (@) and hashtags (#)
- Removing external links to other websites
- Removing unicode for emojis

In [13]:
emoticons_happy = {':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}', ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D', '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P', 'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)', '<3', ':d'}
emoticons_sad = {':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<', ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c', ':c', ':{', '>:\\', ';('}
emotes = emoticons_happy.union(emoticons_sad)

# ===== TRANSFORM METHODS =====

def remove_url(text):
    """Given string, remove url by regex."""
    # url = re.compile(r'https?://\S+|www\.\S+')  # Axel
    url = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')  # Tom
    return url.sub(r'',text)

def remove_html(text):
    """Given string, remove html by regex."""
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_digits(text):
    """Given string, remove digits."""
    text = ''.join([i for i in text if not i.isdigit()])
    return text

def remove_punctuations(text):
    """Given string, remove punctuations."""
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

def transform_lower_chars(text):
    """Given string, transform into lower characters."""
    return str(text).lower()

def remove_emojis(text):
    """Given text, remove emojis."""
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def preprocess(df):
    # Transformations
    df['cleantext'] = df['text'].apply(remove_emojis)
    df['cleantext'] = df['cleantext'].apply(remove_html)
    df['cleantext'] = df['cleantext'].apply(remove_url)
    # df_train['cleantext'] = df_train['cleantext'].apply(transform_lower_chars)
    # df_train['cleantext'] = df_train['cleantext'].apply(remove_digits)
    # df_train['cleantext'] = df_train['cleantext'].apply(remove_punctuations)
    

    return df

In [14]:
df_train = preprocess(df_train)

In [15]:
df_train['text'] = df_train['cleantext']

In [16]:
df_train[31:40]

Unnamed: 0,id,keyword,location,text,target,cleantext
31,48,ablaze,Birmingham,ablaze Birmingham @bbcmtd Wholesale Markets ablaze,1,ablaze Birmingham @bbcmtd Wholesale Markets ablaze
32,49,ablaze,Est. September 2012 - Bristol,ablaze Est. September 2012 - Bristol We always try to bring the heavy. #metal #RT,0,ablaze Est. September 2012 - Bristol We always try to bring the heavy. #metal #RT
33,50,ablaze,AFRICA,ablaze AFRICA #AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba.,1,ablaze AFRICA #AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba.
34,52,ablaze,"Philadelphia, PA","ablaze Philadelphia, PA Crying out for more! Set me ablaze",0,"ablaze Philadelphia, PA Crying out for more! Set me ablaze"
35,53,ablaze,"London, UK","ablaze London, UK On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE",0,"ablaze London, UK On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE"
36,54,ablaze,Pretoria,ablaze Pretoria @PhDSquares #mufc they've built so much hype around new acquisitions but I doubt they will set the EPL ablaze this season.,0,ablaze Pretoria @PhDSquares #mufc they've built so much hype around new acquisitions but I doubt they will set the EPL ablaze this season.
37,55,ablaze,World Wide!!,ablaze World Wide!! INEC Office in Abia Set Ablaze -,1,ablaze World Wide!! INEC Office in Abia Set Ablaze -
38,56,ablaze,,ablaze Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...,1,ablaze Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...
39,57,ablaze,Paranaque City,ablaze Paranaque City Ablaze for you Lord :D,0,ablaze Paranaque City Ablaze for you Lord :D


In [17]:
desc = df_train.describe(include = 'all')

desc.loc['dtype'] = df_train.dtypes
desc.loc['null values'] = df_train.isnull().sum()

desc

Unnamed: 0,id,keyword,location,text,target,cleantext
count,7613,7613,7613,7613,7613,7613
unique,,222,3342,7332,,7332
top,,,,sandstorm USA Watch This Airport Get Swallowed Up By A Sandstorm In Under A Minute,,sandstorm USA Watch This Airport Get Swallowed Up By A Sandstorm In Under A Minute
freq,,61,2533,17,,17
mean,5441.93,,,,0.42966,
std,3137.12,,,,0.49506,
min,1,,,,0,
25%,2734,,,,0,
50%,5408,,,,0,
75%,8146,,,,1,


## Language Model Using DataBlock

In [23]:
dls_clas = DataBlock(
    blocks=(TextBlock.from_df('text'), CategoryBlock),
    get_x=ColReader('text'), get_y=ColReader('target'), splitter=RandomSplitter(0.1)).dataloaders(df_train, bs=64, seq_len=76)


  return array(a, dtype, copy=False, order=order)


In [28]:
dls_clas.show_batch(max_n=2)

Unnamed: 0,text,category
0,xxbos mayhem ? ? xxmaj made in the xxmaj philippines ? ? _ \n▁ xxrep 5 ? xxup retweet \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup follow xxup all xxup who xxup rt \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup xxunk \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup gain xxup with \n▁ xxrep 7 ? \n▁ xxrep 5 ? xxup follow ? xxunk # xxup xxunk \n▁ # xxup ty,0
1,"xxbos curfew xxmaj adelaide , xxmaj australia xxup info xxup r. xxup curfew xxup in xxup oper xxup until 2030 xxup z. xxup taxiways xxup foxtrot 5 & & xxup foxtrot 6 xxup navbl . xxup wnd : xxunk / 5 . xxup exp xxup inst xxup apch . xxup rwy 05 . xxup xxunk . xxup tmp : 10 . xxup xxunk : xxunk .",0


In [24]:
dls = TextDataLoaders.from_df(df_train, path=path, text_col='text', is_lm=True)


  return array(a, dtype, copy=False, order=order)


In [27]:
dls.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos wild%20fires xxmaj johannesburg , xxmaj south xxmaj africa xxmaj they should just have load shedding xxunk and we will all just live like we in the wild and have camp fires cook with fire etc xxbos screamed ny xxunk just face xxunk me at the concert and just screamed for 2 minutes straight xxbos evacuated xxmaj good thing there was actually just a legit fire in the mall and nobody evacuated","wild%20fires xxmaj johannesburg , xxmaj south xxmaj africa xxmaj they should just have load shedding xxunk and we will all just live like we in the wild and have camp fires cook with fire etc xxbos screamed ny xxunk just face xxunk me at the concert and just screamed for 2 minutes straight xxbos evacuated xxmaj good thing there was actually just a legit fire in the mall and nobody evacuated !"
1,drowned as migrant boat capsizes off xxmaj libya xxbos exploded that exploded & & brought about the \n beginning of universe matches what 's \n mentioned in the xxunk heaven and xxmaj earth \n ( thus the universe ) xxbos drowned xxmaj india xxmaj hundreds of migrants feared drowned off xxmaj libya : xxmaj migrants stand next to their tent at a camp set near xxmaj xxunk _ xxbos body%20bags xxmaj fife,"as migrant boat capsizes off xxmaj libya xxbos exploded that exploded & & brought about the \n beginning of universe matches what 's \n mentioned in the xxunk heaven and xxmaj earth \n ( thus the universe ) xxbos drowned xxmaj india xxmaj hundreds of migrants feared drowned off xxmaj libya : xxmaj migrants stand next to their tent at a camp set near xxmaj xxunk _ xxbos body%20bags xxmaj fife ,"


In [25]:
b = dls.one_batch()

In [26]:
len(b), b[0].shape, b[1].shape

(2, torch.Size([64, 72]), torch.Size([64, 72]))

## Fine-Tuning the Language Model

In [29]:
learn = language_model_learner(
    dls, AWD_LSTM, drop_mult=0.3, 
    metrics=[accuracy, Perplexity()]).to_fp16()

In [30]:
learn.summary()

SequentialRNN (Input shape: 64 x 72)
Layer (type)         Output Shape         Param #    Trainable 
                     64 x 72 x 1152      
LSTM                                                           
LSTM                                                           
____________________________________________________________________________
                     64 x 72 x 400       
LSTM                                                           
RNNDropout                                                     
RNNDropout                                                     
RNNDropout                                                     
____________________________________________________________________________
                     64 x 72 x 5040      
Linear                                    2021040    True      
RNNDropout                                                     
____________________________________________________________________________

Total params: 2,021,040
Total

In [31]:
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.64058,4.165506,0.303235,64.425293,00:11


Once the initial training has completed, we can continue fine-tuning the model after unfreezing:

In [32]:
learn.unfreeze()
learn.fit_one_cycle(10, 2e-3)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.828093,3.937871,0.317982,51.309258,00:11
1,3.628109,3.711152,0.352151,40.900902,00:11
2,3.36596,3.628122,0.359191,37.642044,00:11
3,3.046889,3.646531,0.371701,38.341434,00:11
4,2.715771,3.783869,0.355614,43.985874,00:11
5,2.368103,3.829975,0.379508,46.061371,00:11
6,2.041645,3.957491,0.376417,52.325859,00:11
7,1.777197,4.038222,0.380296,56.725414,00:11
8,1.576767,4.084863,0.37754,59.433777,00:11
9,1.456168,4.100678,0.378199,60.381237,00:11


In [33]:
learn.save_encoder('finetuned')

## Model for classifying text

In [34]:
learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, 
                                metrics=accuracy).to_fp16()

In [35]:
learn = learn.load_encoder('../drive/MyDrive/Disaster Tweets/models/finetuned')

In [36]:
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.536231,0.467841,0.777924,00:05


In [37]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

epoch,train_loss,valid_loss,accuracy,time
0,0.490123,0.452059,0.791064,00:05


In [38]:
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.464186,0.432442,0.801577,00:06


In [39]:
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.410628,0.444314,0.795007,00:07
1,0.385166,0.455215,0.805519,00:07


In [40]:
learn.summary()

SequentialRNN (Input shape: 64 x 76)
Layer (type)         Output Shape         Param #    Trainable 
                     64 x 4 x 1152       
LSTM                                                           
LSTM                                                           
____________________________________________________________________________
                     64 x 4 x 400        
LSTM                                                           
RNNDropout                                                     
RNNDropout                                                     
RNNDropout                                                     
BatchNorm1d                               2400       True      
Dropout                                                        
____________________________________________________________________________
                     64 x 50             
Linear                                    60000      True      
ReLU                                       

In [41]:
preds,targs = learn.get_preds()

In [42]:
learn.show_results()

Unnamed: 0,text,category,category_
0,xxbos injury xxmaj xxunk xxmaj live xxmaj cricket xxmaj score xxmaj in xxmaj all xxmaj match xxmaj international \n xxmaj domestic \n xxmaj team xxmaj tour xxmaj team xxmaj squad \n xxmaj profile & & xxmaj injury \n xxmaj xxunk xxmaj xxunk xxmaj xxunk xxmaj update \n + \n follow \n xxunk,0,0
1,"xxbos floods xxmaj xxunk xxmaj district , xxmaj new xxmaj zealand xxunk xxmaj yep . xxmaj during floods of 1 xxrep 3 9 or 2 xxrep 3 0 - xxmaj xxunk xxmaj dam xxunk every bit of water it could . xxmaj most xxunk ! xxup xxunk charging your way ?",1,1
2,"xxbos rainstorm xxmaj xxunk xxmaj xxunk , xxmaj rio xxmaj grande do xxmaj xxunk xxmaj xxunk up sick with a rainstorm outside would usually make me sad . xxmaj not today though . xxmaj put some xxmaj the xxmaj xxunk on the stereo and let 's do this .",0,0
3,"xxbos riot xxmaj los xxmaj angeles , xxup ca xxunk xxmaj xxunk xxmaj xxunk xxmaj salvador xxmaj the xxmaj secret xxmaj tips to xxmaj get 100 . xxrep 3 0 xxmaj riot xxmaj points lol are out now ! check the xxmaj secret on on my xxmaj bio",0,0
4,"xxbos cliff%20fall xxmaj florida , xxup usa ' i 'm there ! ' xxmaj bride & & xxmaj groom on mountain cliff xxunk . xxmaj ha xxmaj ha just kidding . i xxup will xxup not xxup ever be there . xxmaj ha xxmaj ha -",0,0
5,xxbos hail xxup usa xxmaj strong xxmaj thunderstorm 4 xxmaj miles xxmaj north of xxmaj xxunk xxmaj moving xxup se xxmaj at 25 xxup mph . xxmaj large xxmaj hail and xxmaj wind xxmaj gusts xxmaj up to 50 xxup mph xxmaj xxunk … # arwx,1,1
6,xxbos hail xxmaj arkansas xxmaj strong xxmaj thunderstorm 4 xxmaj miles xxmaj north of xxmaj xxunk xxmaj moving xxup se xxmaj at 25 xxup mph . xxmaj large xxmaj hail and xxmaj wind xxmaj gusts xxmaj up to 50 xxup mph xxmaj xxunk … # arwx,1,1
7,xxbos earthquake xxmaj xxunk xxmaj republica xxmaj argentina # xxmaj sismo m 1.3 - 1 km xxup xxunk of xxmaj the xxmaj xxunk xxmaj california : xxmaj time2015 - 08 - 05 xxunk xxup utc2015 - 08 - 05 xxunk xxunk a … # xxup xxunk,0,1
8,xxbos thunderstorm xxmaj east xxmaj coast xxup the xxup national xxup weather xxup service xxup in xxup little xxup rock xxup has xxup issued a * xxup severe xxup thunderstorm xxup warning xxup for … xxup van xxup xxunk xxup county xxup in û _,1,1


## Generating results for test data

In [43]:
df_test['keyword'] = df_test['keyword'] + " "
df_test['location'] = df_test['location'] + " "
df_test['keyword'] = df_test['keyword'].fillna("")
df_test['location'] = df_test['location'].fillna("")
df_test['text'] = df_test['keyword'] + " " + df_test['location']  + " " +  df_test['text']
df_test = preprocess(df_test)
df_test['text'] = df_test['cleantext']

In [44]:
dls_test = DataBlock(
    blocks=(TextBlock.from_df('text')),
    get_x=ColReader('text')).dataloaders(df_test, bs=64, seq_len=76)

  return array(a, dtype, copy=False, order=order)


In [46]:
dls_test.show_batch(max_n=2)

Unnamed: 0,text
0,xxbos screamed earth ' # xxup xxunk ' i xxup swear xxup to xxup god i xxup did nt xxup even xxup read xxup it xxup xxunk xxup and i xxup though xxup it xxup said ' xxunk xxup life ' xxup and xxup screamed . i xxup need xxup to xxup sleep xxup xxunk . xxup xxunk .
1,xxbos rainstorm xxunk squad makes me happy xxup does xxup anyone xxup remember xxup when 5sos xxup came w xxunk xxup on xxup the xxup xxunk xxup xxunk xxup we xxup had a xxup giant xxup rainstorm xxup and xxup last xxup night xxup we xxup had xxup one xxup also . xxunk xxup stop xxup xxunk xxup rain


In [67]:
ss = pd.read_csv(path/'sample_submission.csv')
ss.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [72]:
text_list = list(df_test['text'].values)

In [73]:
prediction = [learn.predict(i)[0] for i in text_list]

In [74]:
ss['target'] = prediction
ss['target'] = ss['target'].astype(int)

In [75]:
ss.to_csv('submission.csv', index=False)