# Natural Language Processing with Disaster Tweets
This kernel predicts which Tweets are about disasters and which are not. A pretrained DistilBERT model is finetuned on the Kaggle disaster Tweets dataset.   
 

In [1]:
! pip install transformers
! pip install datasets
! pip install scipy sklearn

[0mCollecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.4/325.4 KB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: xxhash, responses, datasets
Successfully installed datasets-2.1.0 responses-0.18.0 xxhash-3.0.0
[0m

In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer
from transformers import create_optimizer
from transformers import TFAutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

In [3]:
training_data_file = "../input/nlp-getting-started/train.csv"
test_data_file = "../input/nlp-getting-started/test.csv"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [4]:
dataset = load_dataset('csv', data_files = [training_data_file])
dataset = dataset['train'].train_test_split(test_size=0.1)
dataset['valid'] = dataset['test']
dataset['test'] = load_dataset('csv', data_files = [test_data_file])['train']

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1201d457ad9363e2/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1201d457ad9363e2/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-49d36cb4f0593017/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-49d36cb4f0593017/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target'],
        num_rows: 6851
    })
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text'],
        num_rows: 3263
    })
    valid: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target'],
        num_rows: 762
    })
})

In [6]:
pd.DataFrame(dataset['train'])

Unnamed: 0,id,keyword,location,text,target
0,3648,destroy,New York City,Putin's plan to destroy Western food en masse ...,0
1,2486,collided,,We're happily collided :),0
2,1190,blizzard,Sydney,@Ashayo @MsMiggi Hi Ashayo! I believe there wi...,1
3,8177,rescuers,USA - Canada - Europe - Asia,VIDEO: 'We're picking up bodies from water': R...,1
4,3240,deluged,,Businesses are deluged with invoices. Make you...,1
...,...,...,...,...,...
6846,9246,sunk,18 | 509,I peeped you frontin' I was in the jeepåÊsunk ...,0
6847,5046,eyewitness,UK,RT patrickjbutler: Excellent damiengayle eyewi...,1
6848,1619,bombed,My old New England home,I liked a @YouTube video http://t.co/FX7uZZXtE...,0
6849,6420,hurricane,,@pattonoswalt @FoxNews Wait I thought Fecal Hu...,1


In [7]:
pd.DataFrame(dataset['valid'])

Unnamed: 0,id,keyword,location,text,target
0,2027,casualties,Heinz Field,There might be casualties tomorrow,1
1,7722,panicking,,all that panicking made me tired ;__; i want t...,1
2,8973,storm,#BossNation!,Finna storm. Fuck my back boutta start hurting...,0
3,5870,hailstorm,"Iliff,Colorado",Severe hailstorm in progress over Northeast Lo...,1
4,4499,emergency,New York,Survival Kit Whistle Fire Starter Wire Saw Cre...,1
...,...,...,...,...,...
757,10780,wreckage,Mumbai,Wreckage 'Conclusively Confirmed' as From MH37...,1
758,6703,lava,,My hands are cold but my feet are warm. That's...,0
759,6073,heat%20wave,Planet of da Bathing Apes,Heat wave gotta be over 9000 today,1
760,800,battle,NYC,YA BOY CLIP VS 4KUS FULL BATTLE\n\n@15MofeRadi...,0


In [8]:
pd.DataFrame(dataset['test'])

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [9]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

pre_tokenizer_cols = set(dataset["train"].features)
encoded = dataset.map(tokenize, batched=True)
tokenizer_cols = list(set(encoded["train"].features) - pre_tokenizer_cols)
print("New Columns:", tokenizer_cols)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

New Columns: ['attention_mask', 'input_ids']


In [10]:
encoded

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target', 'input_ids', 'attention_mask'],
        num_rows: 6851
    })
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'input_ids', 'attention_mask'],
        num_rows: 3263
    })
    valid: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target', 'input_ids', 'attention_mask'],
        num_rows: 762
    })
})

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

tf_train_dataset = encoded['train'].to_tf_dataset(
    columns=tokenizer_cols,
    label_cols=["target"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)

tf_validation_dataset = encoded['valid'].to_tf_dataset(
    columns=tokenizer_cols,
    label_cols=["target"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_test_dataset = encoded['test'].to_tf_dataset(
    columns=tokenizer_cols,
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_test_dataset

<PrefetchDataset shapes: {input_ids: (None, None), attention_mask: (None, None)}, types: {input_ids: tf.int64, attention_mask: tf.int64}>

In [12]:
model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

2022-04-25 22:16:52.056727: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint 

In [13]:
num_epochs = 2
batches_per_epoch = len(encoded["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
loss = loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [14]:

model.fit(tf_train_dataset,validation_data=tf_validation_dataset,epochs=num_epochs)

Epoch 1/2
 34/428 [=>............................] - ETA: 3:31 - loss: 0.6304 - accuracy: 0.6765

KeyboardInterrupt: 

In [None]:
test_pred = model.predict(tf_test_dataset)

In [None]:
submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
submission['target'] = np.argmax(test_pred.logits, axis=1)
submission.to_csv('submission.csv', index=False)