In [32]:
import os
import numpy as np
import pandas as pd
import torch
import torchtext
from pathlib import Path
import matplotlib.pyplot as plt 
import sklearn.metrics as m
from transformers import BertTokenizer, BertModel 
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')
torch.__version__ , torchtext.__version__

('1.7.0', '0.8.0a0+cd6902d')

In [2]:
PATH = Path("/kaggle/input/nlp-disaster-tweets-eda")
os.listdir(PATH)

['__results__.html',
 'train_clean.csv',
 'test_clean.csv',
 '__notebook__.ipynb',
 '__results___files',
 '__output__.json',
 'train.csv',
 'test.csv',
 'custom.css']

In [3]:
#importamos los datos de entrenamientos

train = pd.read_csv(PATH/"train.csv")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
# Importamos el transformers de Bert
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# Los transformers estan limitados en cuanto a la longitud de palabras a leer y es por ello que deberemos tenerlo en cuenta 

max_input_length = tokenizer.max_model_input_sizes["bert-base-uncased"] # Bert trabaja con 512.

def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2] #Deberemos restarle dos por los tokens unk y pad
    return tokens

In [6]:
ID = torchtext.data.RawField()
KEYWORD = torchtext.data.RawField()
LOCATION = torchtext.data.RawField()
TEXT = torchtext.data.Field(batch_first=True,# Dimension del batch en la primera dimension
                            use_vocab=False,
                            tokenize= tokenize_and_cut,
                            preprocessing = tokenizer.convert_tokens_to_ids,
                            init_token = tokenizer.cls_token_id,
                            eos_token = tokenizer.sep_token_id,
                            pad_token = tokenizer.pad_token_id,
                            unk_token = tokenizer.unk_token_id)
 
LABEL = torchtext.data.LabelField(dtype=torch.long)

dataset = torchtext.data.TabularDataset(
    path=PATH / 'train.csv',
    format = "CSV",
    fields = [("id",ID),("keyword",KEYWORD),("location",LOCATION),("text",TEXT),("target",LABEL)],
    skip_header=True
)

In [7]:
len(dataset)

7613

In [8]:
ix=0
print(vars(dataset.examples[ix]))

{'id': '1', 'keyword': '', 'location': '', 'text': [2256, 15616, 2024, 1996, 3114, 1997, 2023, 1001, 8372, 2089, 16455, 9641, 2149, 2035], 'target': '1'}


In [9]:
train_dataset, valid_dataset = dataset.split(
    split_ratio = 0.6,
    stratified =True,
    strata_field="target"
)
len(train_dataset), len(valid_dataset)

(4568, 3045)

In [13]:
# Hacemos el vocabulario de las etiquetas, 0 o 1
LABEL.build_vocab(train_dataset)
len(LABEL.vocab)

2

In [14]:
BATCH_SIZE = 64
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
dataloader = {
    "train": torchtext.data.BucketIterator(train_dataset, batch_size=BATCH_SIZE, shuffle=True, device=DEVICE),
    "val":  torchtext.data.BucketIterator(valid_dataset, batch_size=200, device=DEVICE) # Ponemos un mayor batch_size porque no tenemos que calcular gradientes y podremos ir mas rapido
    
}

## Modelo de LSTM cargando embeddings

In [29]:
# CONSTRUIMOS RED NEURONAL
class BERT(torch.nn.Module):
    def __init__(self, hidden_size=128, num_layers=2, n_outputs=2, bidirectional=False, dropout=0):
        super().__init__()
        self.bert= BertModel.from_pretrained("bert-base-uncased") # Calculará los embeding
        
        for name, param in self.bert.named_parameters(): # Le dice a todas capas de Bert que no aplique gradientes
            param.requires_grad=False
        
        self.rnn = torch.nn.LSTM( # Cargamos una red LSTM
            input_size= self.bert.config.to_dict()["hidden_size"], # la funcion config nos permite sacar las dimensiones de la capa oculta para pasarsela a nuestro modelo
            hidden_size = hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout= dropout,
            batch_first=True
        )
        self.fc = torch.nn.Linear(2*hidden_size if bidirectional else hidden_size, n_outputs)
    
    def forward(self, text):
        with torch.no_grad(): # Que no calcule gradientes 
            embedded = self.bert(text)[0]
        output, _ = self.rnn(embedded) # La capa de RNN nos devolvera los outputs y el valor del ultimo hidden state que no lo queremos para nada
        # Dimensiones en RNN y torchtext son ["batch_size","longitud del texto","hidden_size"]
        return self.fc(output[:,-1,:].squeeze(1)) # output[:,-1,:] Nos quedaremos con la ultima palabra, todo el batch_size y el hidden_size osea tenemos [1,64,128] dimensiones luego se lo pasaremos a la capa lineal quitandole la ultima palabra con la funcion squeeze(0)       

In [34]:
# CREAMOS LA FUNCION PARA ENTRENAR NUESTRO MODELO

def fit(model, dataloader, epochs = 10, lr=1e-3):
    
    model.to(DEVICE) #Mandamos nuestro modelo a la gpu
    criterion = torch.nn.CrossEntropyLoss() # Funcion de perdida CrossEntropyLoss por que queremos mas de una salida sino seria BCE
    optimizer = torch.optim.Adam(model.parameters(), lr) # Optimizador con los parametros del modelo y añadimos nuestra lr
    
    hist = {"loss" : [], "f1": [], "val_loss":[],"val_f1":[]} # Guardaremos las metricas
    best_f1 = 0.
    for e in range(1, epochs+1):
        
        # ENTRENAMOS
        model.train() # Ponemos el model en modo entrenamiento
        l, f1s = [], []
        bar = tqdm(dataloader["train"])
        for batch in bar:
            optimizer.zero_grad() # Ponemos a cero los gradientes
            y_pred = model(batch.text) # Calculamos salida pasando al modelo el texto
            loss = criterion(y_pred, batch.target) # calulo loss functions
            l.append(loss.item()) # guardamos loss function
            loss.backward()#calculo gradiente
            optimizer.step()# actualizo los pesos
            y_pred= torch.argmax(y_pred, axis=1)# cogera el indice del valor mas grande, es decir dará 0 o 1
            f1s.append(m.f1_score(batch.target.cpu(), y_pred.cpu()))
            bar.set_description(f"loss {np.mean(l):.5f} f1 {np.mean(f1s):.5f}")
        hist["loss"].append(np.mean(l))
        hist["f1"].append(np.mean(f1s))
        
        #EVALUAMOS
        model.eval()
        l,acc,f1s = [],[],[]
        with torch.no_grad():
            pg_bar = tqdm(dataloader["val"])
            for batch in pg_bar:
                y_pred = model(batch.text)
                loss = criterion (y_pred, batch.target)
                l.append(loss.item())
                y_pred = torch.argmax(y_pred, axis=1)
                f1s.append(m.f1_score(batch.target.cpu(), y_pred.cpu())) # Si trabajas con el paquete sklearn los datos deben estar en la cpu y en formato numpy
                bar.set_description(f"val_loss {np.mean(l):.5f} val_f1 {np.mean(f1s):.5f}")
        hist["val_loss"].append(np.mean(l))
        hist["val_f1"].append(np.mean(f1s))
        # CALLBACKS SAVE BEST MODEL
        if hist["val_f1"][-1] > best_f1:
            best_f1 = hist["val_f1"][-1]
            torch.save(model.state_dict(),"ckpt.pt")
        print(f'Epoch {e}/{epochs} loss:{hist["loss"][-1]:.5f} f1:{hist["f1"][-1]:.5f} val_loss:{hist["val_loss"][-1]:.5f} val_f1:{hist["val_f1"][-1]:.5f}')
    model.load_state_dict(torch.load("ckpt.pt"))
    return hist

In [35]:
# INSTANCIAMOS LA RNN Y ENTRENAMOS
model = BERT()


hist = fit(model, dataloader)

loss 0.65698 f1 0.41948: 100%|██████████| 72/72 [00:09<00:00,  7.43it/s]
100%|██████████| 16/16 [00:05<00:00,  2.77it/s]
loss 0.66474 f1 0.64516:   1%|▏         | 1/72 [00:00<00:09,  7.24it/s]

Epoch 1/10 loss:0.65698 f1:0.41948 val_loss:0.65019 val_f1:0.62051


loss 0.64062 f1 0.51245: 100%|██████████| 72/72 [00:09<00:00,  7.48it/s]
100%|██████████| 16/16 [00:05<00:00,  2.81it/s]
loss 0.62387 f1 0.08000:   1%|▏         | 1/72 [00:00<00:11,  6.03it/s]

Epoch 2/10 loss:0.64062 f1:0.51245 val_loss:0.64518 val_f1:0.05085


loss 0.64984 f1 0.43493: 100%|██████████| 72/72 [00:09<00:00,  7.45it/s]
100%|██████████| 16/16 [00:05<00:00,  2.79it/s]
loss 0.67000 f1 0.64000:   1%|▏         | 1/72 [00:00<00:10,  6.57it/s]

Epoch 3/10 loss:0.64984 f1:0.43493 val_loss:0.65432 val_f1:0.66300


loss 0.57899 f1 0.61449: 100%|██████████| 72/72 [00:09<00:00,  7.48it/s]
100%|██████████| 16/16 [00:05<00:00,  2.82it/s]
loss 0.48948 f1 0.71538:   1%|▏         | 1/72 [00:00<00:09,  7.37it/s]

Epoch 4/10 loss:0.57899 f1:0.61449 val_loss:0.47913 val_f1:0.76876


loss 0.46804 f1 0.75475: 100%|██████████| 72/72 [00:09<00:00,  7.54it/s]
100%|██████████| 16/16 [00:05<00:00,  2.77it/s]
loss 0.55805 f1 0.72727:   1%|▏         | 1/72 [00:00<00:10,  6.57it/s]

Epoch 5/10 loss:0.46804 f1:0.75475 val_loss:0.48693 val_f1:0.77867


loss 0.43698 f1 0.76622: 100%|██████████| 72/72 [00:09<00:00,  7.46it/s]
100%|██████████| 16/16 [00:05<00:00,  2.79it/s]
loss 0.41806 f1 0.71111:   1%|▏         | 1/72 [00:00<00:09,  7.16it/s]

Epoch 6/10 loss:0.43698 f1:0.76622 val_loss:0.41340 val_f1:0.77507


loss 0.40911 f1 0.77729: 100%|██████████| 72/72 [00:09<00:00,  7.52it/s]
100%|██████████| 16/16 [00:05<00:00,  2.81it/s]
loss 0.34006 f1 0.85185:   1%|▏         | 1/72 [00:00<00:13,  5.38it/s]

Epoch 7/10 loss:0.40911 f1:0.77729 val_loss:0.41123 val_f1:0.78341


loss 0.38647 f1 0.80216: 100%|██████████| 72/72 [00:09<00:00,  7.48it/s]
100%|██████████| 16/16 [00:05<00:00,  2.80it/s]
loss 0.35312 f1 0.75556:   1%|▏         | 1/72 [00:00<00:10,  6.96it/s]

Epoch 8/10 loss:0.38647 f1:0.80216 val_loss:0.40232 val_f1:0.77110


loss 0.38150 f1 0.80011: 100%|██████████| 72/72 [00:09<00:00,  7.46it/s]
100%|██████████| 16/16 [00:05<00:00,  2.79it/s]
loss 0.33675 f1 0.76190:   1%|▏         | 1/72 [00:00<00:10,  6.58it/s]

Epoch 9/10 loss:0.38150 f1:0.80011 val_loss:0.40543 val_f1:0.78831


loss 0.36471 f1 0.80490: 100%|██████████| 72/72 [00:09<00:00,  7.47it/s]
100%|██████████| 16/16 [00:05<00:00,  2.74it/s]


Epoch 10/10 loss:0.36471 f1:0.80490 val_loss:0.41764 val_f1:0.78997


In [None]:
def plot(hist): # Funcion para graficar nuestras metricas 
    fig = plt.figure(dpi = 200, figsize= (10,3))
    ax = plt.subplot(121)
    hist = pd.DataFrame(hist)
    hist[["loss","val_loss"]].plot(ax=ax, grid=True)
    ax = plt.subplot(122)
    hist[["f1", "val_f1"]].plot(ax=ax, grid=True)
    plt.show()
plot(hist)

Estos malos resultados es porque las RNN sencillas no funcionan bien con longuitudes grandes. Van bien con secuancias de texto de 10-20 pero en nuestro caso podemos tener unas secuencias mayor a 100, entonces estas redes tan sencillas fallan mucho. Deberemos buscar mejores modelos como LSTM, bidireccionales o transformers

## TEST

In [36]:
# Creamos nuestros dataset de test
test_dataset = torchtext.data.TabularDataset(
    path=PATH/'test.csv',
    format = "CSV",
    fields = [("id",ID),("keyword",KEYWORD),("location",LOCATION),("text",TEXT)],
    skip_header=True
)
len(test_dataset)

3263

In [37]:
# Comprobamos si es testo correcto
ix=3258
print(vars(test_dataset.examples[ix]))

{'id': '10861', 'keyword': '', 'location': '', 'text': [8372, 3808, 3050, 3349, 1057, 2080, 3808, 3435, 24454, 2015, 1060, 2099, 7962]}


In [39]:
test_dataloader = torchtext.data.BucketIterator(test_dataset, batch_size=BATCH_SIZE, shuffle=False, device=DEVICE)

In [40]:
def predict():
    model.eval()
    preds = torch.tensor([]).to(DEVICE)
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            y_pred=model(batch.text)
            y_pred = torch.argmax(y_pred, axis=1)
            preds = torch.cat([preds, y_pred])
    return preds

In [41]:
preds = predict()
preds

100%|██████████| 51/51 [00:05<00:00,  8.59it/s]


tensor([1., 1., 1.,  ..., 1., 1., 1.], device='cuda:0')

In [43]:
submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
submission.target = preds.cpu().long()# Me traigo mis pres a la cpu 
submission.to_csv("submission.csv", index=False)