In [1]:
#Utilizacion de una GPU si esta disponible
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [2]:
#Instalacion de pytorch
!pip install pytorch-pretrained-bert pytorch-nlp

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 3.5MB/s 
[?25hCollecting pytorch-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/4f/51/f0ee1efb75f7cc2e3065c5da1363d6be2eec79691b2821594f3f2329528c/pytorch_nlp-0.5.0-py3-none-any.whl (90kB)
[K     |████████████████████████████████| 92kB 6.3MB/s 
Installing collected packages: pytorch-pretrained-bert, pytorch-nlp
Successfully installed pytorch-nlp-0.5.0 pytorch-pretrained-bert-0.6.2


In [3]:
#Imports necesarios para la ejecucion del codigo
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [4]:
#Preparacion de la GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla K80'

In [5]:
#Subimos los dos archivos necesarios para trabajar. Train y Test
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [None]:
#Creacion de los dos data frame
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test.drop(columns=['location', 'keyword'], inplace=True)


In [6]:
#Imports necesarios para limpiar correctamente el codigo
import pandas as pd
pd.set_option('max_colwidth', -1)
import string
import nltk
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec

  This is separate from the ipykernel package so we can avoid doing imports until


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Al igual que en otros notebooks, preprocesamiento previo
# Quitamos las urls
df['text'] = df['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)

# Quitamos user mentions, signos de puntuación, hashtags y stopwords.
def clean_text(text):
    words = text.lower().split(' ')
    text = ' '.join([word for word in words if not word.startswith('@') and word not in stopwords.words('english')])
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['text'] = df['text'].apply(clean_text)

In [None]:
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officers evacuation shelter place orders expected,1
3,6,,,13000 people receive wildfires evacuation orders california,1
4,7,,,got sent photo ruby alaska smoke wildfires pours school,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding bridge collapse nearby homes,1
7609,10870,,,control wild fires california even northern part state troubling,1
7610,10871,,,m194 0104 utc5km volcano hawaii,1
7611,10872,,,police investigating ebike collided car little portugal ebike rider suffered serious nonlife threatening injuries,1


In [None]:
#Bert necesita las oraciones que componen el texto de cada tweet pre-procesados de forma especial. Asi que extraemos los campos de oraciones de cada entrada
sentences = df.text.values

#Y les agregamos un separador antes y despues.
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.target.values

In [None]:
import torch

#Codigo para informar que se esta utilizando correctamente la GPU.
if torch.cuda.is_available():        
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
#Utilizaremos el BERT pre entrenado para minusculas (Por eso, realizamos un preprocesamiento previo que hemos repetido en otros notebooks inicial del texto)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

#Luego, separaremos las oraciones en lo que se conoce como tokens. Cada palabra equivale a un token
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

100%|██████████| 231508/231508 [00:00<00:00, 1202106.73B/s]


Tokenize the first sentence:
['[CLS]', 'deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us', '[SEP]']


In [None]:
#Establecemos la cantidad maxima de palabras que puede tener un tweet. Con 128 nos aseguramos de que no haya ningun tweet que sea cortado
MAX_LEN = 128

In [None]:
#Se obtienen los respectivos id de cada una de las series de tokens que representa un texto.
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [None]:
#Se necesita que todos los id, tengan el tamaño maximo, asi que les aplicaremos padding en caso de ser necesario
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
# Es necesario crear una mascara para cada id
attention_masks = []

# De igual manera, es necesario aplicar padding de ser necesario
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [None]:
# Particion del test en set de entrenamiento y de validacion

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [None]:
# Ultima conversion de la data para que sea utilizada por BERT

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [None]:
# Los autores recomiendan que esta variable sea 16 o 32
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.cuda()

100%|██████████| 407873900/407873900 [00:10<00:00, 38327086.36B/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [None]:
#Hiperparametros posibles y la obtencion de los optimos
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [None]:
# optimizer tiene toda la informacion requerida sobre los hiperparametros
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

t_total value of -1 results in schedule not being applied


In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
t = [] 

# Otra variable que los autores recomiendan que este entre 2 y 4
epochs = 4


for _ in trange(epochs, desc="Epoch"):
  
  
  # Entrenamiento
  
  # Es necesario setear el modelo como entrenamiento
  model.train()
  
  # Variables de seguimiento
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  

  for step, batch in enumerate(train_dataloader):
    # Para utilizacion de GPU
    batch = tuple(t.to(device) for t in batch)
    # Desempacamos del iterador
    b_input_ids, b_input_mask, b_labels = batch
    #Con estas lineas, reseteamos el gradiente entre variables y podemos calcular la perdida de informacion al entrenar
    optimizer.zero_grad()
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())    
    loss.backward()
    # Actualizacion de la variable con hiperparametros
    optimizer.step()
    
    
    #Actualizacion de las variables de seguimiento
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # Validacion

  # Cambio de modo
  model.eval()

  # Nuevas variables de seguimiento 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluamos por cada una de las iteraciones que decidimos anteriormente
  for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    # Se recomienda la siguiente linea para optimizar tiempos
    with torch.no_grad():
      #Calculo de predicciones
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


Train loss: 0.5330948408259902


Epoch:  25%|██▌       | 1/4 [03:03<09:09, 183.26s/it]

Validation Accuracy: 0.8341346153846154
Train loss: 0.4078174358190492


Epoch:  50%|█████     | 2/4 [06:06<06:06, 183.16s/it]

Validation Accuracy: 0.8295272435897436
Train loss: 0.31618157583613726


Epoch:  75%|███████▌  | 3/4 [09:09<03:03, 183.10s/it]

Validation Accuracy: 0.8490584935897436
Train loss: 0.23244974531753118


Epoch: 100%|██████████| 4/4 [12:11<00:00, 182.96s/it]

Validation Accuracy: 0.8409455128205128





In [None]:
df_test
#Al igual que en otros notebooks, preprocesamiento previo
# Quitamos las urls
df_test['text'] = df_test['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)

# Quitamos user mentions, signos de puntuación, hashtags y stopwords.
def clean_text(text):
    words = text.lower().split(' ')
    text = ' '.join([word for word in words if not word.startswith('@') and word not in stopwords.words('english')])
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df_test['text'] = df_test['text'].apply(clean_text)

In [None]:


#Realizamos el mismo preprocesamiento
sentences = df_test.text.values

#Sin embargo, por razones obvias, no vamos a obtener los labels
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]


MAX_LEN = 128


input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []


for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
  
batch_size = 32  


prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# Modelo en modo evaluacion
model.eval()

#Variable para guardar las predicciones
predictions = []

# Predecir
for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask = batch
  with torch.no_grad():
    #Calculo de las predicciones
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  logits = logits.detach().cpu().numpy()
  
  #Guardamos las predicciones
  predictions.append(logits)

In [None]:
#Las predicciones no se guardan de forma "normal" en un vector, asi que es necesario reacomodarlas
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
len(flat_predictions)

3263

In [None]:
df_test['target'] = flat_predictions
df_test

Unnamed: 0,id,text,target
0,0,happened terrible car crash,1
1,2,heard earthquake different cities stay safe everyone,1
2,3,forest fire spot pond geese fleeing across street cannot save,1
3,9,apocalypse lighting spokane wildfires,1
4,11,typhoon soudelor kills 28 china taiwan,1
...,...,...,...
3258,10861,earthquake safety los angeles ûò safety fasteners xrwn,1
3259,10865,storm ri worse last hurricane cityamp3others hardest hit yard looks like bombed around 20000k still without power,1
3260,10868,green line derailment chicago,1
3261,10874,meg issues hazardous weather outlook hwo,1


In [None]:
df_test[['id', 'target']].to_csv('bert.csv',index=False)