<a href="https://colab.research.google.com/github/cristobalvch/BERT-Transformers/blob/main/BERT_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
from google.colab import files

# Mount Google Drive so that files under /content/gdrive are reachable from this runtime.
drive.mount('/content/gdrive')

# Open the upload widget for the tweet corpus. The returned {filename: bytes} mapping is bound
# to a name so the raw file contents are not echoed as the cell output.
uploaded = files.upload()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Saving politics-test-tagged.xml to politics-test-tagged (2).xml


{'politics-test-tagged.xml': b'<?xml version="1.0" encoding="UTF-8"?>\n<tweets>\n <tweet>\n  <tweetid>137228516625367040</tweetid>\n  <user>TonyKrdniosa</user>\n  <content>"@marianorajoy: En Espa\xc3\xb1a las cosas se pueden, se deben y se van a hacer infinitamente mejor que estos \xc3\xbaltimos 4 a\xc3\xb1os" Eso son soluciones!!</content>\n  <date>2011-10-17T19:00:02</date>\n  <lang>es</lang>\n  <sentiments>\n   <polarity>\n    <value>P</value>\n    <type>AGREEMENT</type>\n   </polarity>\n   <polarity>\n    <entity source="PP">@marianorajoy</entity>\n    <value>P</value>\n    <type>AGREEMENT</type>\n   </polarity>\n  </sentiments>\n  <topics>\n   <topic>pol\xc3\xadtica</topic>\n  </topics>\n </tweet>\n <tweet>\n  <tweetid>137228522019229697</tweetid>\n  <user>elhijodelapepa</user>\n  <content>En PSO\xe2\x82\xac el que no corre vuela, todav\xc3\xada caliente el cad\xc3\xa1ver pol\xc3\xadtico de ZP y Rubalcaba y la PANtumaca buscando hueco #votaPP #sumatealcambio</content>\n  <date>201

In [None]:
# Install the Hugging Face `transformers` package used for the tokenizer and the BERT encoder.
# NOTE(review): the version is unpinned, and the import cell pulls `AdamW` from `transformers`,
# which newer releases deprecate — consider pinning a known-good version.
!pip install transformers



In [None]:
import pandas as pd
import numpy as np
import torch
import re
import xml.etree.ElementTree as et

from transformers import BertModel,BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from textwrap import wrap
import matplotlib.pyplot as plt

In [None]:
# Parse the tagged tweet corpus and build one row per <tweet>: its text plus a polarity label.
xtree = et.parse('politics-test-tagged.xml')
xroot = xtree.getroot()

df_cols = ['content','values']
rows = []
for node in xroot:
  s_content = node.find('content').text
  try:
    # Positional lookup: 6th child of the tweet -> its 2nd child -> that element's 2nd child.
    # Elements are indexed directly instead of through Element.getchildren(), which was
    # removed in Python 3.9 (there it raises AttributeError, which is not caught below).
    s_value = node[5][1][1].text
  except IndexError:
    # The nested element is missing at one of the levels: store a placeholder label.
    s_value = 'NaN'

  rows.append({'content':s_content,'values':s_value})

# Build the frame and shuffle the rows (sample(frac=1) returns every row in random order).
df_text = pd.DataFrame(rows,columns=df_cols).sample(frac=1).reset_index(drop=True)

In [None]:
# Keep only the tweets labelled with one of the three target polarities, then renumber rows.
valid_polarities = ['NEU','N','P']
has_valid_polarity = df_text['values'].isin(valid_polarities)
df_text = df_text.loc[has_valid_polarity].reset_index(drop=True)

In [None]:
# Integer-encode the polarity label through pandas' categorical codes.
polarity_categories = df_text['values'].astype('category')
df_text['encode'] = polarity_categories.cat.codes

In [None]:
# Strip @mentions / #hashtags first, then every remaining character that is neither a word
# character nor whitespace.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the text untouched.
df_text['content'] = (
    df_text['content']
    .str.replace(r"[@|#]\w+", "", regex=True)
    .str.replace(r"[^\w\s]", "", regex=True)
)

In [None]:
# Initialization: shared hyperparameters, RNG seeding and compute-device selection.
RANDOM_SEED = 42   # seed handed to NumPy and PyTorch below
MAX_LEN = 100      # maximum length passed to the tokenizer
BATCH_SIZE = 20    # batch size used for the data loaders
NCLASSES = 3       # number of sentiment classes

# Seed both random number generators with the same value.
for seed_fn in (np.random.seed, torch.manual_seed):
  seed_fn(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Identifier of the pretrained checkpoint; the same name is used again when the BERT encoder
# is instantiated inside the classifier.
PRE_TRAINED_MODEL_NAME = 'dccuchile/bert-base-spanish-wwm-cased'
# Load the tokenizer that belongs to that checkpoint.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Tokenization example: split a sample sentence into tokens and map them to vocabulary ids.
sample_txt = 'Hola a Todos'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# Show the sentence, its tokens and the numeric ids side by side.
print('Frase: ', sample_txt)
print('Tokens: ', tokens)
print('Tokens numéricos: ', token_ids)

Frase:  Hola a Todos
Tokens:  ['Hola', 'a', 'Todos']
Tokens numéricos:  [1894, 1013, 2906]


In [None]:
# Encode the sample sentence in the format fed to BERT: ids padded/truncated to MAX_LEN plus
# the matching attention mask, both returned as PyTorch tensors.
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length = MAX_LEN,
    truncation = True,
    add_special_tokens = True,
    return_token_type_ids = False,
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True` and matches the
    # other encode_plus calls in this notebook.
    padding = 'max_length',
    return_attention_mask = True,
    return_tensors = 'pt'
)

encoding.keys()



dict_keys(['input_ids', 'attention_mask'])

In [None]:

# DATASET CREATION

class IMDBDataset(Dataset):
  """Map-style dataset that tokenizes one text per item for a BERT classifier.

  Args:
    reviews: indexable collection of texts.
    labels: indexable collection of integer class labels, aligned with `reviews`.
    tokenizer: object exposing `encode_plus(text, **kwargs)`; used for every item.
    max_len: length every encoded sequence is padded / truncated to.
  """

  def __init__(self,reviews,labels,tokenizer,max_len):
    self.reviews = reviews
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Return the raw text, its encoded tensors and its label for position `item`."""
    review = str(self.reviews[item])
    label = self.labels[item]
    # Use the tokenizer given to the constructor rather than a module-level global, so the
    # dataset honours its own argument and works without any global state.
    encoding = self.tokenizer.encode_plus(
        review,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
        )

    return {
          'review': review,
          # flatten() turns the tokenizer's tensors into 1-D tensors for this single item.
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'label': torch.tensor(label, dtype=torch.long)
      }

In [None]:
# Data loader:
def data_loader(df, tokenizer, max_len, batch_size):
  """Wrap a dataframe in an IMDBDataset and return a DataLoader over it.

  Args:
    df: dataframe with a `content` column (texts) and an `encode` column (integer labels).
    tokenizer: tokenizer forwarded to the dataset.
    max_len: padded / truncated sequence length forwarded to the dataset.
    batch_size: number of items per batch.

  Returns:
    A DataLoader yielding batches from the dataset, using 4 worker processes.
  """
  dataset = IMDBDataset(
      reviews = df.content.to_numpy(),
      labels = df.encode.to_numpy(),
      tokenizer = tokenizer,
      # Honour the arguments instead of the module-level MAX_LEN / BATCH_SIZE constants.
      max_len = max_len
  )

  return DataLoader(dataset, batch_size = batch_size, num_workers = 4)

In [None]:
# Hold out 30% of the rows for evaluation; random_state seeds the split with RANDOM_SEED.
df_train, df_test = train_test_split(df_text, test_size = 0.3, random_state=RANDOM_SEED)

# Wrap each split in a DataLoader built by data_loader().
train_data_loader = data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# Display the held-out split for a quick visual check.
df_test

Unnamed: 0,content,values,encode
1532,La Dependencia no es que no sea viablesino qu...,N,0
1214,Visca Catalunya socialista crida el CCIB Tot a...,N,0
351,El que representa a las personas y no a la...,P,2
420,mi primer compromiso es defender la igualdad ...,NEU,1
1392,httptcoPLkM5UWL se presenta como diputado de ...,P,2
...,...,...,...
1509,Valderas pide el voto a toda la izquierda alte...,P,2
208,Soy tetrapléjica la Dependencia no es un lujo...,NEU,1
1272,Chacón urge a votar el domingo para evitar la...,N,0
1547,bajo un 5 el sueldo de profesores y un 8 el p...,P,2


In [None]:
# THE MODEL!

class BERTSentimentClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and one linear classification layer.

  Args:
    n_classes: number of output classes (size of the last dimension of the logits).
  """

  def __init__(self, n_classes):
    super(BERTSentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    # One dropout + one linear layer. Assigning `self.drop` / `self.linear` several times in
    # a row only ever keeps the last pair, so just that effective head (p=0.1,
    # hidden_size -> classes) is declared here.
    self.drop = nn.Dropout(p=0.1)
    # Size the output from the constructor argument rather than a module-level constant.
    self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the class logits for a batch of encoded texts."""
    bert_outputs = self.bert(
        input_ids = input_ids,
        attention_mask = attention_mask
    )
    # Take the second output positionally. Integer indexing works both for a plain tuple and
    # for the ModelOutput objects returned by newer transformers releases, whereas
    # tuple-unpacking a ModelOutput would yield its keys instead of tensors.
    cls_output = bert_outputs[1]
    drop_output = self.drop(cls_output)
    output = self.linear(drop_output)
    return output

In [None]:
# Instantiate the classifier with NCLASSES outputs and move its parameters to the selected device.
model = BERTSentimentClassifier(NCLASSES)
model = model.to(device)

In [None]:
# TRAINING
EPOCHS = 30
# torch.optim.AdamW is used in place of the deprecated transformers.AdamW; weight_decay=0.0
# keeps that implementation's default of no weight decay.
# The learning rate is 2e-5: a rate of 5e-4 is far above the range normally used to fine-tune
# BERT and left the accuracy close to chance level.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS
# Linear decay of the learning rate to zero over total_steps, without warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
# Training iteration
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one optimisation pass over `data_loader`.

  For every batch: forward pass, loss, backward pass, gradient-norm clipping at 1.0,
  optimizer step, scheduler step and gradient reset.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by `n_examples`
    (a double tensor) and the NumPy mean of the per-batch loss values.
  """
  model = model.train()
  batch_losses = []
  n_correct = 0
  for batch in data_loader:
    ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)
    targets = batch['label'].to(device)

    logits = model(input_ids = ids, attention_mask = mask)
    # Predicted class = index of the largest logit in each row.
    predicted = torch.max(logits, dim=1).indices
    batch_loss = loss_fn(logits, targets)

    n_correct = n_correct + (predicted == targets).sum()
    batch_losses.append(batch_loss.item())

    batch_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` on every batch of `data_loader` without tracking gradients.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by `n_examples`
    (a double tensor) and the NumPy mean of the per-batch loss values.
  """
  model = model.eval()
  batch_losses = []
  n_correct = 0
  with torch.no_grad():
    for batch in data_loader:
      ids = batch['input_ids'].to(device)
      mask = batch['attention_mask'].to(device)
      targets = batch['label'].to(device)

      logits = model(input_ids = ids, attention_mask = mask)
      # Predicted class = index of the largest logit in each row.
      predicted = torch.max(logits, dim=1).indices
      batch_loss = loss_fn(logits, targets)

      n_correct = n_correct + (predicted == targets).sum()
      batch_losses.append(batch_loss.item())

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
# Training!!!

# Per-epoch history of accuracy and loss for both splits.
train_accuracy = []
test_accuracy = []

training_loss = []
testing_loss = []

for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  train_acc, train_loss = train_model(
      model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
  )
  test_acc, test_loss = eval_model(
      model, test_data_loader, loss_fn, device, len(df_test)
  )

  train_accuracy.append(train_acc)
  test_accuracy.append(test_acc)
  training_loss.append(train_loss)
  # Record the validation LOSS here (not the accuracy). It is stored as a 0-dim tensor so the
  # entries support both float() and .cpu().numpy(), like the accuracy entries do.
  testing_loss.append(torch.tensor(test_loss))
  print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss, train_acc))
  print('Validación: Loss: {}, accuracy: {}'.format(test_loss, test_acc))
  print('')

Epoch 1 de 30
------------------
Entrenamiento: Loss: 1.441317998445951, accuracy: 0.3654743390357698
Validación: Loss: 1.170948054109301, accuracy: 0.25

Epoch 2 de 30
------------------
Entrenamiento: Loss: 1.1542430721796475, accuracy: 0.38413685847589424
Validación: Loss: 1.1314397496836526, accuracy: 0.3278985507246377

Epoch 3 de 30
------------------
Entrenamiento: Loss: 1.136483657360077, accuracy: 0.3864696734059098
Validación: Loss: 1.1091142062629973, accuracy: 0.3278985507246377

Epoch 4 de 30
------------------
Entrenamiento: Loss: 1.139640030494103, accuracy: 0.3849144634525661
Validación: Loss: 1.1637503313166755, accuracy: 0.4221014492753623

Epoch 5 de 30
------------------
Entrenamiento: Loss: 1.1026835083961486, accuracy: 0.4144634525660964
Validación: Loss: 1.1400513329676218, accuracy: 0.3278985507246377

Epoch 6 de 30
------------------
Entrenamiento: Loss: 1.1228260938937848, accuracy: 0.39580093312597203
Validación: Loss: 1.142836915595191, accuracy: 0.422101449

In [None]:
# Convert the per-epoch metrics to plain Python floats for plotting.
# float() accepts 0-dim tensors as well as NumPy / Python scalars, so the same conversion is
# used for all four histories regardless of how their entries were stored.
train_ac = [float(acc) for acc in train_accuracy]
train_ln = [float(loss) for loss in training_loss]

test_ac = [float(acc) for acc in test_accuracy]
test_ln = [float(loss) for loss in testing_loss]

In [None]:
# Accuracy per epoch: training (blue) against validation (red).
ep = list(range(1,EPOCHS+1))
plt.plot(ep,train_ac,color='b',label='train')
plt.plot(ep,test_ac,color='r',label='validation')
# Label the figure so it can be read on its own.
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy per epoch')
plt.legend();

In [None]:
# Loss per epoch: training (blue) against validation (red).
plt.plot(ep,train_ln,color='b',label='train')
plt.plot(ep,test_ln,color='r',label='validation')
# Label the figure so it can be read on its own.
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss per epoch')
plt.legend();

In [None]:
def classifySentiment(review_text):
  """Print the sentiment predicted for `review_text`.

  The text is encoded with the module-level `tokenizer`, scored by the module-level `model`
  on `device`, and the following is printed: the wrapped text, the predicted label
  (0 -> Negative, 1 -> Neutral, 2 -> Positive) and the raw model output.

  Side effect: switches `model` to evaluation mode.

  Args:
    review_text: the text to classify.
  """
  encoding_review = tokenizer.encode_plus(
    # Encode the argument itself (not the global example sentence).
    review_text,
    max_length = MAX_LEN,
    truncation = True,
    add_special_tokens = True,
    return_token_type_ids = False,
    padding = 'max_length',
    return_attention_mask = True,
    return_tensors = 'pt')

  input_ids = encoding_review['input_ids'].to(device)
  attention_mask = encoding_review['attention_mask'].to(device)

  # Inference only: disable dropout and skip gradient tracking.
  model.eval()
  with torch.no_grad():
    output = model(input_ids,attention_mask)
  _,prediction = torch.max(output,dim=1)

  print("\n".join(wrap(review_text)))
  sentiment_names = {0: "Negative", 1: "Neutral", 2: "Positive"}
  predicted_name = sentiment_names.get(prediction.item())
  # Any other class index prints no label.
  if predicted_name is not None:
    print(predicted_name)

  print("\n {}".format(output))