In [None]:
!pip install transformers
!pip install dfply
!pip install xlsxwriter
!CUDA_LAUNCH_BLOCKING=1

In [None]:
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from textwrap import wrap
import random
from dfply import *
import os
import string
from sklearn.preprocessing import LabelEncoder
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Initialization: global settings and the raw data sheet.
# RANDOM_SEED seeds random / numpy / torch and the train/test splits below.
RANDOM_SEED = 40
# Token length passed to the tokenizer as max_length.
MAX_LEN = 80
# Number of examples per DataLoader batch.
BATCH_SIZE = 20
# Raw data: the "DATA" sheet of the workbook.
data = pd.read_excel('AMAZON_AUGMENTED.xlsx',engine='openpyxl',sheet_name = "DATA")
# Output sizes of the two classifier heads (sentiment and aspect).
NCLASSES_SENTIMENT = 5
NCLASSES_ASPECT = 20

In [None]:
# Encoder that turns the ASPECT strings into integer class ids.
LE = LabelEncoder()
# dfply pipeline: keep rows whose FILTER column equals 'No', then keep only
# the clause text and the two label columns.
df = data >> mask(X.FILTER=='No') >> select(X.CLAUSE, X.ASPECT, X.SENTIMENT) 
# Replace the ASPECT strings with their encoded integer ids.
df.ASPECT = (LE.fit_transform(df['ASPECT'])).astype(int)
df.SENTIMENT = df.SENTIMENT.astype(int)
# Seed every random number generator used in the notebook.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Tokenization example
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
sample_txt = 'I really loved that movie!'
# tokenize() yields the list of sub-word token strings. The previous code
# called tokenizer(..., return_tensors="tf"), which needs TensorFlow and
# returns an encoding object, not the token strings that
# convert_tokens_to_ids() expects.
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print('Frase: ', sample_txt)
print('Tokens: ', tokens)
print('Tokens numéricos: ', token_ids)

In [None]:
# Encode the sample sentence in the form the model consumes:
# fixed-length input ids plus an attention mask, as PyTorch tensors.
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length = MAX_LEN,          # same length used by the dataset and at inference
    truncation = True,
    add_special_tokens = True,
    return_token_type_ids = False,
    padding = 'max_length',        # replaces the deprecated pad_to_max_length=True
    return_attention_mask = True,
    return_tensors = 'pt'
)



In [None]:
# Dataset creation
class AMAZON(Dataset):
  """Map-style dataset that tokenizes one review clause per item.

  Args:
    reviews: Indexable collection of review texts.
    labels: Indexable collection of integer class ids, aligned with ``reviews``.
    tokenizer: Object exposing ``encode_plus``; used to encode each review.
    max_len: Length every encoded review is truncated / padded to.
  """

  def __init__(self,reviews,labels, tokenizer,max_len):
    self.reviews = reviews
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of reviews."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Return the raw text, encoded tensors and label of example ``item``.

    Returns:
      dict with keys ``'review'`` (str), ``'input_ids'`` and
      ``'attention_mask'`` (flattened tensors from the tokenizer) and
      ``'labels'`` (``torch.long`` tensor).
    """
    review = str(self.reviews[item])
    labels = self.labels[item]
    # Use the tokenizer given to the constructor; the previous code read the
    # notebook-global `tokenizer` and ignored the one stored on the instance.
    encoding = self.tokenizer.encode_plus(
        review,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        padding = 'max_length',  # replaces the deprecated pad_to_max_length=True
        return_attention_mask = True,
        return_tensors = 'pt'
        )
    return {
          'review': review,
          # The tokenizer returns a leading batch axis; flatten() drops it.
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': torch.tensor(labels, dtype=torch.long)
      }

In [None]:
# Data loaders
def data_loader_aspect(df, tokenizer, max_len, batch_size):
  """Build a DataLoader over (CLAUSE, ASPECT) pairs of ``df``.

  Args:
    df: DataFrame with ``CLAUSE`` and ``ASPECT`` columns.
    tokenizer: Tokenizer forwarded to the ``AMAZON`` dataset.
    max_len: Token length forwarded to the dataset.
    batch_size: Number of examples per batch.

  Returns:
    A ``DataLoader`` using 4 worker processes.
  """
  dataset = AMAZON(
      reviews = df.CLAUSE.to_numpy(),
      labels = df.ASPECT.to_numpy(),
      tokenizer = tokenizer,
      # Honour the arguments; the previous code used the MAX_LEN / BATCH_SIZE
      # globals and ignored what the caller passed.
      max_len = max_len
  )
  return DataLoader(dataset, batch_size = batch_size, num_workers = 4)

def data_loader_sentiment(df, tokenizer, max_len, batch_size):
  """Build a DataLoader over (CLAUSE, SENTIMENT) pairs of ``df``.

  Args:
    df: DataFrame with ``CLAUSE`` and ``SENTIMENT`` columns.
    tokenizer: Tokenizer forwarded to the ``AMAZON`` dataset.
    max_len: Token length forwarded to the dataset.
    batch_size: Number of examples per batch.

  Returns:
    A ``DataLoader`` using 4 worker processes.
  """
  dataset = AMAZON(
      reviews = df.CLAUSE.to_numpy(),
      labels = df.SENTIMENT.to_numpy(),
      tokenizer = tokenizer,
      # Honour the arguments; the previous code used the MAX_LEN / BATCH_SIZE
      # globals and ignored what the caller passed.
      max_len = max_len
  )
  return DataLoader(dataset, batch_size = batch_size, num_workers = 4)

In [None]:
# Hold out 1% of the aspect data for validation, then wrap both parts in loaders.
df_train, df_test = train_test_split(
    df,
    test_size=0.01,
    random_state=RANDOM_SEED,
)
train_data_loader = data_loader_aspect(
    df=df_train, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE
)
test_data_loader = data_loader_aspect(
    df=df_test, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE
)



In [None]:
# The model
class ROBERTA(nn.Module):
  """Pretrained encoder followed by dropout and a linear classification head.

  Args:
    n_classes: Number of output logits produced by the linear head.
  """

  def __init__(self, n_classes):
    super().__init__()
    # Encoder loaded from the checkpoint named by the global `model_name`.
    self.bert = AutoModel.from_pretrained(model_name, return_dict=False)
    self.drop = nn.Dropout(p=0.3)
    self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the class logits for a batch of encoded inputs."""
    # The encoder is asked for a plain tuple; only its second element is used.
    # NOTE(review): presumably this is the pooled sentence representation — confirm.
    _sequence_output, pooled_output = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=False,
    )
    return self.linear(self.drop(pooled_output))

In [None]:
# Aspect classifier, created and moved to the selected device.
model = ROBERTA(NCLASSES_ASPECT).to(device)

In [None]:
# Training setup for the aspect model: optimizer, linear LR decay, loss.
EPOCHS = 20
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    correct_bias=False,
)
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one training pass over ``data_loader``.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches with ``'input_ids'``,
      ``'attention_mask'`` and ``'labels'`` tensors.
    loss_fn: Callable applied to ``(logits, labels)``.
    optimizer: Optimizer stepped once per batch.
    device: Device the batch tensors are moved to.
    scheduler: Learning-rate scheduler stepped once per batch.
    n_examples: Divisor used to turn the correct count into an accuracy.

  Returns:
    Tuple ``(accuracy, mean_loss)``: accuracy as a double tensor, mean of the
    per-batch loss values as computed by ``np.mean``.
  """
  model = model.train()
  batch_losses = []
  n_correct = 0
  for batch in data_loader:
    ids = batch['input_ids'].to(device)
    attn = batch['attention_mask'].to(device)
    targets = batch['labels'].to(device)

    logits = model(input_ids=ids, attention_mask=attn)
    # Index of the largest logit per row is the predicted class.
    predicted = torch.max(logits, dim=1)[1]
    batch_loss = loss_fn(logits, targets)

    n_correct += torch.sum(predicted == targets)
    batch_losses.append(batch_loss.item())

    batch_loss.backward()
    # Clip the global gradient norm to 1.0 before the update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

def eval_model(model, data_loader, loss_fn_aspect, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` without updating its weights.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches with ``'input_ids'``,
      ``'attention_mask'`` and ``'labels'`` tensors.
    loss_fn_aspect: Loss callable applied to ``(logits, labels)``. The name is
      kept for backward compatibility; it is used for whichever task is
      being evaluated.
    device: Device the batch tensors are moved to.
    n_examples: Divisor used to turn the correct count into an accuracy.

  Returns:
    Tuple ``(accuracy, mean_loss)``: accuracy as a double tensor, mean of the
    per-batch loss values as computed by ``np.mean``.
  """
  model = model.eval()
  losses = []
  correct_predictions = 0
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      outputs = model(input_ids = input_ids, attention_mask = attention_mask)
      _, preds = torch.max(outputs, dim = 1)
      # Use the loss passed by the caller. The previous code read the
      # notebook-global `loss_fn` and silently ignored this argument.
      loss = loss_fn_aspect(outputs, labels)
      correct_predictions += torch.sum(preds == labels)
      losses.append(loss.item())
  return correct_predictions.double()/n_examples,np.mean(losses)

In [None]:
# Fine-tune the aspect model: each epoch runs one training pass followed by
# one evaluation pass on the held-out split, then prints both metric pairs.
for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  train_acc, train_loss = train_model(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train))
  test_acc, test_loss = eval_model(model, test_data_loader, loss_fn, device, len(df_test))
  print('Entrenamiento: accuracy aspect: {}, loss aspect: {}'.format(train_acc, train_loss))
  print('Validación: accuracy aspect: {}, loss aspect: {}'.format(test_acc, test_loss))
  print('')

Epoch 1 de 20
------------------
Entrenamiento: accuracy aspect: 0.872138130371948, loss aspect: 0.5345372303040828
Validación: accuracy aspect: 0.9097744360902256, loss aspect: 0.4355545810290745

Epoch 2 de 20
------------------
Entrenamiento: accuracy aspect: 0.9435612687305089, loss aspect: 0.23707208533680751
Validación: accuracy aspect: 0.9248120300751879, loss aspect: 0.4058194841657366

Epoch 3 de 20
------------------
Entrenamiento: accuracy aspect: 0.9591541796607591, loss aspect: 0.1716059814469102
Validación: accuracy aspect: 0.9398496240601504, loss aspect: 0.3832205750181207

Epoch 4 de 20
------------------
Entrenamiento: accuracy aspect: 0.9711721305240739, loss aspect: 0.12857731364102679
Validación: accuracy aspect: 0.9323308270676691, loss aspect: 0.38215695920267273

Epoch 5 de 20
------------------
Entrenamiento: accuracy aspect: 0.9809842549631095, loss aspect: 0.08750036747055412
Validación: accuracy aspect: 0.9323308270676691, loss aspect: 0.3882379132722105

Ep

In [None]:
# Sentiment data: rows whose FILTER is 'No' and whose ASPECT is not the
# string 'None', keeping only the clause text and its sentiment label.
df2 = data >> mask(X.FILTER=='No', X.ASPECT != 'None') >> select(X.CLAUSE, X.SENTIMENT) 
df2.SENTIMENT = df2.SENTIMENT.astype(int)

In [None]:
# Hold out 1% of the sentiment data for validation, then wrap both parts in loaders.
df_train_sentiment, df_test_sentiment = train_test_split(
    df2,
    test_size=0.01,
    random_state=RANDOM_SEED,
)
train_data_loader_sentiment = data_loader_sentiment(
    df=df_train_sentiment, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE
)
test_data_loader_sentiment = data_loader_sentiment(
    df=df_test_sentiment, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE
)



In [None]:
# Sentiment classifier, created and moved to the selected device.
model_sentiment = ROBERTA(NCLASSES_SENTIMENT).to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Training setup for the sentiment model: optimizer, linear LR decay, loss.
EPOCHS = 20
optimizer_sentiment = AdamW(
    model_sentiment.parameters(),
    lr=2e-5,
    correct_bias=False,
)
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps_sentiment = len(train_data_loader_sentiment) * EPOCHS
scheduler_sentiment = get_linear_schedule_with_warmup(
    optimizer_sentiment,
    num_warmup_steps=0,
    num_training_steps=total_steps_sentiment,
)
loss_fn_sentiment = nn.CrossEntropyLoss().to(device)



In [None]:
# Fine-tune the sentiment model: each epoch runs one training pass followed by
# one evaluation pass on the held-out split, then prints both metric pairs.
for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  train_acc_sentiment, train_loss_sentiment = train_model(model_sentiment, train_data_loader_sentiment, loss_fn_sentiment, optimizer_sentiment, device, scheduler_sentiment, len(df_train_sentiment))
  test_acc_sentiment, test_loss_sentiment = eval_model(model_sentiment, test_data_loader_sentiment, loss_fn_sentiment, device, len(df_test_sentiment))
  print('Entrenamiento: accuracy sentiment: {}, loss sentiment: {}'.format(train_acc_sentiment, train_loss_sentiment))
  print('Validación: accuracy sentiment: {}, loss sentiment: {}'.format(test_acc_sentiment, test_loss_sentiment))
  print('')

Epoch 1 de 20
------------------




Entrenamiento: accuracy sentiment: 0.7445378151260503, loss sentiment: 0.7204540968179702
Validación: accuracy sentiment: 0.7795275590551181, loss sentiment: 0.6849342414311

Epoch 2 de 20
------------------




Entrenamiento: accuracy sentiment: 0.8436174469787915, loss sentiment: 0.4366564678788185
Validación: accuracy sentiment: 0.8110236220472441, loss sentiment: 0.6734295615128109

Epoch 3 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9026010404161664, loss sentiment: 0.3065093619301915
Validación: accuracy sentiment: 0.8267716535433071, loss sentiment: 0.5955478101968765

Epoch 4 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9314925970388155, loss sentiment: 0.2358030985824764
Validación: accuracy sentiment: 0.8188976377952756, loss sentiment: 0.6447223871946335

Epoch 5 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9532613045218087, loss sentiment: 0.18690072932317853
Validación: accuracy sentiment: 0.8267716535433071, loss sentiment: 0.7473647509302411

Epoch 6 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9639055622248899, loss sentiment: 0.15102723029628395
Validación: accuracy sentiment: 0.84251968503937, loss sentiment: 0.7591222205332347

Epoch 7 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9703881552621048, loss sentiment: 0.1263928088016808
Validación: accuracy sentiment: 0.8503937007874016, loss sentiment: 0.732872048658984

Epoch 8 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9791916766706682, loss sentiment: 0.10097943413034081
Validación: accuracy sentiment: 0.8582677165354331, loss sentiment: 0.6921847207205636

Epoch 9 de 20
------------------




Entrenamiento: accuracy sentiment: 0.983673469387755, loss sentiment: 0.0796889038673602
Validación: accuracy sentiment: 0.8267716535433071, loss sentiment: 1.0391279458999634

Epoch 10 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9855942376950779, loss sentiment: 0.06665221151858568
Validación: accuracy sentiment: 0.8346456692913385, loss sentiment: 1.1361306479998998

Epoch 11 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9891156462585033, loss sentiment: 0.05429909693077207
Validación: accuracy sentiment: 0.8188976377952756, loss sentiment: 1.1125679867608207

Epoch 12 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9923169267707083, loss sentiment: 0.04386779192341492
Validación: accuracy sentiment: 0.8188976377952756, loss sentiment: 1.1799651426928384

Epoch 13 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9929571828731492, loss sentiment: 0.03949775871837046
Validación: accuracy sentiment: 0.8503937007874016, loss sentiment: 1.1396439671516418

Epoch 14 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9939975990396158, loss sentiment: 0.03129373388316017
Validación: accuracy sentiment: 0.8503937007874016, loss sentiment: 1.0769110197600509

Epoch 15 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9956782713085234, loss sentiment: 0.02200654820947675
Validación: accuracy sentiment: 0.8503937007874016, loss sentiment: 1.1485004339899336

Epoch 16 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9968787515006001, loss sentiment: 0.01665233473383123
Validación: accuracy sentiment: 0.8503937007874016, loss sentiment: 1.2288758201258523

Epoch 17 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9976790716286514, loss sentiment: 0.012422247086535209
Validación: accuracy sentiment: 0.8346456692913385, loss sentiment: 1.408441343477794

Epoch 18 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9979991996798718, loss sentiment: 0.009880025590927106
Validación: accuracy sentiment: 0.84251968503937, loss sentiment: 1.3799849397369794

Epoch 19 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9983993597438975, loss sentiment: 0.008867073669307866
Validación: accuracy sentiment: 0.8346456692913385, loss sentiment: 1.4381810895034246

Epoch 20 de 20
------------------




Entrenamiento: accuracy sentiment: 0.9991196478591435, loss sentiment: 0.004103499914502026
Validación: accuracy sentiment: 0.8346456692913385, loss sentiment: 1.479892196399825



In [None]:
def Aspect_Class(preds_aspect):
    """Translate a predicted aspect id into its aspect name.

    Args:
        preds_aspect: Value compared with ``==`` against the ids 0..18, in
            ascending order.

    Returns:
        A zero-dimensional ``np.ndarray`` holding the aspect name. Any value
        that matches none of 0..18 yields ``'Waterproof'``.
    """
    aspect_names = (
        'Alarm',
        'App',
        'Battery Life',
        'Calories Burned',
        'Clock Face',
        'Connect',
        'Distance Tracking',
        'Heart Rate Tracking',
        'Mobile App',
        'None',
        'Notifications',
        'Price',
        'Reminders',
        'Screen',
        'Sleep Tracker',
        'Step Counter',
        'Swimproof + Swim tracking',
        'Syncing',
        'Watch Face',
        'Waterproof',
    )
    # Test ids 0..18 one by one; the last name is the fallback for everything else.
    for aspect_id, aspect_name in enumerate(aspect_names[:-1]):
        if preds_aspect == aspect_id:
            return np.array(aspect_name)
    return np.array(aspect_names[-1])

In [None]:
def classifySentiment(review_text):
  """Predict the aspect name and the sentiment id of one review clause.

  Reads the notebook globals ``tokenizer``, ``model``, ``model_sentiment``,
  ``device`` and ``MAX_LEN``.

  Args:
    review_text: Clause to classify. It is converted with ``str()`` first,
      matching what the ``AMAZON`` dataset does with training texts.

  Returns:
    Tuple ``(aspect, sentiment)``: the result of ``Aspect_Class`` for the
    predicted aspect id, and an integer numpy array with the predicted
    sentiment id.
  """
  encoding_review = tokenizer.encode_plus(
      str(review_text),
      max_length = MAX_LEN,
      truncation = True,
      add_special_tokens = True,
      return_token_type_ids = False,
      padding = 'max_length',  # replaces the deprecated pad_to_max_length=True
      return_attention_mask = True,
      return_tensors = 'pt')
  input_ids = encoding_review['input_ids'].to(device)
  attention_mask = encoding_review['attention_mask'].to(device)
  # Inference only: force eval mode so dropout is off regardless of what ran
  # before, and skip autograd bookkeeping to save memory and time.
  model.eval()
  model_sentiment.eval()
  with torch.no_grad():
    outputs_aspect = model(input_ids = input_ids, attention_mask = attention_mask)
    outputs_sentiment = model_sentiment(input_ids = input_ids, attention_mask = attention_mask)
  _, preds_aspect = torch.max(outputs_aspect, dim = 1)
  _, preds_sentiment = torch.max(outputs_sentiment, dim = 1)
  return Aspect_Class(preds_aspect), np.array(preds_sentiment.cpu()).astype(int)

In [None]:
# Classify every clause of the user-review sheet with both models.
USERS = pd.read_excel(
    'AMAZON_AUGMENTED.xlsx',
    engine='openpyxl',
    sheet_name="DATA",
)
USERS.SENTIMENT = USERS.SENTIMENT.astype(int)
# One (aspect, sentiment) tuple per row, looked up by row label 0..len-1.
result = [classifySentiment(USERS.CLAUSE[i]) for i in range(len(USERS))]
pred = pd.DataFrame(result, columns=['PRED_ASPECT', 'PRED_SENTIMENT'])

In [None]:
# Attach the prediction columns next to the original rows.
# NOTE(review): this cell is not idempotent — USERS is rebuilt from itself, so
# re-running it concatenates `pred` again and adds 1 to SENTIMENT again.
USERS = pd.concat([USERS, pred], axis=1)
# Shift both predicted and reference sentiment ids up by one.
# NOTE(review): presumably converts 0-based class ids to a 1-based scale — confirm.
USERS['PRED_SENTIMENT'] = USERS.PRED_SENTIMENT.astype(int) + 1
USERS.SENTIMENT = USERS.SENTIMENT + 1

In [None]:
# Classify every clause of the manufacturer sheet; only the sentiment is kept.
MANUFACTURER = pd.read_excel(
    'MANUFACTURER.xlsx',
    engine='openpyxl',
    sheet_name="DATA",
)
# One (aspect, sentiment) tuple per row, looked up by row label 0..len-1.
result = [classifySentiment(MANUFACTURER.CLAUSE[i]) for i in range(len(MANUFACTURER))]
pred = pd.DataFrame(result, columns=['PRED_ASPECT', 'PRED_SENTIMENT'])
# Store the predicted sentiment id plus one as the SENTIMENT column.
MANUFACTURER['SENTIMENT'] = pred.PRED_SENTIMENT.astype(int) + 1

In [None]:
# Write both result tables into one workbook, one sheet each.
# The context manager saves and closes the file even if a to_excel call
# raises; the previous explicit close() was skipped in that case.
with pd.ExcelWriter('AMAZON.xlsx', engine='xlsxwriter') as writer:
  USERS.to_excel(writer,        sheet_name = 'USERS',        index = False, header=True)
  MANUFACTURER.to_excel(writer, sheet_name = 'MANUFACTURER', index = False, header=True)