# Train Sentiment Analysis Model

In [1]:
# importando bibliotecas

import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout
from keras.initializers import Constant

2023-11-25 15:00:18.240136: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [157]:
# importando bibliotecas

from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import SparseCategoricalCrossentropy

from sklearn.model_selection import train_test_split

In [158]:
# importando bibliotecas

import tensorflow as tf
from transformers import (
    TFBertModel,
    BertTokenizer,
    TFAutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
from datasets import Dataset, DatasetDict

In [160]:
# leitura e tratamento dos dados

sentimental_analysis_data = pd.read_csv("twitter_training.csv")

sentimental_analysis_data.columns = ["id", "palavra-chave", "sentimento", "tweet"]

sentimental_analysis_data.dropna(inplace=True)

sentimental_analysis_data = sentimental_analysis_data[["tweet", "sentimento"]]

In [164]:
# mapeando as variáveis de interesse, transformando de categóricas para numéricas (tipo interpretável pelo BERT)

mapeamento = {"Positive": 0, "Neutral": 1, "Negative": 2, "Irrelevant": 1}
sentimental_analysis_data["sentimento"] = sentimental_analysis_data["sentimento"].map(mapeamento)
sentimental_analysis_data["sentimento"] = sentimental_analysis_data["sentimento"].astype(int)

# separando dataset de treino e teste

sentimental_analysis_train, sentimental_analysis_test = train_test_split(sentimental_analysis_data, test_size=0.1, random_state = 42)
sentimental_analysis_train.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentimento'] = data['sentimento'].map(mapping_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentimento'] = data['sentimento'].astype(int)


Unnamed: 0,tweet,sentimento
6416,People are so wicked in this life they will kn...,0
36396,IGP 26 times! This time it is @ superhys and @...,1
42946,<unk> shock to desi mom's this eid,1
19969,So I was doing the Lower Blackrock Spire with ...,1
14806,Yup og is my new word a f nigma,1
49538,I never spend my money on fifa points anyways.,2
8596,It's weird how Overwatch doesn't enable anymor...,2
13793,Me: hasn’t watched a single game this season ....,1
39242,"This is bullshit, says @PlayHearthstone <unk> ...",2
20876,@Wobblespurt and I had an amazing encounter. I...,0


In [167]:
# definindo a funcao tokenizadora

def tokenize_function(example):
    return tokenizer(example["tweet"], truncation=True)

In [166]:
# baixando o tokenzier do modelo padrão do BERT

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [168]:
# convertendo os datasets de treino e teste para o formato 'DatasetDict', amplamente utilizado em modelos de aprendizado

sentimental_analysis_train = Dataset.from_pandas(sentimental_analysis_train)
sentimental_analysis_test = Dataset.from_pandas(sentimental_analysis_test)

dataset = DatasetDict({"train": sentimental_analysis_train, "test": sentimental_analysis_test})

In [169]:
# tokenizando o dataset

sentimental_analysis_tokenized = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/66595 [00:00<?, ? examples/s]

Map:   0%|          | 0/7400 [00:00<?, ? examples/s]

In [170]:
# especificando o data collator para TensorFlow

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [171]:
# convertendo os dados para o formato TensorFlow
# aqui ja se especifica o tamanho do batch em 8

tf_train_dataset = sentimental_analysis_tokenized["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["sentimento"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = sentimental_analysis_tokenized["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["sentimento"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [174]:
# carregando o modelo base do BERT

model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=3
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [179]:
# especificando parâmetros importantes (learning rate, loss, métrica e número de épocas) e treinando o modelo

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ["accuracy"]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7db7089b5120>