In [1]:
# Import modules
import pandas as pd
import numpy as np
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import  Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tqdm import tqdm
import matplotlib.pyplot as plt

print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)
pd.set_option('display.max_colwidth',1000)


TensorFlow Version: 2.7.0
Hub version:  0.12.0


## Data preprocessing

In [2]:
# https://www.kaggle.com/dataset/4af304c0f797e3b08f22895d6a0dcf95eee4c37f7a20775c7a4ee2281c6ba2ff
DATASET_PATH = "NoThemeTweets.csv"

In [3]:
dataset = pd.read_csv( 
         DATASET_PATH,
         engine="python", 
         encoding="latin1"
)

In [4]:
dataset.head()

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
0,1031761728445530112,@Tixaa23 14 para eu ir :),Tue Aug 21 04:35:39 +0000 2018,Positivo,:)
1,1031761040462278656,@drexalvarez O meu like eu jÃ¡ dei na Ã©poca :),Tue Aug 21 04:32:55 +0000 2018,Positivo,:)
2,1031760962372689920,Eu sÃ³ queria conseguir comer alguma coisa pra poder dormir :),Tue Aug 21 04:32:37 +0000 2018,Positivo,:)
3,1031760948250456066,:D que lindo dia !,Tue Aug 21 04:32:33 +0000 2018,Positivo,:)
4,1031760895985246208,"@Primo_Resmungao Pq da pr jeito!!Ã© uma ""oferta"", ha q aproveitar. :P",Tue Aug 21 04:32:21 +0000 2018,Positivo,:)


In [5]:
df = dataset[["tweet_text", "sentiment"]].copy()

In [6]:
df.isna().sum()

tweet_text    0
sentiment     0
dtype: int64

In [7]:
df.head()

Unnamed: 0,tweet_text,sentiment
0,@Tixaa23 14 para eu ir :),Positivo
1,@drexalvarez O meu like eu jÃ¡ dei na Ã©poca :),Positivo
2,Eu sÃ³ queria conseguir comer alguma coisa pra poder dormir :),Positivo
3,:D que lindo dia !,Positivo
4,"@Primo_Resmungao Pq da pr jeito!!Ã© uma ""oferta"", ha q aproveitar. :P",Positivo


In [8]:
def preprocess_text(text):
    
    # Not needed to be imported globally
    from bs4 import BeautifulSoup
    import re
    text = BeautifulSoup(text, "lxml").get_text()
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE) # Remove urls
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-Z.!?']", ' ', text)
    text = re.sub(r" +", ' ', text)
    
    return text

In [9]:
df["tweet_text"] = df["tweet_text"].apply(lambda text: preprocess_text(text))

In [33]:
df.rename(columns={"tweet_text":"text"}, inplace=True)

In [34]:
df.head()

Unnamed: 0,text,sentiment
134780,EU to com muita dor de cabe a j tomei rem dio e n o mudou nada aaaa eu preciso estudar mas n o consigo,Negativo
404943,faz mais de um m s que eu n o vejo a mel,Negativo
273057,called me foda isso mesmo.. meu pai que advogado.. tem processo que ta a fcking anos demorando...se fosse rapida essa justi a tudo seria melhor x e o pior que eu pago imposto e os cara parece que n o trabalha D,Positivo
617541,tamb m,Negativo
601454,Eu tamb m,Positivo


In [35]:
df.to_csv("data_clean.csv", index=False)

In [4]:
# reading clean data directly
df = pd.read_csv("data_clean.csv")

In [5]:
# Take a peek at the dataset
df["sentiment"].value_counts()

Negativo    522707
Positivo    263107
Name: sentiment, dtype: int64

In [6]:
df["sentiment"].value_counts(normalize=True)

Negativo    0.665179
Positivo    0.334821
Name: sentiment, dtype: float64

In [7]:
print(f"Shape of df : {df.shape}")

Shape of df : (785814, 2)


In [8]:
# Identify missing values
df.isnull().sum()

text         545
sentiment      0
dtype: int64

In [9]:
df.dropna(inplace=True)

In [10]:
df.shape

(785269, 2)

**Download token**

In [16]:
!rm -rf bert-base-portuguese-cased
!mkdir bert-base-portuguese-cased
!wget https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/bert-base-portuguese-cased_pytorch_checkpoint.zip
!wget https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/vocab.txt 


!unzip bert-base-portuguese-cased_pytorch_checkpoint.zip -d bert-base-portuguese-cased
!mv vocab.txt bert-base-portuguese-cased/vocab.txt 

--2022-02-15 00:28:42--  https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/bert-base-portuguese-cased_pytorch_checkpoint.zip
Resolving neuralmind-ai.s3.us-east-2.amazonaws.com (neuralmind-ai.s3.us-east-2.amazonaws.com)... 52.219.109.42
Connecting to neuralmind-ai.s3.us-east-2.amazonaws.com (neuralmind-ai.s3.us-east-2.amazonaws.com)|52.219.109.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 406220891 (387M) [application/zip]
Saving to: ‘bert-base-portuguese-cased_pytorch_checkpoint.zip.1’


2022-02-15 00:29:14 (12,4 MB/s) - ‘bert-base-portuguese-cased_pytorch_checkpoint.zip.1’ saved [406220891/406220891]

--2022-02-15 00:29:14--  https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/vocab.txt
Resolving neuralmind-ai.s3.us-east-2.amazonaws.com (neuralmind-ai.s3.us-east-2.amazonaws.com)... 52.219.88.160
Connecting to neuralmind-ai.s3.us-east-2.amazonaws.com (neuralmind-ai.s3.us-east-2.amazonaws.com)|52.21

In [23]:
from transformers import BertTokenizer, BertConfig, TFBertModel
bert_model = TFBertModel.from_pretrained("bert-base-portuguese-cased", from_pt=True)

2022-02-15 01:19:04.564751: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-15 01:19:04.619988: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory
2022-02-15 01:19:04.628474: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-02-15 01:19:04.628499: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required

In [18]:
# Functions for constructing BERT Embeddings: input_ids, input_masks, input_segments and Inputs
MAX_SEQ_LEN=250 # max sequence length

def get_masks(tokens):
    """Masks: 1 for real tokens and 0 for paddings"""
    return [1]*len(tokens) + [0] * (MAX_SEQ_LEN - len(tokens))
 
def get_segments(tokens):
    """Segments: 0 for the first sequence, 1 for the second"""  
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (MAX_SEQ_LEN - len(tokens))

def get_ids(tokens, tokenizer):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens,)
    input_ids = token_ids + [0] * (MAX_SEQ_LEN - len(token_ids))
    return input_ids

def create_single_input(sentence, tokenizer, max_len):
    """Create an input from a sentence"""
    stokens = tokenizer.tokenize(sentence)
    stokens = stokens[:max_len]
    stokens = ["[CLS]"] + stokens + ["[SEP]"]
 
    ids = get_ids(stokens, tokenizer)
    masks = get_masks(stokens)
    segments = get_segments(stokens)

    return ids, masks, segments
 
def convert_sentences_to_features(sentences, tokenizer):
    """Convert sentences to features: input_ids, input_masks and input_segments"""
    input_ids, input_masks, input_segments = [], [], []
 
    for sentence in tqdm(sentences,position=0, leave=True):
        ids,masks,segments=create_single_input(sentence,tokenizer,MAX_SEQ_LEN-2)
        assert len(ids) == MAX_SEQ_LEN
        assert len(masks) == MAX_SEQ_LEN
        assert len(segments) == MAX_SEQ_LEN
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)

    return [np.asarray(input_ids, dtype=np.int32), 
        np.asarray(input_masks, dtype=np.int32), 
        np.asarray(input_segments, dtype=np.int32)]

def create_tonkenizer(bert_layer):
    """Instantiate Tokenizer with vocab"""
    # vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy()
    # do_lower_case=bert_layer.resolved_object.do_lower_case.numpy() 
    # tokenizer=bert.bert_tokenization.FullTokenizer(vocab_file,do_lower_case)
    do_lower_case = False
    tokenizer = BertTokenizer("bert-base-portuguese-cased/vocab.txt", do_lower_case)
    return tokenizer

## Modelling

In [26]:
def nlp_model(callable_object):
    # Load the pre-trained BERT base model
    bert_layer = hub.KerasLayer(handle=callable_object, trainable=True)  


    #bert_layer = callable_object
   
    # BERT layer three inputs: ids, masks and segments
    input_ids = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="input_ids")           
    input_masks = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="input_masks")       
    input_segments = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="segment_ids")
    
    inputs = [input_ids, input_masks, input_segments] # BERT inputs
    # If using hub.KerasLayer, PLEASE, CHANGE THE ORDER of the variables, I mean: 
    #pooled_output, sequence_output = bert_layer(inputs)
    #sequence_output, pooled_output = bert_layer(inputs) # BERT outputs 
    
    bert_output = bert_model(inputs)
    sequence_output = bert_output.last_hidden_state
    pooled_output = bert_output.pooler_output
    
    print(pooled_output)
    # Add a hidden layer
    x = Dense(units=768, activation='relu')(pooled_output)
    x = Dropout(0.3)(x)
 
    # Add output layer
    outputs = Dense(3, activation="softmax")(x)

    # Construct a new model
    model = Model(inputs=inputs, outputs=outputs, )
    return model

In [27]:
model = nlp_model(bert_model)
model.summary()

KerasTensor(type_spec=TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), name='tf_bert_model/bert/pooler/dense/Tanh:0', description="created by layer 'tf_bert_model'")
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 500)]        0           []                               
                                                                                                  
 input_masks (InputLayer)       [(None, 500)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 500)]        0           []                               
                                                                                                  
 tf_bert_model

## Model training

In [28]:
df["sentiment_num"] = df["sentiment"].factorize()[0]

In [29]:
df.head()

Unnamed: 0,text,sentiment,sentiment_num
0,EU to com muita dor de cabe a j tomei rem dio e n o mudou nada aaaa eu preciso estudar mas n o consigo,Negativo,0
1,faz mais de um m s que eu n o vejo a mel,Negativo,0
2,called me foda isso mesmo.. meu pai que advogado.. tem processo que ta a fcking anos demorando...se fosse rapida essa justi a tudo seria melhor x e o pior que eu pago imposto e os cara parece que n o trabalha D,Positivo,1
3,tamb m,Negativo,0
4,Eu tamb m,Positivo,1


In [None]:
# Create examples for training and testing

df = df.sample(frac=1) # Shuffle the dataset
tokenizer = create_tonkenizer(model.layers[3])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['sentiment_num'], 
    test_size=0.3, 
    stratify=df['sentiment_num'], 
    random_state=15 
    )

print( "\nx_train: {}; \tX_test: {}".format(X_train.shape, X_test.shape))
print("\ny_test: \n{}, \n\ny_train: \n{}".format(y_train.value_counts(normalize=True), y_test.value_counts(normalize=True) ) )

X_train = convert_sentences_to_features(X_train, tokenizer)
X_test = convert_sentences_to_features(X_test, tokenizer)

y_train = to_categorical( y_train )
y_test =  to_categorical( y_test )



x_train: (549688,); 	X_test: (235581,)

y_test: 
0    0.665397
1    0.334603
Name: sentiment_num, dtype: float64, 

y_train: 
0    0.665397
1    0.334603
Name: sentiment_num, dtype: float64


100%|████████████████████████████████▉| 549417/549688 [02:50<00:00, 3516.83it/s]

In [None]:
y_train

In [None]:
# callback

checkpoint_path = "./sentiment_analysis_model"
ckpt = tf.train.Checkpoint(model=model)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

class CustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [None]:
# Train the model
BATCH_SIZE = 4
EPOCHS = 2

# Use Adam optimizer to minimize the categorical_crossentropy loss
opt = Adam(learning_rate=2e-5)

# loss = tf.keras.losses.CategoricalCrossentropy()
# metric = tf.keras.metrics.CategoricalAccuracy()


# softmax_cross_entropy_with_logits
model.compile(optimizer=opt, 
              loss= 'categorical_crossentropy', #binary_crossentropy
              metrics = ['categorical_accuracy']
              )

# Fit the data to the model
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    verbose = 1,
                    callbacks=[CustomCallback()]
                    )


In [None]:
def save_model(model, name, path, h5=False):
    '''
    model, model_name, path, h5(optional)
    '''
    
    if h5:
        model.save( "{}.h5".format(name) )
    else:
        model.save( name )


In [None]:
save_model(model, "sentiment_model", "trained_model")

In [None]:
history.history

## Analysis of model performance

In [None]:
# Predict on test dataset
from sklearn.metrics import classification_report, confusion_matrix
pred_test = np.argmax(model.predict(X_test), axis=1)

In [None]:
print(classification_report(np.argmax(y_test,axis=1), pred_test))

In [None]:
print(pred_test[:40])
print( y_test[:40].argmax(1) )

# Predict

In [None]:
def get_predictions(model_, sentence):
    sent = []
    sent.append(sentence)
    sentence_feature = convert_sentences_to_features(sent, tokenizer)

    prediction = np.argmax(model_.predict( sentence_feature ) , axis=1) 

    # Show Positivo/Negativo
    pred = ["Negativo" if x == 0 else "Positivo" if x == 2 else "Neutro"  for x in prediction]

    return pred

In [None]:
# Predict
get_predictions( model, "Aquele ator é ruim" )

In [None]:
get_predictions( model, "Eu gosto do seu sorriso" )