In [1]:
# Import modules
import pandas as pd
import numpy as np
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import  Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tqdm import tqdm
import matplotlib.pyplot as plt

print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)
pd.set_option('display.max_colwidth',1000)


TensorFlow Version: 2.7.0
Hub version:  0.12.0


## Data preprocessing

In [2]:
# https://www.kaggle.com/dataset/4af304c0f797e3b08f22895d6a0dcf95eee4c37f7a20775c7a4ee2281c6ba2ff
DATASET_PATH = "NoThemeTweets.csv"

In [3]:
dataset = pd.read_csv( 
         DATASET_PATH,
         engine="python", 
         encoding="latin1"
)

In [5]:
dataset.head()

Unnamed: 0,id,tweet_text,tweet_date,sentiment,query_used
0,1031761728445530112,@Tixaa23 14 para eu ir :),Tue Aug 21 04:35:39 +0000 2018,Positivo,:)
1,1031761040462278656,@drexalvarez O meu like eu jÃ¡ dei na Ã©poca :),Tue Aug 21 04:32:55 +0000 2018,Positivo,:)
2,1031760962372689920,Eu sÃ³ queria conseguir comer alguma coisa pra poder dormir :),Tue Aug 21 04:32:37 +0000 2018,Positivo,:)
3,1031760948250456066,:D que lindo dia !,Tue Aug 21 04:32:33 +0000 2018,Positivo,:)
4,1031760895985246208,"@Primo_Resmungao Pq da pr jeito!!Ã© uma ""oferta"", ha q aproveitar. :P",Tue Aug 21 04:32:21 +0000 2018,Positivo,:)


In [6]:
df = dataset[["tweet_text", "sentiment"]]

In [7]:
df.isna().sum()

tweet_text    0
sentiment     0
dtype: int64

In [8]:
df.shape

(785814, 2)

In [None]:
def get_treated_data(dataset, cols, cols_drop = [], col_to_change='sentiment', val_col_change = {"Negativo": 0, "Positivo":1}):
    
    # # 1. Criar a variável "data"
    dataset = pd.read_csv( 
         DATASET_PATH,
         engine="python", 
         encoding="latin1"
    )
        
    # 2. Rename columns
    dataset.columns = cols
    
    # 3. Drop columns not needed
    dataset.drop(cols_drop, axis=1, inplace=True)
    
    # 3.1 Drop all rows with at least one element is missing
    dataset.dropna()
    
    # 4. Convert setiments from "Negative/Positive" to "0/1" 
    # dataset.replace({col_to_change: val_col_change}, inplace=True)
    
    # Return our dataset
    return dataset

In [None]:
df.head()

In [None]:
default_cols = ["id", "text", "date", "sentiment", "query"];
default_drop_cols = ["id", "date", "query"]
# default_cols = ["sentiment", "text"];
# default_drop_cols = ["id", "date", "query"]

# class_names = ['Negativo', 'Positive']
df = get_treated_data(df, default_cols, cols_drop = default_drop_cols)

In [None]:
# Take a peek at the dataset
df["sentiment"].value_counts(normalize=True)

In [None]:
def preprocess_text(text):
    
    # Not needed to be imported globally
    from bs4 import BeautifulSoup
    import re
    text = BeautifulSoup(text, "lxml").get_text()
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE) # Remove urls
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-Z.!?']", ' ', text)
    text = re.sub(r" +", ' ', text)
    
    return text

In [None]:
df['text'] = df['text'].apply(lambda text: preprocess_text(text))

df.head(5)

In [None]:
print("The number of rows and columns in the dataset is: {}".format(df.shape))

In [None]:
# Identify missing values
df.apply(lambda x: sum(x.isnull()), axis=0)

In [None]:
# Check the target class balance
df["sentiment"].value_counts(normalize=True)

**Download token**

In [None]:
!rm -rf bert-base-portuguese-cased
!mkdir bert-base-portuguese-cased
!wget https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/bert-base-portuguese-cased_pytorch_checkpoint.zip
!wget https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/vocab.txt 

!apt-get install unzip

!unzip bert-base-portuguese-cased_pytorch_checkpoint.zip -d bert-base-portuguese-cased
!mv vocab.txt bert-base-portuguese-cased/vocab.txt 
!pip install -U transformers

In [None]:
from transformers import AutoModel # For BERTs
from transformers import AutoModeForSequenceClassification # For models fine-tuned on MNLI
from transformers import AutoTokenizer

bert_model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny") # v1 and v2

In [None]:
#from transformers import BertTokenizer, BertConfig, TFBertModel
#bert_model = TFBertModel.from_pretrained("bert-base-portuguese-cased", from_pt=True)

In [None]:
# Functions for constructing BERT Embeddings: input_ids, input_masks, input_segments and Inputs
MAX_SEQ_LEN=500 # max sequence length

def get_masks(tokens):
    """Masks: 1 for real tokens and 0 for paddings"""
    return [1]*len(tokens) + [0] * (MAX_SEQ_LEN - len(tokens))
 
def get_segments(tokens):
    """Segments: 0 for the first sequence, 1 for the second"""  
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (MAX_SEQ_LEN - len(tokens))

def get_ids(tokens, tokenizer):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens,)
    input_ids = token_ids + [0] * (MAX_SEQ_LEN - len(token_ids))
    return input_ids

def create_single_input(sentence, tokenizer, max_len):
    """Create an input from a sentence"""
    stokens = tokenizer.tokenize(sentence)
    stokens = stokens[:max_len]
    stokens = ["[CLS]"] + stokens + ["[SEP]"]
 
    ids = get_ids(stokens, tokenizer)
    masks = get_masks(stokens)
    segments = get_segments(stokens)

    return ids, masks, segments
 
def convert_sentences_to_features(sentences, tokenizer):
    """Convert sentences to features: input_ids, input_masks and input_segments"""
    input_ids, input_masks, input_segments = [], [], []
 
    for sentence in tqdm(sentences,position=0, leave=True):
      ids,masks,segments=create_single_input(sentence,tokenizer,MAX_SEQ_LEN-2)
      assert len(ids) == MAX_SEQ_LEN
      assert len(masks) == MAX_SEQ_LEN
      assert len(segments) == MAX_SEQ_LEN
      input_ids.append(ids)
      input_masks.append(masks)
      input_segments.append(segments)

    return [np.asarray(input_ids, dtype=np.int32), 
          np.asarray(input_masks, dtype=np.int32), 
          np.asarray(input_segments, dtype=np.int32)]

def create_tonkenizer(bert_layer):
    """Instantiate Tokenizer with vocab"""
    # vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy()
    # do_lower_case=bert_layer.resolved_object.do_lower_case.numpy() 
    # tokenizer=bert.bert_tokenization.FullTokenizer(vocab_file,do_lower_case)
    do_lower_case = False
    tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny") # v1 and v2
    #tokenizer = BertTokenizer("bert-base-portuguese-cased/vocab.txt", do_lower_case)
    return tokenizer

## Modelling

In [None]:
def nlp_model(callable_object):
    # Load the pre-trained BERT base model
    # bert_layer = hub.KerasLayer(handle=callable_object, trainable=True)  

    bert_layer = callable_object
   
    # BERT layer three inputs: ids, masks and segments
    input_ids = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="input_ids")           
    input_masks = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="input_masks")       
    input_segments = Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name="segment_ids")
    
    inputs = [input_ids, input_masks, input_segments] # BERT inputs
    # If using hub.KerasLayer, PLEASE, CHANGE THE ORDER of the variables, I mean: 
    # pooled_output, sequence_output = 
    sequence_output, pooled_output = bert_layer(inputs) # BERT outputs 
    
    # Add a hidden layer
    x = Dense(units=768, activation='relu')(pooled_output)
    x = Dropout(0.3)(x)
 
    # Add output layer
    outputs = Dense(3, activation="softmax")(x)

    # Construct a new model
    model = Model(inputs=inputs, outputs=outputs, )
    return model




In [None]:
model = nlp_model(bert_model)
model.summary()


## Model training

In [None]:
# Create examples for training and testing

df = df.sample(frac=1) # Shuffle the dataset
tokenizer = create_tonkenizer(model.layers[3])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['sentiment'], 
    test_size=0.3, 
    stratify=df['sentiment'], 
    random_state=15 
    )

print( "\nx_train: {}; \tX_test: {}".format(X_train.shape, X_test.shape))
print("\ny_test: \n{}, \n\ny_train: \n{}".format(y_train.value_counts(normalize=True), y_test.value_counts(normalize=True) ) )

X_train = convert_sentences_to_features(X_train, tokenizer)
X_test = convert_sentences_to_features(X_test, tokenizer)

y_train = to_categorical( y_train )
y_test =  to_categorical( y_test )


In [None]:
y_train

In [None]:
# callback

checkpoint_path = "./sentiment_analysis_model"
ckpt = tf.train.Checkpoint(model=model)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

class CustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [None]:
# Train the model
BATCH_SIZE = 10
EPOCHS = 2

# Use Adam optimizer to minimize the categorical_crossentropy loss
opt = Adam(learning_rate=2e-5)

# loss = tf.keras.losses.CategoricalCrossentropy()
# metric = tf.keras.metrics.CategoricalAccuracy()


# softmax_cross_entropy_with_logits
model.compile(optimizer=opt, 
              loss= 'categorical_crossentropy', #binary_crossentropy
              metrics = ['categorical_accuracy']
              )

# Fit the data to the model
history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    verbose = 1,
                    callbacks=[CustomCallback()]
                    )


In [None]:
def save_model(model, name, path, h5=False):
  '''
  model, model_name, path, h5(optional)
  '''
  if h5:
    !pip install -q pyyaml h5py  # Required to save models in HDF5 format
    model.save( "{}.h5".format(name) )
  else:
    model.save( name )


In [None]:
save_model(model, "sentiment_model", "trained_model")

In [None]:
history.history

## Analysis of model performance

In [None]:
# # Load the pretrained nlp_model
# from tensorflow.keras.models import load_model
# new_model = load_model('test')
# new_model.summary



In [None]:
# Predict on test dataset
from sklearn.metrics import classification_report, confusion_matrix
pred_test = np.argmax(model.predict(X_test), axis=1)

In [None]:
print(classification_report(np.argmax(y_test,axis=1), pred_test))

In [None]:
print(pred_test[:40])
print( y_test[:40].argmax(1) )

# Predict

In [None]:
def get_predictions(model_, sentence):
  sent = []
  sent.append(sentence)
  sentence_feature = convert_sentences_to_features(sent, tokenizer)

  prediction = np.argmax(model_.predict( sentence_feature ) , axis=1) 

  # Show Positivo/Negativo
  pred = ["Negativo" if x == 0 else "Positivo" if x == 2 else "Neutro"  for x in prediction]

  return pred

In [None]:
# Predict
get_predictions( model, "Aquele ator é ruim" )

In [None]:
get_predictions( model, "Eu gosto do seu sorriso" )