In [None]:
# Install Hugging Face transformers. %pip (unlike !pip) installs into the
# environment of the running kernel, so the import below finds the package.
%pip install -q transformers


In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer
from nltk.corpus import stopwords
from contextlib import redirect_stdout
import keras
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from transformers import TFBertModel

In [None]:
# Load the labelled train and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train_data.csv", "test_data.csv"))

In [None]:
# Replace every label name in train['labels'] with its integer class id.
# A name's position in this tuple is the id written in its place (0-12).
train_label_names = (
    'ANALYSIS', 'FAC', 'PREAMBLE', 'PRE_RELIED', 'NONE', 'ARG_PETITIONER', 'RPC',
    'RLC', 'ARG_RESPONDENT', 'RATIO', 'STA', 'ISSUE', 'PRE_NOT_RELIED',
)
for class_id, label_name in enumerate(train_label_names):
    train.loc[train['labels'] == label_name, 'labels'] = class_id

In [None]:
# Replace every label name in test['labels'] with its integer class id.
# A name's position in this tuple is the id written in its place (0-12).
test_label_names = (
    'ANALYSIS', 'FAC', 'PREAMBLE', 'PRE_RELIED', 'NONE', 'ARG_PETITIONER', 'RPC',
    'RLC', 'ARG_RESPONDENT', 'RATIO', 'STA', 'ISSUE', 'PRE_NOT_RELIED',
)
for class_id, label_name in enumerate(test_label_names):
    test.loc[test['labels'] == label_name, 'labels'] = class_id

In [None]:
# English stopword list from NLTK (kept as a list under its original name).
stop = stopwords.words('english')
# Testing membership against a list scans it for every word; a frozenset makes
# each lookup constant-time without changing which words are removed.
stop_lookup = frozenset(stop)
# Lowercase each training text, drop stopwords, and re-join the remaining
# whitespace-separated words with single spaces.
train['text_without_stopwords'] = train['text'].str.lower().apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_lookup)
)

In [None]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint.
# NOTE(review): the training texts are lowercased during stopword removal while
# this checkpoint is case-sensitive -- confirm that combination is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Load the TensorFlow BERT encoder for the same 'bert-base-cased' checkpoint as the tokenizer.
model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

In [None]:
# Encode the first preprocessed training text as a one-off sample: padded or
# truncated to 512 tokens, with special tokens, returned as TensorFlow tensors.
sample_encoding_options = dict(
    max_length=512,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors='tf',
)
first_training_text = train['text_without_stopwords'].iloc[0]
token = tokenizer.encode_plus(first_training_text, **sample_encoding_options)

In [None]:
# Pre-allocate one row of 512 token ids / attention-mask flags per training text.
# Both hold integers and feed Keras inputs declared as int32, so allocate them as
# int32 instead of NumPy's float64 default (this also halves the memory used).
X_input_ids = np.zeros((len(train), 512), dtype=np.int32)
X_attn_masks = np.zeros((len(train), 512), dtype=np.int32)

In [None]:
def generate_training_data(df, ids, masks, tokenizer):
    """Tokenize every preprocessed text in ``df`` into pre-allocated arrays.

    Args:
        df: DataFrame with a ``text_without_stopwords`` column.
        ids: 2-D array with one row per text; row ``i`` receives the token ids
            of the ``i``-th text.
        masks: Array shaped like ``ids``; row ``i`` receives the attention mask
            of the ``i``-th text.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in this
            notebook).

    Returns:
        The same ``ids`` and ``masks`` objects, filled in place.
    """
    texts = df['text_without_stopwords']
    # Pad/truncate to the width of the destination rows so an encoding always
    # fits, whatever width the caller allocated (512 in this notebook).
    max_length = ids.shape[1]
    # Wrap the texts themselves (not the enumerate object) so tqdm knows the
    # total and can display percentage and ETA.
    for i, text in enumerate(tqdm(texts, total=len(texts))):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

def prepare_data(input_text, tokenizer):
    """Tokenize a single text into the feature dict the classifier is called with.

    Args:
        input_text: The text to encode.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in this
            notebook).

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'`` tensors, padded or
        truncated to 512 tokens and cast to int32.
    """
    token = tokenizer.encode_plus(
        input_text,
        max_length=512,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    # Token ids and mask flags are integers and the model's inputs are declared
    # int32, so hand them over as int32 instead of casting them to float64.
    return {
        'input_ids': tf.cast(token.input_ids, tf.int32),
        'attention_mask': tf.cast(token.attention_mask, tf.int32)
    }

In [None]:
# Fill the pre-allocated id/mask arrays with the tokenized training texts.
X_input_ids, X_attn_masks = generate_training_data(
    df=train, ids=X_input_ids, masks=X_attn_masks, tokenizer=tokenizer
)

In [None]:
# Allocate the target matrix: one row per training example, one column per class (13).
n_train_rows = len(train)
labels = np.zeros(shape=(n_train_rows, 13))
labels.shape

In [None]:
# One-hot encode the targets: for every training row, write a 1 in the column
# given by that row's integer class id.
row_positions = np.arange(len(train))
class_ids = train['labels'].values.tolist()
labels[row_positions, class_ids] = 1

In [None]:
# Sanity check: display the distinct rows of the target matrix.
np.unique(labels, axis=0)

In [None]:
# Build a tf.data pipeline that yields one (input_ids, attention_mask, label) triple per example.
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
# take(1) is a dataset holding a single example; take(0) would hold none.
dataset.take(1) # one sample data

In [None]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    """Pair the two model inputs, keyed by input name, with their target.

    Returns:
        ``(features, labels)`` where ``features`` maps ``'input_ids'`` and
        ``'attention_mask'`` to the corresponding arguments.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [None]:
# Turn each (ids, mask, label) triple into a ({input name: tensor}, label) pair.
# NOTE(review): this rebinds `dataset`, so re-running the cell maps an already-mapped dataset.
dataset = dataset.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset 

In [None]:
# Shuffle with a 10000-element buffer, then group into batches of 16;
# an incomplete final batch is dropped.
shuffled_examples = dataset.shuffle(10000)
dataset = shuffled_examples.batch(16, drop_remainder=True)

In [None]:
# Fraction of the batches assigned to training. With p = 1 every batch is used
# for training and none is left over for validation.
p = 1
# Batches hold 16 examples (incomplete batch dropped), so there are len(train)//16 of them.
total_batches = len(train) // 16
train_size = int(total_batches * p)

In [None]:
# The first `train_size` batches are used for training; whatever follows is used for validation.
train_dataset, val_dataset = dataset.take(train_size), dataset.skip(train_size)

In [None]:
# Hyper-parameter grid: each entry pairs a dense-layer width with a learning rate.
# The rate is halved every time the width doubles.
dense_and_rate = (
    (32, 1e-5*16),
    (64, 1e-5*8),
    (128, 1e-5*4),
    (256, 1e-5*2),
    (512, 1e-5),
    (1024, 1e-5/2),
)
params = [{'dense': units, 'learning_rate': rate} for units, rate in dense_and_rate]

In [None]:
# Train, export and evaluate one classifier per hyper-parameter configuration.
# For every entry of `params` this cell: builds the model, saves its summary,
# fine-tunes for 5 epochs, saves the model, predicts the test set, and exports
# the predictions, the confusion matrix and the classification report.

# Class names in class-id order (index == integer id stored in the label columns).
class_names = ['ANALYSIS', 'FAC', 'PREAMBLE', 'PRE_RELIED', 'NONE', 'ARG_PETITIONER',
               'RPC', 'RLC', 'ARG_RESPONDENT', 'RATIO', 'STA', 'ISSUE', 'PRE_NOT_RELIED']
# Set form of the stopword list, for fast membership tests when cleaning test texts.
stopword_lookup = set(stop)


def _build_rr_model(dense_units):
    """Build the classifier: a freshly loaded BERT encoder plus a dense head.

    A new ``TFBertModel`` is loaded on every call so each configuration starts
    fine-tuning from the same pretrained weights. Reusing one shared encoder
    would carry the weights fine-tuned by the previous configuration into the
    next one and make the configurations incomparable.
    """
    bert = TFBertModel.from_pretrained('bert-base-cased')

    # Two integer inputs of 512 positions each: token ids and attention mask.
    input_ids = tf.keras.Input(shape=(512,), name='input_ids', dtype='int32')
    attn_masks = tf.keras.Input(shape=(512,), name='attention_mask', dtype='int32')

    # Index 1 of the encoder output (the pooled representation per the
    # Transformers documentation) feeds the dense head.
    bert_embds = bert.bert(input_ids, attention_mask=attn_masks)[1]
    hidden = tf.keras.layers.Dense(dense_units, activation='relu', name='intermediate_layer')(bert_embds)
    hidden = tf.keras.layers.Dense(dense_units * 2, activation='relu', name='intermediate_layer2')(hidden)
    hidden = tf.keras.layers.Dense(dense_units, activation='relu', name='intermediate_layer3')(hidden)
    output_layer = tf.keras.layers.Dense(len(class_names), activation='softmax', name='output_layer')(hidden)

    return tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)


for param in params:
    tag = str(param['dense'])
    print('Inicio do processo com o modelo de ' + tag + ' camadas')

    # Drop Keras state left over from the previous configuration before building anew.
    tf.keras.backend.clear_session()
    rr_model = _build_rr_model(param['dense'])
    rr_model.summary()

    # Write the same summary to a file by redirecting stdout while it prints.
    with open('/kaggle/working/' + tag + '_modelsummary.csv', 'w') as f:
        with redirect_stdout(f):
            rr_model.summary()

    print('Summary exportado')

    optim = tf.keras.optimizers.legacy.Adam(learning_rate=param['learning_rate'], decay=1e-6)
    loss_func = tf.keras.losses.CategoricalCrossentropy()
    acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

    rr_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

    print('Treinamento iniciado')
    hist = rr_model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=5
    )

    rr_model.save(tag + '_rr_classification_model.h5')
    print('Modelo com ' + tag + ' camadas exportado')

    output = []
    for raw_text in test["text"]:
        # Apply the same lowercasing + stopword removal the training texts
        # received, so the model is evaluated on inputs shaped like its training data.
        cleaned_text = ' '.join(
            word for word in raw_text.lower().split() if word not in stopword_lookup
        )
        model_inputs = prepare_data(cleaned_text, tokenizer)
        class_probs = rr_model(model_inputs, training=False)
        output.append(int(np.argmax(class_probs)))
    print('Teste do modelo de ' + tag + ' camadas finalizado')

    # Pair ids and predictions positionally so the result does not depend on
    # how the test frame happens to be indexed.
    df_out_id = pd.DataFrame({'id': test['id'].to_numpy(), 'labels': output})
    df_out_id.to_csv(tag + '_predict.csv', index=False)
    print('Saida do modelo de ' + tag + ' camadas exportada')

    y_true = test["labels"].astype(int).tolist()
    y_pred = df_out_id['labels'].tolist()
    # Pass every class id explicitly: otherwise a class absent from both
    # y_true and y_pred shrinks the matrix and it no longer matches the 13 labels.
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=plt.cm.Blues, xticks_rotation='vertical')
    disp.figure_.savefig(tag + '_cm.png', bbox_inches='tight')
    plt.show()
    # Release the figure so six configurations do not accumulate open figures.
    plt.close(disp.figure_)
    print('Matriz de confusao do modelo de ' + tag + ' camadas exportada')

    metrics = classification_report(y_true, y_pred)
    with open(tag + '_metrics.csv', 'w') as out:
        out.write(metrics)

    print('Metrics do modelo de ' + tag + ' camadas exportada')
    print('Processo do modelo de ' + tag + ' camadas finalizado')
    