In [None]:
!pip install lime
!pip3 install openpyxl
!pip install transformers
!pip install --force-reinstall -v "openpyxl==3.1.0" 

**Importul bibliotecilor necesare**

In [None]:
import numpy as np

# Pandas ne ajuta sa citim fisiere .xlsx
import pandas as pd

import random

import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer

import openpyxl

# train_test_split ne ajuta sa impartim setul de date citit in set de 
# antrenare si set de testare
from sklearn.model_selection import train_test_split

# lime ne ajuta sa verificam prezicerile facute si sa vedem logica din spate
# in format human readable
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline

**Definirea setului de date**

In [None]:
# Read the two review spreadsheets (negative and positive reviews, per the file names).
df_1 = pd.read_excel('/kaggle/input/review-uri/reviews_negative_123.xlsx')
df_2 = pd.read_excel('/kaggle/input/review-uri/reviews_positive_45.xlsx')
frames = [df_1, df_2]

# Stack both frames and discard every row that has a missing value in any column.
data = pd.concat(frames).dropna()

**Impartirea setului de date in set de antrenare, set de testare si set de validare**

In [None]:
# Two consecutive splits with the same fixed seed: the first holds out the test set,
# the second carves the validation set out of the remaining training rows.
train, test = train_test_split(data, shuffle=True, random_state=100)
train, val = train_test_split(train, shuffle=True, random_state=100)

# Review texts come from the 'body' column.
train_reviews, test_reviews, val_reviews = (
    split['body'].to_list() for split in (train, test, val)
)

# Labels come from the 'rating' column, shifted down by one
# (presumably 1-5 stars -> class ids 0-4 for the 5-way softmax head).
train_ratings, test_ratings, val_ratings = (
    [rating - 1 for rating in split['rating'].to_list()] for split in (train, test, val)
)

# Quick look at one validation example and its shifted label.
print(val_reviews[0])
print(val_ratings[0])

**Crearea environment-ului de lucru**

In [None]:
# Load the pretrained tokenizer and encoder from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("readerbench/RoBERT-base")
bert = TFAutoModel.from_pretrained("readerbench/RoBERT-base")

# Three variable-length int32 inputs. Callers must pass the arrays in the order used
# by `inputs=[...]` below: token ids, token type ids, attention mask.
token_ids = tf.keras.layers.Input((None,), dtype=np.int32)
type_ids = tf.keras.layers.Input((None,), dtype=np.int32)
attention = tf.keras.layers.Input((None,), dtype=np.int32)
bert_output = bert(input_ids=token_ids, attention_mask=attention, token_type_ids=type_ids)
# Keep only the hidden state at sequence position 0 as the sequence representation.
cls_output = bert_output.last_hidden_state[:,0,:]
# Disabled alternative: masked average pooling over all positions.
# avg_output = tf.keras.layers.GlobalAveragePooling1D()(bert_output.last_hidden_state, mask=attention)
# Classification head: 32-unit tanh layer followed by a 5-way softmax.
hidden = tf.keras.layers.Dense(32, activation="tanh")(cls_output)
output = tf.keras.layers.Dense(5, activation="softmax")(hidden)
# NOTE: later cells address layers by index (model.layers[3]); changing the order in
# which layers are created here may change those indices.
model = tf.keras.Model(inputs=[token_ids, type_ids, attention], outputs=[output])

**Detectarea layer-ului aferent lui Bert**

In [None]:
# model.summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()
print("---------------------------------------------")
# Show the layers at indices 3 and 4 to identify which one is the encoder.
print(model.layers[3])
print(model.layers[4])

**Stabilirea optimizatorului si a functiei loss pentru compilare**

In [None]:
# Adam settings for the first training phase.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=2e-05,
    epsilon=1e-08,
)
# Integer class labels; the model already outputs probabilities, hence from_logits=False.
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False,
)

**Setarea layer-ului aferent lui Bert ca inactiv si compilarea modelului**

In [None]:
# Freeze the pretrained encoder so the first training phase only fits the new head.
# The encoder object is referenced directly instead of through the positional index
# model.layers[3], which silently targets another layer if the architecture changes.
bert.trainable = False
# `trainable` changes only take effect for training after (re)compiling.
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

**Prelucrarea setului de date**

In [None]:
# Tokenize every split with identical settings. Truncation caps each review at the
# number of positions the encoder supports; without it, a single over-long review
# produces position indices outside the encoder's embedding table.
tokenizer_kwargs = dict(
    padding=True,
    truncation=True,
    max_length=bert.config.max_position_embeddings,
    return_tensors="np",
)
train_reviews_encoded = tokenizer(train_reviews, **tokenizer_kwargs)
val_reviews_encoded = tokenizer(val_reviews, **tokenizer_kwargs)
test_reviews_encoded = tokenizer(test_reviews, **tokenizer_kwargs)

**Stabilirea parametrilor pentru antrenarea modelului**

In [None]:
# Order of the arrays must match the model's inputs: token ids, type ids, attention mask.
input_keys = ('input_ids', 'token_type_ids', 'attention_mask')

# Validation tuple in the (inputs, labels) form expected by model.fit.
val_data = ([val_reviews_encoded[key] for key in input_keys], np.array(val_ratings))

# Training inputs; the training cells read them through the name `input`.
input = [train_reviews_encoded[key] for key in input_keys]

**Antrenarea modelului**

In [None]:
# First training phase: one epoch in small batches, validated on the held-out split.
model.fit(
    x=input,
    y=np.array(train_ratings),
    batch_size=6,
    epochs=1,
    validation_data=val_data,
)

**Activarea layer-ului aferent lui Bert**

In [None]:
# Unfreeze the pretrained encoder for fine-tuning. It is referenced directly rather
# than through the positional index model.layers[3], which depends on layer order.
bert.trainable = True

**Modificarea parametrilor de compilare astfel incat sa avem o rata de invatare mai mica**

In [None]:
# Fresh Adam instance with a lower learning rate for the second training phase.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=1e-05,
    epsilon=1e-08,
)

**Recompilarea modelului**

In [None]:
# Recompile so the new optimizer and the changed `trainable` flag take effect.
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
)

**Reantrenarea modelului**

In [None]:
# Second training phase: two more epochs on the same training inputs.
model.fit(
    x=input,
    y=np.array(train_ratings),
    batch_size=6,
    epochs=2,
    validation_data=val_data,
)

**Evaluarea modelului**

In [None]:
# Rebind `input` to the test arrays (same order as the model's inputs); the
# prediction cell that follows reads this name as well.
input = [
    test_reviews_encoded[key]
    for key in ('input_ids', 'token_type_ids', 'attention_mask')
]

# Loss and accuracy on the held-out test split.
model.evaluate(
    x=input,
    y=np.array(test_ratings),
    batch_size=6,
)

**Prezicerea setului de testare**

In [None]:
# Per-class probabilities for whatever `input` currently holds (the test arrays when
# the cells are run in order).
predicted_sentiments = model.predict(x=input)
print(predicted_sentiments)

In [None]:
# The model's last layer is already a softmax, so the predictions are probabilities;
# applying softmax a second time is unnecessary. The predicted class is the argmax.
# (The name is kept because later cells read `predicted_ratings_binary`; the values
# are class ids 0-4, not binary.)
predicted_ratings_binary = np.argmax(predicted_sentiments, axis=1)

# Show predicted class ids next to the true ones for a quick visual comparison.
print(predicted_ratings_binary)
print("-----------------------------------")
print(test_ratings)

**Vizualizarea performantei modelului**

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Rows are true classes, columns are predicted classes. `labels` pins the matrix to
# all five class ids, so it stays 5x5 even when a class is missing from both the
# test labels and the predictions (later cells index it up to [4, 4]).
cm = confusion_matrix(test_ratings, predicted_ratings_binary, labels=list(range(5)))
print(cm)

import seaborn as sns
# fmt="d" prints the integer counts as-is; the default format switches to
# scientific notation once a count has three or more digits.
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
import matplotlib.pyplot as plt

def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          title=None,
                          cmap=None):
    '''
    Plot a confusion matrix as an annotated Seaborn heatmap.

    Arguments
    ---------
    cf:            Confusion matrix (2-D array-like; rows = true labels, columns = predicted labels).
    group_names:   List of strings shown in each square, row by row. Ignored unless its
                   length equals the number of cells in cf.
    categories:    List of strings displayed on the x and y axes. Default is 'auto'.
    count:         If True, show the raw count in each square. Default is True.
    percent:       If True, show each square as a percentage of all observations. Default is True.
    cbar:          If True, show the color bar. Default is True.
    xyticks:       If falsy, hide the x and y tick labels. Default is True.
    xyplotlabels:  If True, show 'True label' and 'Predicted label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure (accuracy; plus
                   precision, recall and F1 for a 2x2 matrix). Default is True.
    figsize:       Tuple with the figure size. Default is the matplotlib rcParams value.
    title:         Title for the heatmap. Default is None (no title).
    cmap:          Colormap forwarded to seaborn.heatmap. Default is None (seaborn's default).

    Returns
    -------
    None. The figure is drawn on a new matplotlib figure.
    '''
    cf = np.asarray(cf)
    total = np.sum(cf)

    # TEXT INSIDE EACH SQUARE: optional group name, count and percentage
    blanks = [''] * cf.size

    if group_names and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        # An all-zero matrix has no meaningful proportions; show 0% instead of NaN.
        fractions = cf.flatten() / total if total else np.zeros(cf.size)
        group_percentages = ["{0:.2%}".format(value) for value in fractions]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip()
                  for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # SUMMARY STATISTICS shown under the x axis
    if sum_stats:
        # Accuracy is the sum of the diagonal divided by the total number of observations.
        accuracy = np.trace(cf) / total if total else float('nan')

        if len(cf) == 2:
            # Binary matrix: class 1 is treated as the positive class.
            predicted_pos = np.sum(cf[:, 1])
            actual_pos = np.sum(cf[1, :])
            # Undefined ratios (zero denominator) are reported as 0 instead of NaN.
            precision = cf[1, 1] / predicted_pos if predicted_pos else 0.0
            recall = cf[1, 1] / actual_pos if actual_pos else 0.0
            if precision + recall:
                f1_score = 2 * precision * recall / (precision + recall)
            else:
                f1_score = 0.0
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # FIGURE PARAMETERS
    if figsize is None:
        # Fall back to matplotlib's default figure size.
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Passing False as tick labels makes seaborn hide them.
        categories = False

    # HEATMAP
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar,
                xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)

In [None]:
# Plot the multi-class confusion matrix. The binary-only helper lists that used to be
# defined here ("True Neg"/"False Pos"/... and "Zero"/"One") were never passed to the
# call and do not describe a 5-class matrix, so they are dropped.
make_confusion_matrix(cm, figsize=(8,6))

**Afisarea datelor prezise gresit**

In [None]:
import shutil

from_directory = '/kaggle/input/wrong-predicted-reviews'
to_directory = '/kaggle/working/'

# distutils is deprecated and was removed from the standard library in Python 3.12;
# shutil.copytree with dirs_exist_ok=True likewise copies into an existing directory.
shutil.copytree(from_directory, to_directory, dirs_exist_ok=True)

In [None]:
file_path = '/kaggle/working/wrong-predicted-reviews.xlsx'
wb = openpyxl.Workbook()
sheet = wb.active

# One row per misclassified test review: text, predicted class id, true class id.
# The workbook is saved once at the end; saving the empty workbook first only wrote
# a file that was immediately overwritten.
row = 0
for review, predicted, actual in zip(test_reviews, predicted_ratings_binary, test_ratings):
    if predicted != actual:
        row += 1
        for column, value in enumerate((review, predicted, actual), start=1):
            sheet.cell(row=row, column=column, value=value)

wb.save(file_path)

**Afisarea valorilor pentru precision, recall si f1 score**

In [None]:
def _safe_divide(numerator, denominator):
    """Element-wise division that yields 0 wherever the denominator is 0."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(numerator, denominator,
                     out=np.zeros_like(numerator), where=denominator != 0)


# Per-class metrics computed for every class in the matrix at once instead of five
# copy-pasted blocks. A class that was never predicted (or never occurs) gets 0
# rather than NaN, so the average F1 stays a number.
true_positives = np.diag(cm)
predicted_totals = np.sum(cm, axis=0)   # column sums: how often each class was predicted
actual_totals = np.sum(cm, axis=1)      # row sums: how often each class truly occurs

precisions = _safe_divide(true_positives, predicted_totals)
recalls = _safe_divide(true_positives, actual_totals)
f1_scores = _safe_divide(2 * precisions * recalls, precisions + recalls)

# Unweighted mean of the per-class F1 scores.
f1_score_avg = f1_scores.mean()

for label, (precision, recall, f1_score) in enumerate(zip(precisions, recalls, f1_scores), start=1):
    print("Label {}: Precision={:0.3f}, Recall={:0.3f}, F1 Score={:0.3f}".format(
        label, precision, recall, f1_score))
print("Average F1 Score={:0.3f}".format(f1_score_avg))

In [None]:
def new_predict(text):
    """Return the model's class probabilities for one text or a list of texts.

    Parameters
    ----------
    text : str or list of str
        Raw review text(s); passed to the tokenizer as-is.

    Returns
    -------
    Whatever ``model.predict`` returns for the encoded batch (one row of
    probabilities per input text).
    """
    # Truncate to the number of positions the encoder supports so an over-long text
    # cannot index past the encoder's position table.
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=bert.config.max_position_embeddings,
        return_tensors="np",
    )
    # Same order as the model's inputs: token ids, type ids, attention mask.
    model_inputs = [encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']]
    return model.predict(model_inputs)

In [None]:
import lime
import lime.lime_tabular

from lime.lime_text import LimeTextExplainer

# The former `test_data` / `test_array` construction is dropped: it stacked the review
# texts and the integer labels into one NumPy array (coercing the labels to strings)
# and the result was never read.

# Display names for the five classes; position i names class id i.
explainer = LimeTextExplainer(class_names=[1, 2, 3, 4, 5])

In [None]:
# Explain the prediction for the first test review, restricted to the single most
# probable label and allowing up to 512 features in the explanation.
exp = explainer.explain_instance(
    test_reviews[0],
    new_predict,
    num_features=512,
    top_labels=1,
)

In [None]:
# Render the LIME explanation inline in the notebook output.
exp.show_in_notebook()

In [None]:
# Save the whole model to an HDF5 file in the working directory.
# NOTE(review): the model wraps a Hugging Face encoder; whether it round-trips through
# H5 should be confirmed by actually reloading it (see the load_model cell).
model.save('bert_5.h5')

In [None]:
from tensorflow.keras.models import load_model

In [None]:
import transformers as trans

# Reload the saved model; the encoder class is supplied through custom_objects so
# Keras can resolve the "TFBertModel" name stored in the file.
loaded_model = load_model(
    'bert_5.h5',
    custom_objects={"TFBertModel": trans.TFBertModel},
)

In [None]:
# Print the architecture of the reloaded model for comparison with the original.
loaded_model.summary()

In [None]:
# Sanity-check the model that was just reloaded from disk. new_predict() always uses
# the in-memory `model`, so calling it here would not exercise `loaded_model` at all;
# the review is therefore encoded and passed to `loaded_model` directly.
encoded = tokenizer(test_reviews[0], padding=True, return_tensors="np")
output = loaded_model.predict(
    [encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']]
)
print(output)