In [None]:
!pip install lime
!pip3 install openpyxl
!pip install transformers
!pip install --upgrade transformers
!pip install --upgrade datasets

**Importul bibliotecilor necesare**

In [None]:
import numpy as np

# Pandas ne ajuta sa citim fisiere .xlsx
import pandas as pd
import re
import random

import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

import openpyxl

# train_test_split ne ajuta sa impartim setul de date citit in set de 
# antrenare si set de testare
from sklearn.model_selection import train_test_split

# lime ne ajuta sa verificam prezicerile facute si sa vedem logica din spate
# in format human readable
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline

from transformers import DataCollatorWithPadding

**Extragerea setului de date**

In [None]:
# Load the two review workbooks. Judging by the file names these hold the
# negative and the positive reviews respectively -- TODO confirm with the data source.
df_1 = pd.read_excel('/kaggle/input/review-uri/reviews_negative_123.xlsx')
df_2 = pd.read_excel('/kaggle/input/review-uri/reviews_positive_45.xlsx')

# Binary target column: 0 for the first workbook, 1 for the second.
# A scalar is broadcast to every row, so no helper list is needed.
df_1['sentiment'] = 0
df_2['sentiment'] = 1

# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# both halves keep their own 0-based labels and every label appears twice.
frames = [df_1, df_2]
data = pd.concat(frames, ignore_index=True)

# Drop every row that has a missing value in any column.
data = data.dropna()

# Quick sanity check: show the first remaining row.
print(data.iloc[0])

**Pregatirea setului de date**

In [None]:
# prepare data
def clean_df(df):
    """Return a cleaned copy of ``df`` with normalised text and string labels.

    The ``body`` column is converted to text, dashes are replaced by spaces,
    the text is lower-cased, the whitespace character directly in front of
    ``? . ! "`` is removed when that mark is followed by whitespace or ends
    the text, whitespace runs are collapsed to single spaces and ``" </s>"``
    is appended. The ``sentiment`` column is converted to strings with
    ``"</s>"`` appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``body`` and ``sentiment``. It is not
        modified; the cleaning is applied to a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns as ``df``.

    Raises
    ------
    ValueError
        If ``body`` contains missing values (drop them before cleaning).
    """
    # Work on a copy so the caller's frame is left untouched.
    cleaned = df.copy()

    if cleaned['body'].isna().any():
        raise ValueError("clean_df expects no missing 'body' values; drop them first")

    # astype(str): a cell that is not text (e.g. a review consisting only of a
    # number) is cleaned like any other review instead of raising.
    body = cleaned['body'].astype(str)
    # strip dash but keep a space (literal match, independent of the pandas
    # version's default for `regex`)
    body = body.str.replace('-', ' ', regex=False)
    # lower case the data
    body = body.str.lower()
    # remove excess spaces near punctuation
    body = body.map(lambda text: re.sub(r'\s([?.!"](?:\s|$))', r'\1', text))
    # remove excess white spaces
    body = body.map(lambda text: " ".join(text.split()))
    # add " </s>" to end of body
    # NOTE(review): the tokenizer may append its own end-of-sequence token as
    # well, which would duplicate this marker -- confirm for the tokenizer used.
    cleaned['body'] = body + " </s>"

    cleaned['sentiment'] = cleaned['sentiment'].astype(str) + "</s>"

    return cleaned


# Clean the combined frame and rebind `data` to the result.
# NOTE: `data` is both the input and the output here, so re-running this cell
# cleans already-cleaned text again; re-run the loading cell first.
data = clean_df(data)

**Impartirea setului de date in set de antrenare, set de testare si set de validare**

In [None]:
# First carve the test set out of the full data, then carve the validation set
# out of what remains. Both calls use the same fixed seed for reproducibility.
train, test = train_test_split(data, shuffle=True, random_state=100)
train, val = train_test_split(train, shuffle=True, random_state=100)

# Plain Python lists of review texts and label strings for each split.
train_reviews, train_sentiments = train['body'].to_list(), train['sentiment'].to_list()
test_reviews, test_sentiments = test['body'].to_list(), test['sentiment'].to_list()
val_reviews, val_sentiments = val['body'].to_list(), val['sentiment'].to_list()

# Show the first review and its label for the validation, training and test split (in that order).
for reviews, sentiments in (
    (val_reviews, val_sentiments),
    (train_reviews, train_sentiments),
    (test_reviews, test_sentiments),
):
    print(reviews[0])
    print(sentiments[0])

**Incarcarea tokenizer-ului si a modelului pre-antrenat**

In [None]:
# One checkpoint name shared by the tokenizer and the TensorFlow seq2seq model,
# so the two can never point at different checkpoints.
checkpoint = 'google/flan-t5-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

**Prelucrarea setului de date**

In [None]:
# Instruction prepended to every review. The trailing space keeps the prompt
# from being glued to the first word of the review text.
task_prefix = "Score the following review on a scale from 0 to 1: "


def encode_split(reviews, sentiments):
    """Tokenize one data split for seq2seq training.

    Parameters
    ----------
    reviews : list of str
        Review texts; ``task_prefix`` is prepended to each one.
    sentiments : list
        Target labels; each one is converted with ``str`` before tokenizing.

    Returns
    -------
    tuple
        ``(input_ids, attention_mask, labels)`` as NumPy arrays. Inputs are
        truncated to 512 tokens and targets to 128, each padded to the longest
        sequence of the split. Padding positions in ``labels`` are set to -100.
    """
    encoding = tokenizer([task_prefix + review for review in reviews], padding="longest",
                         max_length=512, truncation=True, return_tensors="np")

    target_encoding = tokenizer([str(label) for label in sentiments], padding="longest",
                                max_length=128, truncation=True, return_tensors="np")

    labels = target_encoding.input_ids
    # -100 marks label positions that are padding so the loss can skip them
    # (Hugging Face convention -- see the T5 training documentation).
    labels[labels == tokenizer.pad_token_id] = -100

    return encoding.input_ids, encoding.attention_mask, labels


## TRAINING DATA ------------------------------------------------------------
train_input_ids, train_attention_mask, train_labels = encode_split(train_reviews, train_sentiments)

## VALIDATION DATA ----------------------------------------------------------
val_input_ids, val_attention_mask, val_labels = encode_split(val_reviews, val_sentiments)

**Stabilirea optimizatorului si a metricii de evaluare**

In [None]:
# Adam optimizer with an explicit learning rate and epsilon.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-05, epsilon=1e-08)
# NOTE(review): `metrics` is created here but is not passed to `model.compile`
# in this notebook, so as written it has no effect on training.
metrics=tf.metrics.SparseCategoricalAccuracy()

**Compilarea modelului**

In [None]:
# Only the optimizer is supplied: no `loss` or `metrics` argument is passed here.
# NOTE(review): Hugging Face TF models are expected to fall back to their
# built-in loss computed from the "labels" entry of each batch -- confirm for
# the installed transformers version.
model.compile(optimizer=optimizer)

**Stabilirea parametrilor pentru antrenarea modelului**

In [None]:
# Number of examples per batch when the tf.data datasets are built.
batch_size = 8
# Number of passes over the training dataset requested from model.fit.
num_epochs = 2

In [None]:
# Feature dictionaries; the keys are the argument names the model consumes.
train_dict = {
    "input_ids": train_input_ids,
    "attention_mask": train_attention_mask,
    "labels": train_labels
}

validation_dict = {
    "input_ids": val_input_ids,
    "attention_mask": val_attention_mask,
    "labels": val_labels
}

# The shuffle buffer covers the whole training set, so the shuffle is uniform
# whatever the dataset size (a fixed buffer only shuffles within a window once
# the data outgrows it). max(..., 1) keeps the buffer size valid for an empty split.
train_dataset = tf.data.Dataset.from_tensor_slices(train_dict)
train_dataset = train_dataset.shuffle(max(len(train_input_ids), 1)).batch(batch_size)

# Validation data is only evaluated, so its order does not need shuffling.
val_dataset = tf.data.Dataset.from_tensor_slices(validation_dict)
val_dataset = val_dataset.batch(batch_size)

**Antrenarea modelului**

In [None]:
# `train_dataset` and `val_dataset` are already batched, so `batch_size` is not
# passed again: Keras documents that it must not be specified for dataset inputs.
# The returned History object is kept so the per-epoch losses can be inspected later.
history = model.fit(x=train_dataset, epochs=num_epochs, validation_data=val_dataset)

In [None]:
outputs = []

# Number of reviews sent through the model per generate() call.
eval_batch_size = 8

# Generate in mini-batches instead of one review per call. Each batch is padded
# to its longest review and the attention mask tells the model which positions
# are padding.
for start in range(0, len(test_reviews), eval_batch_size):
    batch_texts = [task_prefix + review for review in test_reviews[start:start + eval_batch_size]]
    encoding = tokenizer(batch_texts, padding="longest", max_length=512,
                         truncation=True, return_tensors="tf")
    # NOTE(review): max_length=2 caps the generated sequence at two positions;
    # if a label needs more than one generated token the decoded text may be
    # incomplete or empty -- confirm against the tokenization of the labels.
    generated = model.generate(encoding.input_ids,
                               attention_mask=encoding.attention_mask,
                               max_length=2)
    outputs.extend(tokenizer.decode(row, skip_special_tokens=True) for row in generated)

In [None]:
# Show the element type of the decoded predictions, then the predictions themselves.
print(type(outputs[0]), outputs, sep="\n")

In [None]:
# Convert every decoded prediction to an int in place; an empty string counts as 0.
outputs[:] = [0 if prediction == '' else int(prediction) for prediction in outputs]

# Show the element type after conversion, then the converted predictions.
print(type(outputs[0]), outputs, sep="\n")

In [None]:
# Show the reference labels of the test split as they currently stand.
print(test_sentiments)

In [None]:
# Recover integer labels by removing the "</s>" marker and parsing the rest.
# str(label) makes the cell safe to re-run: after the first run the list holds
# ints, which have no .replace method.
test_sentiments = [int(str(label).replace("</s>", "")) for label in test_sentiments]

print(type(test_sentiments[0]))
print(test_sentiments)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of the reference test labels against the model predictions.
cm = confusion_matrix(test_sentiments, outputs)
print(cm)

import seaborn as sns
# fmt="d" prints the integer counts as-is; the default annotation format
# switches to scientific notation once a count has three or more digits.
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
import matplotlib.pyplot as plt

def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    Plot a confusion matrix as an annotated Seaborn heatmap.

    Relies on the notebook-level names ``np``, ``plt`` and ``sns``. Draws on a
    new matplotlib figure and returns nothing.

    Arguments
    ---------
    cf:            confusion matrix to be passed in (2-D array-like).
    group_names:   List of strings that represent the labels row by row to be shown in each square.
                   Ignored unless its length equals the number of cells.
    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'
    count:         If True, show the raw number in the confusion matrix. Default is True.
    percent:       If True, show each cell as a share of the total count. Default is True.
    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.
    xyticks:       If True, show x and y ticks. Default is True.
    xyplotlabels:  If True, show 'True Label' and 'Predicted Label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure. Default is True.
                   Ratios with a zero denominator are reported as 0 instead of NaN.
    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.
    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html

    title:         Title for the heatmap. Default is None.
    '''

    # Accept any 2-D array-like (e.g. nested lists), not only ndarrays.
    cf = np.asarray(cf)
    total = cf.sum()

    def _ratio(numerator, denominator):
        # A zero denominator (e.g. no predicted positives) yields 0.0 rather than NaN.
        return numerator / denominator if denominator else 0.0

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = [''] * cf.size

    if group_names is not None and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        # An all-zero matrix has no meaningful shares; show 0.00% in every cell.
        shares = cf.flatten() / total if total else np.zeros(cf.size)
        group_percentages = ["{0:.2%}".format(value) for value in shares]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is sum of diagonal divided by total observations
        accuracy = _ratio(np.trace(cf), float(total))

        # if it is a binary confusion matrix, show some more stats
        if cf.shape == (2, 2):
            # Metrics for Binary Confusion Matrices
            precision = _ratio(cf[1, 1], cf[:, 1].sum())
            recall = _ratio(cf[1, 1], cf[1, :].sum())
            f1_score = _ratio(2 * precision * recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar, xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)

In [None]:
# Cell captions in row-major order and the axis category names.
labels = ["True Neg","False Pos","False Neg","True Pos"]
categories = ["Zero", "One"]
# Pass the captions and category names to the plot (they were defined but
# never used). The category names are only applied when they match the matrix
# size; otherwise the automatic tick labels are kept.
make_confusion_matrix(
    cm,
    group_names=labels,
    categories=categories if len(categories) == cm.shape[0] else 'auto',
    figsize=(8,6),
)

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so the
# directory can be reloaded on its own, without the original checkpoint.
model.save_pretrained('flan_t5_2')
tokenizer.save_pretrained('flan_t5_2');

In [None]:
import shutil

# Archive the saved model directory as flan_t5_2.zip in the working directory.
# The directory is addressed by the same relative path used when saving, so the
# archive no longer depends on the working directory being /kaggle/working,
# and its entries start at "flan_t5_2/" instead of an absolute path.
shutil.make_archive('flan_t5_2', 'zip', root_dir='.', base_dir='flan_t5_2')