In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from google.colab import files
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab file system.
drive.mount("/content/drive")

# Folders used by f_load_data (input) and f_save_df (output).
# NOTE(review): both paths are relative to the current working directory,
# not to the Drive mount point above - confirm this is intended.
_PATH_FOLDER_OUT = "./outputs/"
_PATH_FOLDER_IN = "./inputs/"

In [None]:
# Fail fast when no GPU is visible to TensorFlow, before any model is trained.
# Uses the stable tf.config.list_physical_devices API rather than the
# tf.config.experimental alias.
num_gpus_available = len(tf.config.list_physical_devices('GPU'))
print("Num GPUs Available: ", num_gpus_available)
assert num_gpus_available > 0, "No GPU visible to TensorFlow - enable a GPU runtime before running this notebook"

In [None]:
!pip install transformers

In [None]:
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification


In [None]:
import nltk
import re
# Fetch the NLTK stop-word corpus at notebook start (network access required).
# NOTE(review): stopwords, PorterStemmer and re are not referenced by the cells
# visible in this notebook - confirm whether they are still needed.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# train_test_split is used by f_train_model to split reviews and labels.
from sklearn.model_selection import train_test_split

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures


# Notebook-level tokenizer shared by f_train_model (encoding the training
# sentences) and f_get_prediction (encoding the single review to classify).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def f_load_data():
    """Load the aggregated review table and return the two modelling columns.

    Reads ``df_agg.csv`` (semicolon-separated) from ``_PATH_FOLDER_IN``, stores a
    lower-cased copy of the ``text`` column as ``short_review`` and returns a
    DataFrame holding only ``short_review`` and ``Sentiment``.
    """
    raw = pd.read_csv(_PATH_FOLDER_IN + 'df_agg.csv', sep=';')
    raw['short_review'] = raw['text'].str.lower()
    return raw[["short_review", "Sentiment"]]


In [None]:

def f_save_df(df_save, file_name_in):
  """Write ``df_save`` as ``<file_name_in>.csv`` into the output folder.

  Parameters
  ----------
  df_save : pandas.DataFrame
      Table to export (semicolon-separated, index not written).
  file_name_in : str
      File name without the ``.csv`` extension.
  """
  import os  # local import so this cell does not depend on an edit to the import cell

  # Create the output folder on first use; to_csv does not create missing directories.
  os.makedirs(_PATH_FOLDER_OUT, exist_ok=True)
  df_save.to_csv(os.path.join(_PATH_FOLDER_OUT, file_name_in + ".csv"), sep=';', index=False)


In [None]:
def f_train_model(df_in):
    """Fine-tune a fresh ``bert-base-uncased`` sequence classifier on ``df_in``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain a ``short_review`` column (texts) and a ``Sentiment``
        column (labels). Labels are presumably the integers 0/1, matching the
        label list used in f_get_prediction - confirm against the data.

    Returns
    -------
    TFBertForSequenceClassification
        The model after two training epochs.
    """
    reviews = df_in['short_review'].values.tolist()
    labels = df_in['Sentiment'].tolist()

    # A random 10% of the rows is held out, so training uses the remaining 90%.
    # The held-out part is not used for validation; to enable it, tokenize it
    # like the training sentences and pass it to fit() as validation_data.
    training_sentences, _, training_labels, _ = train_test_split(
        reviews, labels, test_size=.10)

    # truncation=True without max_length truncates to the tokenizer's model maximum.
    train_encodings = tokenizer(training_sentences,
                                truncation=True,
                                padding=True)
    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        training_labels
    ))

    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
    model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

    # The batch size is set by Dataset.batch(2); batch_size must not also be
    # passed to fit() when the input is a tf.data.Dataset.
    model.fit(train_dataset.shuffle(100).batch(2),
              epochs=2)
    return model


In [None]:
def f_get_prediction(model_in, df_in):
    """Classify the last review in ``df_in`` and return the predicted label.

    Parameters
    ----------
    model_in
        Trained classifier; element ``[0]`` of its ``predict`` output is fed to
        a softmax over axis 1 (presumably the logits - confirm against the
        transformers version in use).
    df_in : pandas.DataFrame
        Must contain ``short_review`` and ``Sentiment``; only the last row is used.

    Returns
    -------
    int
        0 or 1, the class with the highest softmax score.
    """
    review_text = df_in['short_review'].iloc[-1]
    true_sentiment = df_in['Sentiment'].iloc[-1]

    encoded_review = tokenizer.encode(
        review_text,
        truncation=True,
        padding=True,
        return_tensors="tf",
    )
    raw_scores = model_in.predict(encoded_review)[0]

    # Softmax over the class axis, then take the most probable class of the
    # single example in the batch.
    class_ids = (0, 1)
    class_scores = tf.nn.softmax(raw_scores, axis=1)
    best_position = tf.argmax(class_scores, axis=1).numpy()[0]
    predicted_label = class_ids[best_position]

    print(f'sentiment: {true_sentiment} pred: {predicted_label}')
    return predicted_label

In [None]:
# Walk-forward evaluation: for every row from position 20 onwards, fine-tune a
# fresh model on all earlier rows and predict the sentiment of that row.
# Positional slicing below relies on the default 0..n-1 index produced by
# read_csv in f_load_data, so index labels and positions coincide.
df_data = f_load_data()
df_agg = df_data.copy()
df_agg['Prediction'] = 0  # placeholder; rows before position 20 are never predicted
index_lst = df_data.index
for index_tmp in index_lst[20:]:
    print('index_tmp: ' + str(index_tmp) + '/' + str(index_lst[-1]))
    # Train strictly on the rows before index_tmp ...
    df_train = df_data.iloc[:index_tmp]
    # ... and predict row index_tmp itself (f_get_prediction uses the last row),
    # so the stored prediction lines up with the Sentiment of the same row.
    df_tmp = df_data.iloc[:index_tmp + 1]
    model_tmp = f_train_model(df_train)
    pred_tmp = f_get_prediction(model_tmp, df_tmp)
    df_agg.loc[index_tmp, 'Prediction'] = pred_tmp
    # Release the model before the next iteration builds a new one.
    del model_tmp
    tf.keras.backend.clear_session()
    # Checkpoint the results so far after every iteration.
    filename_out = '_out_' + str(index_tmp)
    f_save_df(df_agg, filename_out)

In [None]:
# Quick look at the last five rows of the result table.
print(df_agg.tail(n=5))

In [None]:
# Keep only the true label and the model prediction, then export them as a
# semicolon-separated file in the current working directory.
df_out = df_agg.loc[:, ['Sentiment', 'Prediction']]
df_out.to_csv('df_out.csv', sep=';', index=False)



In [None]:
  # Write the same table through f_save_df as well (into the output folder).
  filename_out = "df_out"
  f_save_df(df_out, file_name_in=filename_out)

In [None]:
# Confusion matrix of actual vs. predicted sentiment.
# Only rows from position 20 onwards receive a model prediction (the
# walk-forward loop iterates index_lst[20:]); the earlier rows still hold the
# placeholder 0 and would otherwise be counted as genuine class-0 predictions.
# The offset must stay in sync with the one used in that loop.
df_scored = df_out.iloc[20:]
confusion_matrix = pd.crosstab(df_scored['Sentiment'], df_scored['Prediction'], rownames=['Actual'], colnames=['Predicted'], margins = True)
print (confusion_matrix)