## Data Cleaning from ENG

In [1]:
import os
import shutil
import tarfile
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertForSequenceClassification
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Not required but useful
from transformers import logging
logging.set_verbosity_error()
from tqdm.notebook import tqdm
tqdm.pandas()

2024-03-17 19:03:24.116915: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the IMDB review CSV and keep only the English text, Spanish text and label columns
KEEP_COLUMNS = ['review_en', 'review_es', 'sentiment']
df = pd.read_csv('IMDB Dataset SPANISH.csv', encoding='utf8').loc[:, KEEP_COLUMNS]

In [3]:
# Draw two disjoint random samples from the full dataset: one for fine-tuning, one for testing.
train_n = 5000
test_n = 5000

# Seed every random number generator in play so sampling and the later
# fine-tuning (weight initialisation) are as repeatable as possible.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

train_df = df.sample(train_n, random_state=42)

# Remove the training rows by index label so no review can land in both sets.
# dropna() is kept so rows with missing values stay out of the test pool.
test_df = df.drop(index=train_df.index).dropna().sample(test_n, random_state=42)

# Guard against train/test leakage.
assert train_df.index.intersection(test_df.index).empty, "train and test samples overlap"

## Sentiment Analysis: Spanish

In [4]:
# Hold out 25% of the sampled training reviews for validation (75% train / 25% validation),
# stratified on the sentiment label so both splits keep the same class balance.
train_reviews_es = train_df['review_es']
train_labels_es = train_df['sentiment']

x_train_es, x_val_es, y_train_es, y_val_es = train_test_split(
    train_reviews_es,
    train_labels_es,
    test_size=0.25,
    random_state=42,
    stratify=train_labels_es,
)

In [5]:
# Tokenizes and encodes the Spanish training and validation sentences.
# Checkpoints tried: google-bert/bert-base-multilingual-uncased and dccuchile/bert-base-spanish-wwm-uncased
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased', do_lower_case=True)
max_len = 128

# Both splits are encoded with the same options: pad, truncate to max_len tokens,
# and return TensorFlow tensors.
encode_kwargs = dict(
    padding=True,
    truncation=True,
    max_length=max_len,
    return_tensors='tf',
)

X_train_encoded_es = tokenizer.batch_encode_plus(x_train_es.tolist(), **encode_kwargs)
X_val_encoded_es = tokenizer.batch_encode_plus(x_val_es.tolist(), **encode_kwargs)

2024-03-17 19:03:47.181636: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10398 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:60:00.0, compute capability: 6.1


In [6]:
# Loads the pretrained Spanish BERT checkpoint with a 2-label classification head
# and compiles it for fine-tuning.
LEARNING_RATE = 5e-5

# Training configuration: the loss is told it receives raw logits (from_logits=True).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE)

model = TFBertForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-uncased',
    num_labels=2,
)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

In [7]:
# Builds tf.data datasets of (encoded inputs, label) pairs for model fitting.
# Labels are integer class ids (1 for 'positive', 0 for anything else), which is
# the format SparseCategoricalCrossentropy expects, rather than Python booleans.
train_dataset_es = tf.data.Dataset.from_tensor_slices((
    dict(X_train_encoded_es),
    (y_train_es == 'positive').astype('int32').to_numpy()
))

# NOTE: despite its name, this dataset is built from the validation split
# (X_val_encoded_es / y_val_es), not from test_df. The name is kept because
# later cells refer to it.
test_dataset_es = tf.data.Dataset.from_tensor_slices((
    dict(X_val_encoded_es),
    (y_val_es == 'positive').astype('int32').to_numpy()
))

In [None]:
# Fits model based on data.
# Batching is applied to the datasets themselves; Keras' `batch_size` argument
# must not be given when the inputs are tf.data.Dataset objects, so it is not passed.
BATCH_SIZE = 16

model.fit(train_dataset_es.batch(BATCH_SIZE),
          epochs=4,
          validation_data=test_dataset_es.batch(BATCH_SIZE))


Epoch 1/4

In [None]:
# Runs repeated trials to estimate test accuracy: each trial scores the model
# on a different random subsample of test_df (seeded by the trial number).
N_TRIALS = 15
TRIAL_SAMPLE_SIZE = 1000

# NOTE(review): `predict` is not defined in any visible cell of this notebook;
# it must be defined before this cell for a fresh Restart & Run All to work.
# It is expected to return labels comparable to the `sentiment` column.

# Start from an empty list on every run, so the cell works on a fresh kernel
# and re-running it does not append to results from a previous run.
accuracies = []

for i in range(N_TRIALS):
    sample_df = test_df.sample(n=TRIAL_SAMPLE_SIZE, random_state=i)
    sample_df['es_sentiment'] = sample_df['review_es'].progress_apply(predict)

    # Fraction of rows where the predicted label equals the true label.
    accuracy = float((sample_df['sentiment'] == sample_df['es_sentiment']).mean())

    accuracies.append(accuracy)
    print("trial accuracy for", i, "=", accuracy)

average_accuracy_es = np.mean(accuracies)
print("mean spanish accuracy =", average_accuracy_es)

In [None]:
# Persist the per-trial accuracies to CSV, then load them back so the saved
# row index becomes a named 'trials' column.
results_path = 'test_df_with_bert_only.csv'

accuracies_df = pd.DataFrame({'accuracy': accuracies})
accuracies_df.to_csv(results_path)

test_df_es = pd.read_csv(results_path).rename(columns={'Unnamed: 0': 'trials'})

In [None]:
# Inspect the held-out test sample drawn from df
test_df

In [None]:
# Inspect the per-trial accuracy table reloaded from CSV ('trials' is the saved row index)
test_df_es

In [None]:
# NOTE(review): test_df has not been modified since it was last displayed above,
# so this cell repeats that output and can likely be removed.
test_df

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Gets BERT sentiment for every review in the held-out test sample.
# NOTE(review): `predict` is not defined in any visible cell of this notebook;
# it must be defined before this cell runs.
test_df['bert_es_sentiment'] = test_df['review_es'].progress_apply(predict)

# Keep and report the accuracy: a bare expression in the middle of a cell is
# evaluated but never displayed, so the value would otherwise be lost.
bert_es_accuracy = test_df.query('sentiment == bert_es_sentiment').shape[0] / test_df.shape[0]
print("bert spanish test accuracy =", bert_es_accuracy)

# Rows are actual labels, columns are predicted labels.
# NOTE(review): the hard-coded axis names assume exactly two label values that
# sort as (negative, positive) — confirm against the data and `predict`.
conf_matrix = confusion_matrix(test_df['sentiment'], test_df['bert_es_sentiment'])

conf_matrix_df = pd.DataFrame(conf_matrix,
                              index=['True Neg', 'True Pos'],
                              columns=['Predicted Neg', 'Predicted Pos'])

# Explicit figure/axes interface so the heatmap and its labels target the same axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_matrix_df, annot=True, fmt="d", ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
# NOTE(review): test_df_es is unchanged since it was displayed earlier; repeated display.
test_df_es