In [2]:
!pip install tensorflow==2.12.0
!pip install tensorflow_hub==0.13.0
!pip install tensorflow_text==2.12.0


Mounted at /content/drive


In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSV is read from
# /content/drive/MyDrive later in the notebook.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [6]:
!pip install beautifulsoup4 unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: unidecode
Successfully installed unidecode-1.3.8


In [7]:
from bs4 import BeautifulSoup
import re
import unidecode

# Function to remove HTML tags and convert to lowercase
def clean_text(text):
    """Normalize a raw review string into lowercase ASCII alphanumerics.

    Steps, in order:
      1. Strip HTML markup, keeping only the visible text.
      2. Transliterate non-ASCII characters to their closest ASCII form.
      3. Drop every character that is not a letter, a digit or whitespace.
      4. Collapse whitespace runs to a single space, lowercase, and trim.

    Args:
        text: The raw review as a ``str`` (may contain HTML).

    Returns:
        The cleaned string; empty if nothing alphanumeric remains.
    """
    # separator=" " keeps the words on either side of a tag apart; without it
    # "good<br/>bad" would be glued together as "goodbad".
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    text = unidecode.unidecode(text)  # Convert Unicode characters to ASCII
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    # Collapse whitespace only AFTER removing special characters: deleting a
    # symbol that sat between two spaces ("a & b") would otherwise leave a
    # double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# Main text preprocessing function
def text_preprocess(text):
    """Run the full preprocessing pipeline on one review string.

    At present the pipeline consists solely of ``clean_text``; further steps
    (e.g. acronym normalization or word segmentation) would be chained here.

    Args:
        text: The raw review string.

    Returns:
        The preprocessed string.
    """
    return clean_text(text)

In [8]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the labelled reviews from Google Drive (the drive must be mounted first).
file_path = '/content/drive/MyDrive/ABSA/bigdata_selected.csv'
bigdata_selected = pd.read_csv(file_path)

# Features (X): the cleaned review text.
# Targets (y): one sentiment code per aspect column.
X = bigdata_selected['Review'].astype(str).apply(text_preprocess)
y = bigdata_selected[['Taste', 'Smell', 'Quality']]

# 80/20 hold-out split. The rows are shuffled with a fixed seed so that any
# ordering present in the CSV (by product, date, label, ...) cannot leak into
# the validation set, while the split stays reproducible across runs.
# An integer train_size keeps exactly the same set sizes as a plain
# head/tail slice at split_index would.
SEED = 42
split_index = int(0.8 * len(X))
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=split_index, shuffle=True, random_state=SEED
)

# Print shapes to verify split
print("Training set:")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print("Validation set:")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")


  text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags


Training set:
X_train shape: (2467,), y_train shape: (2467, 3)
Validation set:
X_val shape: (617,), y_val shape: (617, 3)


In [9]:
import numpy as np
def make_outputs(df):
    """One-hot encode every aspect column of ``df`` and concatenate per row.

    Each cell holds a sentiment code: 0 = None, 1 = Pos, 2 = Neg, 3 = Neu.
    Every code becomes a 4-slot one-hot vector, and the vectors of one row are
    laid out side by side in column order, so a frame with ``k`` columns
    yields ``4 * k`` outputs per row.

    Args:
        df: DataFrame whose cells are all sentiment codes in {0, 1, 2, 3}.

    Returns:
        ``uint8`` array of shape ``(len(df), 4 * len(df.columns))``.

    Raises:
        ValueError: If any cell is not one of the four codes (including NaN).
            Failing loudly here prevents a bad label from silently inheriting
            the encoding of the previous cell.
    """
    num_classes = 4
    values = df.to_numpy()

    valid = np.isin(values, np.arange(num_classes))
    if not valid.all():
        bad_row, bad_col = np.argwhere(~valid)[0]
        raise ValueError(
            f"Unsupported sentiment code {values[bad_row, bad_col]!r} at row "
            f"{bad_row}, column {df.columns[bad_col]!r}; expected one of 0, 1, 2, 3"
        )

    codes = values.astype(np.intp)
    # Indexing the identity matrix by code gives shape (rows, cols, 4) ...
    one_hot = np.eye(num_classes, dtype='uint8')[codes]
    # ... which is flattened per row. The explicit target shape (rather than
    # -1) keeps the reshape well-defined for an empty frame.
    return one_hot.reshape(values.shape[0], num_classes * values.shape[1])

In [10]:
# One-hot encode the aspect labels of both splits.
y_train_standardize_output, y_val_standardize_output = make_outputs(y_train), make_outputs(y_val)

# Report the encoded shapes, then display the first training target as the
# cell's output.
print('Train outputs:', y_train_standardize_output.shape)
print('Validate outputs:', y_val_standardize_output.shape)
y_train_standardize_output[0]

Train outputs: (2467, 12)
Validate outputs: (617, 12)


array([0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0], dtype=uint8)

In [11]:
from tensorflow.keras.optimizers import Adam

In [13]:
def build_classifier_model(hidden_units=(400, 200, 100, 50), dropout_rate=0.1, num_outputs=12):
    """Build a frozen-BERT text classifier with a small dense head.

    Pipeline: raw string -> TF-Hub BERT preprocessing -> BERT encoder (not
    trainable) -> pooled output -> stack of Dense(relu) + Dropout blocks ->
    sigmoid output layer.

    Args:
        hidden_units: Width of each hidden Dense layer, in order. Every Dense
            layer is followed by a Dropout layer.
        dropout_rate: Dropout rate applied after every hidden Dense layer.
        num_outputs: Number of sigmoid output units. The default of 12 matches
            the one-hot targets used in this notebook (3 aspect columns x 4
            sentiment classes).

    Returns:
        An uncompiled ``tf.keras.Model`` mapping a batch of strings to
        ``num_outputs`` values, each in [0, 1].
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Preprocessing layer: tokenizes raw text into the dict of tensors
    # (input_word_ids / input_mask / input_type_ids) the encoder consumes.
    preprocessing_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3', name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)

    # Encoder layer: trainable=False freezes BERT, so only the dense head
    # below is fitted.
    encoder = hub.KerasLayer('https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-10-h-512-a-8/2', trainable=False, name='BERT_encoder')
    outputs = encoder(encoder_inputs)

    # Extract the pooled (whole-sequence) output
    net = outputs['pooled_output']

    # Dense + dropout head, one block per entry in hidden_units
    for units in hidden_units:
        net = tf.keras.layers.Dense(units, activation='relu')(net)
        net = tf.keras.layers.Dropout(dropout_rate)(net)

    # Output layer: independent sigmoid per output unit
    net = tf.keras.layers.Dense(num_outputs, activation='sigmoid', name='classifier')(net)

    # Create and return the model
    return tf.keras.Model(inputs=text_input, outputs=net)

# Instantiate the classifier and show its layer summary.
classifier_model = build_classifier_model()
classifier_model.summary()

# Compile: Adam optimizer, binary cross-entropy over the outputs, and binary
# accuracy (threshold 0.5) as the reported metric.
adam = Adam(learning_rate=0.001)
metric_acc = tf.keras.metrics.BinaryAccuracy(
    name="binary_accuracy",
    dtype=None,
    threshold=0.5,
)
classifier_model.compile(
    loss="binary_crossentropy",
    optimizer=adam,
    metrics=[metric_acc],
)




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [17]:
# Checkpoint only the epoch with the lowest validation loss. Without
# save_best_only=True the `monitor` argument is ignored and the file is simply
# overwritten after every epoch, so it would end up holding the last epoch
# rather than the best one.
callback_model = tf.keras.callbacks.ModelCheckpoint(
    'model_small_Bert.h5',
    monitor="val_loss",
    save_best_only=True,
)

# Train the dense head; validation_data is passed as the documented
# (inputs, targets) tuple.
history = classifier_model.fit(
    x=X_train,
    y=y_train_standardize_output,
    epochs=10,
    validation_data=(X_val, y_val_standardize_output),
    callbacks=[callback_model],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# (Stray keystroke removed: a bare `g` here raised NameError on Restart & Run All.)