In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, TFRobertaModel
import tensorflow as tf

# Path of the dataset CSV on the mounted Drive.
file_path = '/content/drive/My Drive/266 Project/efcamdat_valid_data.csv'
df = pd.read_csv(file_path)

# Preview the first rows, then list the column names.
print(df.head())
print(df.columns)

   Unnamed: 0                                               text  labels
0      144914  My name is Guo Jing and my english C English n...       1
1      107289  I've traveled SP travelled to Venezuela. I've ...       1
2      224725  My name is Nancy, AS I live in Wuhan. AS I'm t...       0
3       19046  I just heard a song by Josh Woodward called 'W...       2
4      231400  Hi, It's been brought to my attention that you...       0
Index(['Unnamed: 0', 'text', 'labels'], dtype='object')


**Check data**

In [44]:
# Display the column index of the loaded frame.
df.columns

Index(['Unnamed: 0', 'text', 'labels'], dtype='object')

In [45]:
# Number of rows in the loaded frame.
df.shape[0]

40607

In [46]:
# List the distinct values of the 'labels' column before any filtering.
print("Unique label values before filtering:", df['labels'].unique())

Unique label values before filtering: [1 0 2 3 4 5]


In [47]:
# Filter out invalid labels: keep only rows whose label is one of 0..5.
has_valid_label = df['labels'].isin(range(6))
df = df[has_valid_label]

In [48]:
# Check the unique label values after filtering to make sure exactly the 6 expected labels remain.
print("Unique label values after filtering:", df['labels'].unique())

Unique label values after filtering: [1 0 2 3 4 5]


**Validation and test set sizes**

In [49]:
# Pull out the input texts and their class labels as two Series.
text, labels = df['text'], df['labels']

In [50]:
# Validation and test set sizes, expressed as absolute row counts.
n_rows = len(df)
test_size = int(0.1 * n_rows)   # 10% for testing
valid_size = int(0.2 * n_rows)  # 20% for validation

In [51]:
# Split off the test set; everything else stays in train_texts / train_labels for now.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    text,
    labels,
    test_size=test_size,
    shuffle=True,
    random_state=42,
)

In [52]:
# Split off the validation set from the remaining training data.
# train_texts / train_labels are rebound to the smaller training portion.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=valid_size,
    shuffle=True,
    random_state=42,
)

In [53]:
# Check that every split contains each label value.
for split_name, split_labels in (
    ("training", train_labels),
    ("validation", valid_labels),
    ("test", test_labels),
):
    print(f"Unique label values in {split_name} set:", split_labels.unique())

Unique label values in training set: [3 2 1 0 4 5]
Unique label values in validation set: [1 3 2 0 4 5]
Unique label values in test set: [3 0 2 4 1 5]


In [54]:
# Number of examples held out for testing.
len(test_texts)

4060

In [55]:
# Vectorize the text data.
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=10000)
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)

In [56]:
# Train a logistic regression model on the TF-IDF training vectors.
# This is the baseline that the RoBERTa-based model is evaluated against later.
baseline_model = LogisticRegression(max_iter=1000)
baseline_model.fit(train_vectors, train_labels)


**Evaluate the model**

In [57]:
# Score the baseline on the held-out test vectors and print per-class metrics.
test_preds = baseline_model.predict(test_vectors)
baseline_report = classification_report(test_labels, test_preds)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1054
           1       0.94      0.92      0.93      1124
           2       0.89      0.94      0.92      1091
           3       0.87      0.90      0.89       609
           4       0.98      0.59      0.74       159
           5       0.89      0.35      0.50        23

    accuracy                           0.92      4060
   macro avg       0.92      0.78      0.82      4060
weighted avg       0.92      0.92      0.92      4060



**Re-split the Data Before Creating TensorFlow Datasets**

In [58]:
# Work on plain Python lists from here on: the tokenization helper defined
# further down only accepts a list of strings.
text = df['text'].tolist()
labels = df['labels'].tolist()

# Fraction of rows held out at each split stage.
test_proportion = 0.1
validation_proportion = 0.1

# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    text,
    labels,
    test_size=test_proportion,
    shuffle=True,
    random_state=42,
)



In [59]:
# Split the training set into a smaller training set and validation set.
# The proportion applies to what is left after the test split, not to the
# full dataset. train_texts / train_labels are rebound, so re-running this
# cell on its own shrinks the training set again.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=validation_proportion,
    shuffle=True,
    random_state=42,
)

In [60]:
# Check the data: report the size of the full dataset and of each split.
for description, items in (
    ("Total dataset", text),
    ("Training set", train_texts),
    ("Validation set", valid_texts),
    ("Test set", test_texts),
):
    print(f"{description} size: {len(items)}")

# Show a few raw examples from each split.
for split_name, split_texts in (
    ("train_texts", train_texts),
    ("valid_texts", valid_texts),
    ("test_texts", test_texts),
):
    print(f"First 5 elements of {split_name}: {split_texts[:5]}")


Total dataset size: 40607
Training set size: 32891
Validation set size: 3655
Test set size: 4061
First 5 elements of train_texts: ["I don''t want to name He is so inconvinient and he But he can''t imagine a girl put a video in the internet when he sleep and she became a 15 minutes celebrites. He get very nervous with that and the girl is getting money with this.", "I'm Anna I'm 40. I have long brown hair and big black eyes. I'm short and slim. I have AR a big RS nose and AR a small mouth. I'm a manager. AS I like my job very much. I have three pepole SP people in my family.%%My husband, AS my son and I. I love them very much.", "Hi, Mr. Souza . PU , I have great news about the meeting I had with Mr. Lucio, on the issue of buying him food company, AS '' RS OCP Alimentos''. They really showed interest in concluding the negotiations, may join our strength and our prestige to their company, but the values ??are still a bit high for us, so we have to work to decrease and leave a level that 

In [61]:
# Define the validation and test set sizes.
# Derived from the actual splits instead of hard-coded numbers, so they cannot
# drift from the data (the previous literal for valid_size was the size of the
# whole dataset, not of the validation split).
valid_size = len(valid_texts)
test_size = len(test_texts)

**Tokenization**

In [62]:
# Maximum number of tokens kept per text; also used as the model's input length.
max_length = 50
# Pretrained tokenizer for the 'roberta-base' checkpoint.
rtokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_texts(texts, tokenizer, max_length):
    """Tokenize a list of strings into fixed-length TensorFlow tensors.

    Args:
        texts: A ``list`` of ``str``. Any other container or element type is rejected.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(texts, truncation=..., padding=..., max_length=..., return_tensors=...)``.
        max_length: Number of tokens every sequence is truncated or padded to.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.

    Raises:
        ValueError: If ``texts`` is not a list or contains a non-string element.
    """
    if not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
        raise ValueError("Input texts should be a list of strings.")
    # Pad to max_length rather than to the longest text in this call: the
    # downstream model declares a fixed input width of max_length, so every
    # split must come out with exactly that many columns.
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf',
    )

In [63]:
def safe_tokenize(texts, tokenizer, max_length):
    """Tokenize ``texts`` and report the outcome instead of raising ``ValueError``.

    Delegates to ``tokenize_texts``. On success a confirmation line is printed
    and the encodings are returned. If a ``ValueError`` occurs (invalid input,
    or encodings without an ``'input_ids'`` entry) the message is printed and
    ``None`` is returned, so callers must check for ``None``.
    """
    try:
        result = tokenize_texts(texts, tokenizer, max_length)
        if 'input_ids' in result:
            print(f"Successfully tokenized {len(texts)} texts.")
            return result
        # Raised inside the try on purpose so it is reported by the handler below.
        raise ValueError("Tokenization did not produce 'input_ids'.")
    except ValueError as err:
        print(f"Tokenization error: {err}")
    return None


In [64]:
# Tokenize the three splits in order (train, validation, test) with the same settings.
train_encodings, valid_encodings, test_encodings = (
    safe_tokenize(list(split_texts), rtokenizer, max_length)
    for split_texts in (train_texts, valid_texts, test_texts)
)

Successfully tokenized 32891 texts.
Successfully tokenized 3655 texts.
Successfully tokenized 4061 texts.


In [65]:
# safe_tokenize returns None on failure, so stop here if any split failed.
tokenized_splits = {
    "train_encodings": train_encodings,
    "valid_encodings": valid_encodings,
    "test_encodings": test_encodings,
}
if any(encoding is None for encoding in tokenized_splits.values()):
    raise ValueError("Tokenization failed for one or more datasets.")

# Tokenized encodings to verify
for split_name, encoding in tokenized_splits.items():
    print(f"Keys of {split_name}: {encoding.keys()}")

Keys of train_encodings: dict_keys(['input_ids', 'attention_mask'])
Keys of valid_encodings: dict_keys(['input_ids', 'attention_mask'])
Keys of test_encodings: dict_keys(['input_ids', 'attention_mask'])


In [66]:
# Convert to TensorFlow Datasets (features keyed by the tokenizer's own names).
# Only the training data is shuffled; every split is batched by 16 and prefetched.
autotune = tf.data.experimental.AUTOTUNE

train_features = {"input_ids": train_encodings.input_ids, "attention_mask": train_encodings.attention_mask}
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
train_dataset = train_dataset.shuffle(len(train_texts)).batch(16).prefetch(autotune)

valid_features = {"input_ids": valid_encodings.input_ids, "attention_mask": valid_encodings.attention_mask}
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_features, valid_labels))
valid_dataset = valid_dataset.batch(16).prefetch(autotune)

test_features = {"input_ids": test_encodings.input_ids, "attention_mask": test_encodings.attention_mask}
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))
test_dataset = test_dataset.batch(16).prefetch(autotune)

In [67]:
# Adjust the input dictionary keys to match the expected names in the model
def adjust_keys(encodings):
    """Return a new dict exposing the encodings under the model's input-layer names.

    ``encodings['input_ids']`` is stored under ``'input_ids_layer'`` and
    ``encodings['attention_mask']`` under ``'attention_mask_layer'``. The
    input mapping is not modified; a missing key raises ``KeyError``.
    """
    layer_name_for = (
        ('input_ids_layer', 'input_ids'),
        ('attention_mask_layer', 'attention_mask'),
    )
    return {layer_name: encodings[source_key] for layer_name, source_key in layer_name_for}

In [68]:
# Apply adjust_keys to each encoding
train_encodings_adjusted, valid_encodings_adjusted, test_encodings_adjusted = map(
    adjust_keys, (train_encodings, valid_encodings, test_encodings)
)

# Print the adjusted keys of every split to confirm the renaming.
for split_name, adjusted in (
    ("train_encodings", train_encodings_adjusted),
    ("valid_encodings", valid_encodings_adjusted),
    ("test_encodings", test_encodings_adjusted),
):
    print(f"Adjusted keys of {split_name}: {adjusted.keys()}")


Adjusted keys of train_encodings: dict_keys(['input_ids_layer', 'attention_mask_layer'])
Adjusted keys of valid_encodings: dict_keys(['input_ids_layer', 'attention_mask_layer'])
Adjusted keys of test_encodings: dict_keys(['input_ids_layer', 'attention_mask_layer'])


In [69]:
# Convert to TensorFlow Datasets whose feature keys match the model's input
# layer names. The renamed dicts produced by adjust_keys are used directly so
# the key mapping lives in one place.
def _make_batched_dataset(features, targets, batch_size=16, shuffle_buffer=None):
    """Build a batched, prefetched tf.data.Dataset of (features, targets) pairs.

    Args:
        features: Dict mapping model input names to per-example tensors.
        targets: Per-example labels, aligned with ``features``.
        batch_size: Number of examples per batch.
        shuffle_buffer: If given, shuffle with this buffer size before batching.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    if shuffle_buffer is not None:
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)


# Only the training split is shuffled (buffer covers the whole split).
train_dataset = _make_batched_dataset(train_encodings_adjusted, train_labels, shuffle_buffer=len(train_texts))
valid_dataset = _make_batched_dataset(valid_encodings_adjusted, valid_labels)
test_dataset = _make_batched_dataset(test_encodings_adjusted, test_labels)

In [70]:
# Check TensorFlow Dataset Elements: inspect the shapes of one test batch.
# The loop variables are deliberately named batch_inputs / batch_labels so the
# notebook-level `labels` list is not overwritten by a tensor.
for batch_inputs, batch_labels in test_dataset.take(1):
    print(f"Input IDs shape: {batch_inputs['input_ids_layer'].shape}")
    print(f"Attention Mask shape: {batch_inputs['attention_mask_layer'].shape}")
    print(f"Labels shape: {batch_labels.shape}")

Input IDs shape: (16, 50)
Attention Mask shape: (16, 50)
Labels shape: (16,)


**Train the Model**

In [71]:
def create_roberta_cl_model(model, num_classes=6, dropout=0.3, learning_rate=0.0001, seq_length=None):
    """Build and compile a Conv1D + LSTM classifier on top of a frozen encoder.

    Args:
        model: Encoder called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``last_hidden_state``. Its ``trainable``
            flag is set to ``False`` in place, so only the layers added here
            are trained.
        num_classes: Width of the final softmax layer.
        dropout: Dropout rate applied after the pooling layer and after the LSTM.
        learning_rate: Learning rate passed to the Adam optimizer.
        seq_length: Number of tokens per input sequence. When ``None`` the
            notebook-level ``max_length`` is used, which matches the previous
            behaviour.

    Returns:
        A compiled ``tf.keras.Model`` with two integer inputs named
        ``'input_ids_layer'`` and ``'attention_mask_layer'`` and one softmax
        output named ``'classification_layer'``.
    """
    if seq_length is None:
        seq_length = max_length

    # Freeze the encoder; this mutates the object passed in by the caller.
    model.trainable = False

    # The input names must match the feature keys of the datasets fed to fit/predict.
    input_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int64, name='input_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int64, name='attention_mask_layer')

    # Per-token hidden states of the encoder.
    model_out = model(input_ids=input_ids, attention_mask=attention_mask)
    model_out = model_out.last_hidden_state

    # Convolution over the token axis, then pooling and dropout.
    conv = tf.keras.layers.Conv1D(filters=256, kernel_size=3, activation='relu')(model_out)
    conv = tf.keras.layers.MaxPooling1D(pool_size=2)(conv)
    conv = tf.keras.layers.Dropout(dropout)(conv)

    # The LSTM returns only its final output, giving one vector per example.
    lstm = tf.keras.layers.LSTM(units=256, return_sequences=False, return_state=False)(conv)
    lstm = tf.keras.layers.Dropout(dropout)(lstm)
    classification = tf.keras.layers.Dense(num_classes, activation='softmax', name='classification_layer')(lstm)

    classification_model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=[classification])

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=0.1)
    # NOTE(review): no mixed-precision policy is set in the visible cells;
    # confirm that wrapping the optimizer for loss scaling is intended.
    optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

    # The output layer already applies softmax, hence from_logits=False.
    classification_model.compile(optimizer=optimizer,
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                                 metrics=['accuracy'])

    return classification_model



In [72]:
# Load the pretrained 'roberta-base' encoder and wrap it in the 6-class classifier.
# create_roberta_cl_model sets roberta_model.trainable to False.
roberta_model = TFRobertaModel.from_pretrained('roberta-base')
roberta_cl_model = create_roberta_cl_model(model=roberta_model, num_classes=6)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [73]:
import gc
# Clear session and free memory before starting the training
# NOTE(review): this runs after roberta_cl_model was built in the previous
# cell — confirm that clearing the Keras session at this point is intended
# rather than before the model is constructed.
tf.keras.backend.clear_session()
# The value returned by gc.collect() is shown as this cell's output.
gc.collect()

33955

In [74]:
# Print the layer-by-layer summary of the classifier.
roberta_cl_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids_layer (InputLaye  [(None, 50)]                 0         []                            
 r)                                                                                               
                                                                                                  
 attention_mask_layer (Inpu  [(None, 50)]                 0         []                            
 tLayer)                                                                                          
                                                                                                  
 tf_roberta_model (TFRobert  TFBaseModelOutputWithPooli   1246456   ['input_ids_layer[0][0]',     
 aModel)                     ngAndCrossAttentions(last_   32         'attention_mask_layer[0][

In [75]:
# Alternative validation pipeline with a batch size of 32 and the model's
# input-layer names as feature keys.
# NOTE(review): the training cell passes valid_dataset, not vdataset, so this
# dataset appears unused in the visible cells — confirm whether it is still needed.

vdataset = tf.data.Dataset.from_tensor_slices(({"input_ids_layer": valid_encodings.input_ids, "attention_mask_layer": valid_encodings.attention_mask}, valid_labels))
vdataset = vdataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

In [76]:
# Train on the shuffled training batches and evaluate on the validation batches.
# NOTE(review): the saved output of this cell shows "Epoch 1/2" / "Epoch 2/2",
# which does not match epochs=1 — the recorded results may come from an
# earlier run with a different setting; confirm and re-run.
roberta_cl_model_history = roberta_cl_model.fit(train_dataset, validation_data=valid_dataset, epochs=1)

Epoch 1/2
Epoch 2/2


In [77]:
# Predict and evaluate the model
preds = roberta_cl_model.predict(test_dataset)
# Most probable class per example, converted to a NumPy array for scikit-learn.
pred_labels = tf.argmax(preds, axis=-1).numpy()
# zero_division=0: a class that is never predicted gets a precision of 0.0
# explicitly, instead of triggering an undefined-metric warning per average.
print(classification_report(test_labels, pred_labels, zero_division=0))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1054
           1       0.87      0.91      0.89      1125
           2       0.95      0.74      0.83      1091
           3       0.71      0.90      0.79       609
           4       0.59      0.60      0.60       159
           5       0.00      0.00      0.00        23

    accuracy                           0.86      4061
   macro avg       0.67      0.68      0.67      4061
weighted avg       0.86      0.86      0.86      4061



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
