In [1]:
# Mount Google Drive at /content/drive so files stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, TFRobertaModel
import tensorflow as tf

# CSV location on the mounted Drive.
file_path = '/content/drive/My Drive/266 Project/efcamdat_sub.csv'
df = pd.read_csv(file_path)

# Preview the first rows, then the column index (one print call, newline-separated).
print(df.head(), df.columns, sep='\n')

       id  level  unit  learner_id learner_nationality  grade  \
0  679604      7     3      114335                  br     90   
1  151196      9     2      136139                  sa     94   
2  117084      9     4       34715                  br     88   
3  113857      7     6       90269                  fr     90   
4   22083      9     3       48465                  br     94   

                      date  topic_id  \
0  2013-11-09 20:50:31.707        51   
1  2012-09-25 06:01:08.117        66   
2  2011-08-28 08:01:15.677        68   
3  2011-07-31 14:51:22.547        54   
4  2011-08-31 16:41:04.210        67   

                                                text  cefr_numeric  \
0  From:l AS xxx@hotmail.com To: AS xxx@IXW.corpo...             3   
1  I am so glad to receive this email from you. A...             3   
2  Hi Fun Skydive, so I give up of my idea. I und...             3   
3  Dear James, Some serious problems have been br...             3   
4  Dear Sue, Thank

**Check data**

In [3]:
# Display the column index of the loaded frame.
df.columns

Index(['id', 'level', 'unit', 'learner_id', 'learner_nationality', 'grade',
       'date', 'topic_id', 'text', 'cefr_numeric', 'cefr_grouped'],
      dtype='object')

In [4]:
# Number of rows in the loaded frame.
len(df)

377967

In [7]:
# Derive the class label column by shifting `cefr_numeric` down by one
# (vectorized subtraction instead of a per-row lambda).
df['labels'] = df['cefr_numeric'] - 1

In [8]:
# Keep only the rows whose label is one of 0..5; everything else is dropped.
df = df.loc[
    df['labels'].isin(range(6))
]

In [9]:
# Check unique label values after filtering (expected to be a subset of 0..5, per the isin(range(6)) filter).
print("Unique label values after filtering:", df['labels'].unique())

Unique label values after filtering: [2 1 0 3 4 5]


**Validation and test set sizes**

In [10]:
# Model inputs (raw text) and targets (labels), taken as Series from the filtered frame.
text, labels = df['text'], df['labels']

In [11]:
# Validation and test set sizes, expressed as absolute row counts.
# Both are fractions of len(df), i.e. of the whole frame rather than of what remains after the test split.
test_size = int(0.1 * len(df))  # 10% for testing
valid_size = int(0.2 * len(df))  # 20% for validation

In [12]:
# Hold out `test_size` rows as the test set; the fixed seed keeps the shuffle repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    text,
    labels,
    test_size=test_size,
    shuffle=True,
    random_state=42,
)

In [13]:
# Take `valid_size` rows out of the remaining training data to form the validation set.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=valid_size,
    shuffle=True,
    random_state=42,
)

In [14]:
# Unique label values in each split, to confirm every class appears in train, validation and test.
print("Unique label values in training set:", train_labels.unique())
print("Unique label values in validation set:", valid_labels.unique())
print("Unique label values in test set:", test_labels.unique())

Unique label values in training set: [2 0 1 3 4 5]
Unique label values in validation set: [2 0 3 1 4 5]
Unique label values in test set: [5 1 2 3 0 4]


In [15]:
# Number of held-out test examples.
len(test_texts)

37796

In [16]:
# Vectorize the text data with TF-IDF.
# The vectorizer is fitted on the training texts only; the test texts are transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=10000)  # You can adjust the number of features
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)

In [17]:
# Train a logistic regression model on the TF-IDF features as the baseline classifier.
baseline_model = LogisticRegression(max_iter=1000)
baseline_model.fit(train_vectors, train_labels)


In [None]:
# Evaluate the model

In [19]:
# Predict on the test set
test_preds = baseline_model.predict(test_vectors)

# Evaluate the model: per-class precision/recall/F1 against the held-out labels.
print(classification_report(test_labels, test_preds))


              precision    recall  f1-score   support

           0       0.97      0.98      0.98      9970
           1       0.95      0.95      0.95      9918
           2       0.95      0.96      0.95     10079
           3       0.94      0.95      0.95      6184
           4       0.95      0.87      0.91      1444
           5       0.98      0.68      0.80       201

    accuracy                           0.96     37796
   macro avg       0.96      0.90      0.92     37796
weighted avg       0.96      0.96      0.96     37796



**Re-split the Data as Python Lists Before Creating TensorFlow Datasets**

In [20]:
# From here on the texts and labels are plain Python lists rather than pandas Series.
text, labels = df['text'].tolist(), df['labels'].tolist()

validation_proportion = 0.1

# First carve off 10% of the rows as the test set (seeded shuffle).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    text,
    labels,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)


In [21]:
# Split the training set into a smaller training set and a validation set, using the
# `validation_proportion` configured with the train/test split rather than a repeated literal.
# NOTE: this cell reassigns train_texts/train_labels from themselves, so running it a second
# time without re-running the train/test split shrinks the training set again.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=validation_proportion,
    shuffle=True,
    random_state=42,
)

In [22]:
# Check the data input: report the size of each split.
print(f"Total dataset size: {len(text)}")
print(f"Training set size: {len(train_texts)}")
print(f"Validation set size: {len(valid_texts)}")
print(f"Test set size: {len(test_texts)}")

# Print the first five raw texts of each split (this output contains the full text of each sample).
print(f"First 5 elements of train_texts: {train_texts[:5]}")
print(f"First 5 elements of valid_texts: {valid_texts[:5]}")
print(f"First 5 elements of test_texts: {test_texts[:5]}")


Total dataset size: 377967
Training set size: 306153
Validation set size: 34017
Test set size: 37797
First 5 elements of train_texts: ['February 21 Dear Mom, How are you doing? Argentina was really warm. I took a tour so that I could see the famous restaurants, and the food was delicious. I had a very good time, and I loved the night, because I could see many shows. I stayed in a great hotel, with a big swimming pool, and I had a lot of fun. Love, Tereza', 'When I was a child, people bought the newspaper, but now times have changed and now people buy less WC lesser newspaper PL newspapers , and you can now find a newspaper online. Not existed before smart phones WO Smartphones did not exist before PU , or tablets, or anything like that , there NS . There was no color TV PU , was in WC only black and white, much less remote, you had to stop and change the tv SP TV .', "Blue Cruise Alaska The ship wasn't new and it wasn't a cruise ship; in fact, it was a horrible, shabby old little boat.

In [23]:
# Define the validation and test set sizes
# NOTE(review): these constants overwrite the valid_size/test_size computed earlier, differ from
# the split sizes printed above (34017 validation / 37797 test), and are not referenced by any
# later cell visible here — confirm whether they are still needed.
valid_size = 40607
test_size = 4061

**Tokenization**

In [24]:
# Token budget per text: inputs are truncated to this length when tokenized, and the same value
# is the sequence length of the Keras Input layers built later in the notebook.
max_length = 50
# Pretrained 'roberta-base' tokenizer.
rtokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_texts(texts, tokenizer, max_length):
    """Tokenize a list of strings into fixed-length TensorFlow tensors.

    Args:
        texts: List of ``str`` to encode.
        tokenizer: Callable tokenizer accepting the ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        max_length: Target sequence length. Longer inputs are truncated and
            shorter ones are padded up to this length.

    Returns:
        Whatever ``tokenizer`` returns for the call (requested with
        ``return_tensors='tf'``).

    Raises:
        ValueError: If ``texts`` is not a list or contains a non-string element.
    """
    if not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
        raise ValueError("Input texts should be a list of strings.")
    # Pad to `max_length` instead of to the longest text in this call, so every split
    # gets the same sequence length regardless of which texts it contains.
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf',
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [25]:
def safe_tokenize(texts, tokenizer, max_length):
    """Tokenize ``texts`` via ``tokenize_texts``, reporting failures instead of raising.

    Prints a success message and returns the encodings when they contain
    ``'input_ids'``. If a ``ValueError`` occurs (including a missing
    ``'input_ids'`` entry), prints the error and returns ``None``.
    """
    try:
        result = tokenize_texts(texts, tokenizer, max_length)
        if 'input_ids' in result:
            print(f"Successfully tokenized {len(texts)} texts.")
            return result
        raise ValueError("Tokenization did not produce 'input_ids'.")
    except ValueError as err:
        print(f"Tokenization error: {err}")
    return None


In [26]:
# Tokenize each split with the RoBERTa tokenizer. list(...) ensures a list is passed, which is
# what tokenize_texts checks for; each result is None if tokenization raised a ValueError.
train_encodings = safe_tokenize(list(train_texts), rtokenizer, max_length)
valid_encodings = safe_tokenize(list(valid_texts), rtokenizer, max_length)
test_encodings = safe_tokenize(list(test_texts), rtokenizer, max_length)

Successfully tokenized 306153 texts.
Successfully tokenized 34017 texts.
Successfully tokenized 37797 texts.


In [27]:
# Stop here if any split failed to tokenize (safe_tokenize signals failure with None).
if any(enc is None for enc in (train_encodings, valid_encodings, test_encodings)):
    raise ValueError("Tokenization failed for one or more datasets.")

# Show the keys carried by each split's encodings.
print(
    f"Keys of train_encodings: {train_encodings.keys()}",
    f"Keys of valid_encodings: {valid_encodings.keys()}",
    f"Keys of test_encodings: {test_encodings.keys()}",
    sep='\n',
)

Keys of train_encodings: dict_keys(['input_ids', 'attention_mask'])
Keys of valid_encodings: dict_keys(['input_ids', 'attention_mask'])
Keys of test_encodings: dict_keys(['input_ids', 'attention_mask'])


In [28]:
# Convert to TensorFlow Datasets (batches of 16; only the training set is shuffled).
# NOTE(review): these three datasets are reassigned by a later "Convert to TensorFlow Datasets"
# cell, and the feature keys used here ('input_ids', 'attention_mask') differ from the
# Input-layer names of the model defined below ('input_ids_layer', 'attention_mask_layer').
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": train_encodings.input_ids, "attention_mask": train_encodings.attention_mask}, train_labels)).shuffle(len(train_texts)).batch(16).prefetch(tf.data.experimental.AUTOTUNE)
valid_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": valid_encodings.input_ids, "attention_mask": valid_encodings.attention_mask}, valid_labels)).batch(16).prefetch(tf.data.experimental.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": test_encodings.input_ids, "attention_mask": test_encodings.attention_mask}, test_labels)).batch(16).prefetch(tf.data.experimental.AUTOTUNE)

In [29]:
def adjust_keys(encodings):
    """Return a new dict exposing the encodings under the model's Input-layer names.

    ``encodings['input_ids']`` is stored under ``'input_ids_layer'`` and
    ``encodings['attention_mask']`` under ``'attention_mask_layer'``; any other
    entries are left out.
    """
    renamed = {}
    renamed['input_ids_layer'] = encodings['input_ids']
    renamed['attention_mask_layer'] = encodings['attention_mask']
    return renamed

In [31]:
# Apply adjust_keys to each encoding, producing plain dicts keyed by
# 'input_ids_layer' / 'attention_mask_layer'.
train_encodings_adjusted = adjust_keys(train_encodings)
valid_encodings_adjusted = adjust_keys(valid_encodings)
test_encodings_adjusted = adjust_keys(test_encodings)

# Print the adjusted keys of each split's encodings to confirm the renaming.
print(f"Adjusted keys of train_encodings: {train_encodings_adjusted.keys()}")
print(f"Adjusted keys of valid_encodings: {valid_encodings_adjusted.keys()}")
print(f"Adjusted keys of test_encodings: {test_encodings_adjusted.keys()}")


Adjusted keys of train_encodings: dict_keys(['input_ids_layer', 'attention_mask_layer'])
Adjusted keys of valid_encodings: dict_keys(['input_ids_layer', 'attention_mask_layer'])
Adjusted keys of test_encodings: dict_keys(['input_ids_layer', 'attention_mask_layer'])


In [32]:
# Convert to TensorFlow Datasets. The feature dicts are the adjust_keys() outputs, so their
# keys match the Input layer names of the model ('input_ids_layer', 'attention_mask_layer').
def _make_dataset(features, labels, batch_size=16, shuffle_buffer=None):
    """Build a batched, prefetched tf.data pipeline of (features, labels) pairs.

    Args:
        features: Dict of tensors sliced along their first axis.
        labels: Labels aligned with the first axis of ``features``.
        batch_size: Number of examples per batch.
        shuffle_buffer: If given, shuffle with this buffer size before batching.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle_buffer is not None:
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Only the training data is shuffled; the buffer spans the whole training set.
train_dataset = _make_dataset(train_encodings_adjusted, train_labels, shuffle_buffer=len(train_texts))
valid_dataset = _make_dataset(valid_encodings_adjusted, valid_labels)
test_dataset = _make_dataset(test_encodings_adjusted, test_labels)

In [33]:
# Check TensorFlow Dataset elements: print the shapes of one test batch.
# The loop targets use batch-specific names so the notebook-level `labels` list
# (the source of the train/test split) is not overwritten by a batch tensor.
for batch_inputs, batch_labels in test_dataset.take(1):
    print(f"Input IDs shape: {batch_inputs['input_ids_layer'].shape}")
    print(f"Attention Mask shape: {batch_inputs['attention_mask_layer'].shape}")
    print(f"Labels shape: {batch_labels.shape}")

Input IDs shape: (16, 50)
Attention Mask shape: (16, 50)
Labels shape: (16,)


**Train the Model**

In [44]:
def create_roberta_cl_model(model, num_classes=6, dropout=0.3, learning_rate=0.0001, seq_length=None):
    """Build and compile a Conv1D -> MaxPooling -> LSTM -> Dense classifier on a frozen encoder.

    Args:
        model: Encoder called with ``input_ids`` and ``attention_mask`` whose output
            exposes ``last_hidden_state``. It is frozen in place
            (``model.trainable = False``).
        num_classes: Number of units in the softmax output layer.
        dropout: Dropout rate applied after the pooling layer and after the LSTM.
        learning_rate: Learning rate of the Adam optimizer.
        seq_length: Number of tokens per input sequence. Defaults to the
            notebook-level ``max_length`` when omitted.

    Returns:
        A compiled ``tf.keras.Model`` with inputs named ``input_ids_layer`` and
        ``attention_mask_layer`` and a softmax output over ``num_classes`` classes.
    """
    # Freeze the encoder so only the layers added below are trained.
    model.trainable = False

    # Fall back to the tokenization length used by the rest of the notebook.
    if seq_length is None:
        seq_length = max_length

    input_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int64, name='input_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int64, name='attention_mask_layer')
    model_inputs = [input_ids, attention_mask]

    # The encoder's last_hidden_state output feeds the convolutional head.
    model_out = model(input_ids=input_ids, attention_mask=attention_mask)
    model_out = model_out.last_hidden_state

    conv = tf.keras.layers.Conv1D(filters=256, kernel_size=3, activation='relu')(model_out)
    conv = tf.keras.layers.MaxPooling1D(pool_size=2)(conv)
    conv = tf.keras.layers.Dropout(dropout)(conv)
    # return_sequences=False: the LSTM emits a single vector per example.
    lstm = tf.keras.layers.LSTM(units=256, return_sequences=False, return_state=False)(conv)
    lstm = tf.keras.layers.Dropout(dropout)(lstm)
    classification = tf.keras.layers.Dense(num_classes, activation='softmax', name='classification_layer')(lstm)

    classification_model = tf.keras.Model(inputs=model_inputs, outputs=[classification])

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=0.1)
    # NOTE(review): loss scaling is normally paired with a mixed_float16 policy, which is not
    # set in the visible cells — confirm this wrapper is intended.
    optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

    # The output layer already applies softmax, hence from_logits=False; labels are integer class ids.
    classification_model.compile(optimizer=optimizer,
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                                 metrics=['accuracy'])

    return classification_model



In [45]:
# Load the pretrained 'roberta-base' encoder and wrap it in the classification model
# (create_roberta_cl_model freezes the encoder it is given).
roberta_model = TFRobertaModel.from_pretrained('roberta-base')
roberta_cl_model = create_roberta_cl_model(model=roberta_model, num_classes=6)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [46]:
import gc
# Clear session and free memory before starting the training
# NOTE(review): this runs after roberta_cl_model was created in the previous cell — confirm that
# resetting Keras state at this point is intended rather than before the model is built.
tf.keras.backend.clear_session()
# Last expression of the cell, so its return value is what the notebook displays.
gc.collect()

34313

In [47]:
# Print the layer-by-layer summary of the classification model.
roberta_cl_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids_layer (InputLaye  [(None, 50)]                 0         []                            
 r)                                                                                               
                                                                                                  
 attention_mask_layer (Inpu  [(None, 50)]                 0         []                            
 tLayer)                                                                                          
                                                                                                  
 tf_roberta_model (TFRobert  TFBaseModelOutputWithPooli   1246456   ['input_ids_layer[0][0]',     
 aModel)                     ngAndCrossAttentions(last_   32         'attention_mask_layer[0][

In [48]:
# Disabled alternative training pipeline (batch size 32, no shuffling), kept for reference:
# dataset = tf.data.Dataset.from_tensor_slices(({"input_ids_layer": train_encodings.input_ids, "attention_mask_layer": train_encodings.attention_mask}, train_labels))
# dataset = dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

# Validation pipeline with batch size 32.
# NOTE(review): the fit call in the next cell passes `valid_dataset`, not `vdataset`, so this
# dataset is not used by the visible training code — confirm whether it is still needed.
vdataset = tf.data.Dataset.from_tensor_slices(({"input_ids_layer": valid_encodings.input_ids, "attention_mask_layer": valid_encodings.attention_mask}, valid_labels))
vdataset = vdataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Train the classification model for a single epoch, evaluating on the validation dataset;
# the returned History object is kept in roberta_cl_model_history.
roberta_cl_model_history = roberta_cl_model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=1
)

 1864/19135 [=>............................] - ETA: 14:24:31 - loss: 0.5815 - accuracy: 0.7949