In [2]:
# Mount Google Drive at /content/drive so files stored under "My Drive" can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
pip install textstat

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.4


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, TFRobertaModel
import tensorflow as tf
from sklearn.metrics import classification_report, accuracy_score
import gc
import textstat

# Path to the source CSV on the mounted Google Drive.
file_path = '/content/drive/My Drive/266 Project/efcamdat_sub.csv'

# Read the whole file into a DataFrame.
df = pd.read_csv(file_path)

# Show the first rows to confirm the file parsed as expected.
print(df.head())

       id  level  unit  learner_id learner_nationality  grade  \
0  679604      7     3      114335                  br     90   
1  151196      9     2      136139                  sa     94   
2  117084      9     4       34715                  br     88   
3  113857      7     6       90269                  fr     90   
4   22083      9     3       48465                  br     94   

                      date  topic_id  \
0  2013-11-09 20:50:31.707        51   
1  2012-09-25 06:01:08.117        66   
2  2011-08-28 08:01:15.677        68   
3  2011-07-31 14:51:22.547        54   
4  2011-08-31 16:41:04.210        67   

                                                text  cefr_numeric  \
0  From:l AS xxx@hotmail.com To: AS xxx@IXW.corpo...             3   
1  I am so glad to receive this email from you. A...             3   
2  Hi Fun Skydive, so I give up of my idea. I und...             3   
3  Dear James, Some serious problems have been br...             3   
4  Dear Sue, Thank

**Check data**

In [5]:
# List the column names of the loaded frame.
df.columns

Index(['id', 'level', 'unit', 'learner_id', 'learner_nationality', 'grade',
       'date', 'topic_id', 'text', 'cefr_numeric', 'cefr_grouped'],
      dtype='object')

In [6]:
# Number of rows in the loaded frame.
len(df)

377967

In [7]:
# Build the `labels` column by shifting `cefr_numeric` down by one.
# A vectorized subtraction replaces the per-row Python lambda.
df['labels'] = df['cefr_numeric'] - 1

In [8]:
# Keep only the rows whose label is one of the six expected classes (0-5).
valid_label_mask = df['labels'].isin(range(6))
df = df[valid_label_mask]

In [9]:
# Confirm which label values remain once the filter has been applied.
remaining_labels = df['labels'].unique()
print("Unique label values after filtering:", remaining_labels)

Unique label values after filtering: [2 1 0 3 4 5]


In [10]:
# Pull out the model inputs (raw text) and the targets (labels) as separate Series.
text, labels = df['text'], df['labels']

In [11]:
# Absolute hold-out sizes, derived from the current number of rows.
n_rows = len(df)
test_size = int(n_rows * 0.1)   # 10% for testing
valid_size = int(n_rows * 0.2)  # 20% for validation

In [12]:
# Both splits shuffle with the same fixed seed so the partition is repeatable.
split_options = {"shuffle": True, "random_state": 42}

# First carve the test set out of the full data ...
train_texts, test_texts, train_labels, test_labels = train_test_split(
    text, labels, test_size=test_size, **split_options
)

# ... then take the validation set out of what remains for training.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_texts, train_labels, test_size=valid_size, **split_options
)

In [13]:
# Report the distinct label values present in each split.
for message, split_labels in (
    ("Unique label values in training set:", train_labels),
    ("Unique label values in validation set:", valid_labels),
    ("Unique label values in test set:", test_labels),
):
    print(message, split_labels.unique())

Unique label values in training set: [2 0 1 3 4 5]
Unique label values in validation set: [2 0 3 1 4 5]
Unique label values in test set: [5 1 2 3 0 4]


**Divide the data into training, validation, and test sets**

In [14]:
# Number of examples in the test split.
len(test_texts)

37796

In [15]:
# Cap every split at the same number of examples so later steps run faster.
SUBSET_SIZE = 37796

train_texts, train_labels = train_texts[:SUBSET_SIZE], train_labels[:SUBSET_SIZE]
valid_texts, valid_labels = valid_texts[:SUBSET_SIZE], valid_labels[:SUBSET_SIZE]
test_texts, test_labels = test_texts[:SUBSET_SIZE], test_labels[:SUBSET_SIZE]

In [16]:
def extract_smog_features(texts):
    """Compute one SMOG index score per text.

    Args:
        texts: Iterable of texts to score.

    Returns:
        list: The ``textstat.smog_index`` result for each text, in input order.
    """
    scores = []
    for document in texts:
        scores.append(textstat.smog_index(document))
    return scores


In [17]:
# Score each split with the same extractor (train, validation, test order).
train_smog_scores, valid_smog_scores, test_smog_scores = (
    extract_smog_features(split_texts)
    for split_texts in (train_texts, valid_texts, test_texts)
)

In [18]:
# Tokenize the data
# `max_length` is the token-sequence length handed to the tokenizer helpers and
# used as the shape of the model's token input layers.
max_length = 50
# Load the tokenizer files for the `roberta-base` checkpoint.
rtokenizer = RobertaTokenizer.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [19]:
def tokenize_texts(texts, tokenizer, max_length):
    """Tokenize a list of strings into fixed-width TensorFlow tensors.

    Args:
        texts: List of ``str`` to encode.
        tokenizer: Callable tokenizer accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        max_length: Number of tokens every encoded sequence is truncated or
            padded to.

    Returns:
        Whatever ``tokenizer`` returns for the given arguments.

    Raises:
        ValueError: If ``texts`` is not a list or contains a non-string item.
    """
    if not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
        raise ValueError("Input texts should be a list of strings.")
    # padding='max_length' (instead of True, which pads only to the longest text
    # in this call) keeps the output width equal to `max_length` regardless of
    # which texts are passed in.
    return tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='tf')

In [20]:
def safe_tokenize(texts, tokenizer, max_length):
    """Tokenize ``texts`` and report, rather than raise, ``ValueError`` failures.

    Args:
        texts: Texts to encode; forwarded to ``tokenize_texts``.
        tokenizer: Tokenizer forwarded to ``tokenize_texts``.
        max_length: Sequence length forwarded to ``tokenize_texts``.

    Returns:
        The encodings on success, or ``None`` when a ``ValueError`` was raised
        (including when the encodings lack an ``'input_ids'`` entry).
    """
    result = None
    try:
        candidate = tokenize_texts(texts, tokenizer, max_length)
        if 'input_ids' in candidate:
            print(f"Successfully tokenized {len(texts)} texts.")
            result = candidate
        else:
            # Raised inside the try so it is reported by the handler below.
            raise ValueError("Tokenization did not produce 'input_ids'.")
    except ValueError as e:
        print(f"Tokenization error: {e}")
    return result


In [21]:
def _encode_split(split_texts):
    """Tokenize one split with the notebook's tokenizer and sequence length."""
    return safe_tokenize(list(split_texts), rtokenizer, max_length)


# Encode each split in turn; a failed split comes back as None.
train_encodings = _encode_split(train_texts)
valid_encodings = _encode_split(valid_texts)
test_encodings = _encode_split(test_texts)

Successfully tokenized 37796 texts.
Successfully tokenized 37796 texts.
Successfully tokenized 37796 texts.


In [22]:
# Stop here if any split failed to tokenize (signalled by a None result).
all_encodings = (train_encodings, valid_encodings, test_encodings)
if any(encoding is None for encoding in all_encodings):
    raise ValueError("Tokenization failed for one or more datasets.")

# Show which fields each set of encodings carries.
print(f"Keys of train_encodings: {train_encodings.keys()}")
print(f"Keys of valid_encodings: {valid_encodings.keys()}")
print(f"Keys of test_encodings: {test_encodings.keys()}")

Keys of train_encodings: dict_keys(['input_ids', 'attention_mask'])
Keys of valid_encodings: dict_keys(['input_ids', 'attention_mask'])
Keys of test_encodings: dict_keys(['input_ids', 'attention_mask'])


In [23]:
# Convert to TensorFlow Datasets with SMOG Index scores
train_dataset = tf.data.Dataset.from_tensor_slices((
    {"input_ids_layer": train_encodings.input_ids, "attention_mask_layer": train_encodings.attention_mask, "smog_score": train_smog_scores},
    train_labels
)).shuffle(len(train_texts)).batch(16).prefetch(tf.data.experimental.AUTOTUNE)

valid_dataset = tf.data.Dataset.from_tensor_slices((
    {"input_ids_layer": valid_encodings.input_ids, "attention_mask_layer": valid_encodings.attention_mask, "smog_score": valid_smog_scores},
    valid_labels
)).batch(16).prefetch(tf.data.experimental.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices((
    {"input_ids_layer": test_encodings.input_ids, "attention_mask_layer": test_encodings.attention_mask, "smog_score": test_smog_scores},
    test_labels
)).batch(16).prefetch(tf.data.experimental.AUTOTUNE)

In [24]:
# Check TensorFlow Dataset Elements
# The loop targets are named `batch_inputs` / `batch_labels` so the notebook-level
# `labels` Series is not overwritten by a batch tensor when this cell runs.
for batch_inputs, batch_labels in test_dataset.take(1):
    print(f"Input IDs shape: {batch_inputs['input_ids_layer'].shape}")
    print(f"Attention Mask shape: {batch_inputs['attention_mask_layer'].shape}")
    print(f"SMOG Score shape: {batch_inputs['smog_score'].shape}")
    print(f"Labels shape: {batch_labels.shape}")
    print(f"Label values: {batch_labels.numpy()}")

Input IDs shape: (16, 50)
Attention Mask shape: (16, 50)
SMOG Score shape: (16,)
Labels shape: (16,)
Label values: [5 1 1 2 3 2 1 0 0 3 0 0 4 2 2 2]


**Model architecture**

In [25]:
# Define the model
def create_roberta_cl_model(model, num_classes=6, dropout=0.3, learning_rate=0.0001, sequence_length=None):
    """Build and compile a classifier: frozen encoder -> Conv1D -> LSTM, plus a SMOG side input.

    Args:
        model: Encoder called as ``model(input_ids=..., attention_mask=...)`` whose
            output exposes ``last_hidden_state``. NOTE: this function sets
            ``model.trainable = False`` on the object that is passed in.
        num_classes: Number of units in the softmax output layer.
        dropout: Dropout rate applied after the pooling layer and after the LSTM.
        learning_rate: Learning rate for the Adam optimizer.
        sequence_length: Number of tokens per example for the two token inputs.
            When ``None`` (the default) the notebook-level ``max_length`` is used.

    Returns:
        tf.keras.Model: Compiled model taking the inputs named "input_ids_layer",
        "attention_mask_layer" and "smog_score" and producing ``num_classes``
        softmax probabilities.
    """
    if sequence_length is None:
        # Fall back to the notebook-level setting so existing calls behave as before.
        sequence_length = max_length

    # Freeze the encoder; only the layers added below are trained.
    model.trainable = False

    input_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int64, name='input_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int64, name='attention_mask_layer')
    smog_score = tf.keras.layers.Input(shape=(1,), dtype=tf.float32, name='smog_score')

    model_inputs = [input_ids, attention_mask, smog_score]
    model_out = model(input_ids=input_ids, attention_mask=attention_mask)
    # Per-token hidden states are fed to the convolution, not the pooled output.
    model_out = model_out.last_hidden_state

    conv = tf.keras.layers.Conv1D(filters=256, kernel_size=3, activation='relu')(model_out)
    conv = tf.keras.layers.MaxPooling1D(pool_size=2)(conv)
    conv = tf.keras.layers.Dropout(dropout)(conv)
    # return_sequences=False: keep only the final LSTM output per example.
    lstm = tf.keras.layers.LSTM(units=256, return_sequences=False, return_state=False)(conv)
    lstm = tf.keras.layers.Dropout(dropout)(lstm)

    # Concatenate LSTM output with the SMOG score
    concatenated = tf.keras.layers.Concatenate()([lstm, smog_score])

    classification = tf.keras.layers.Dense(num_classes, activation='softmax', name='classification_layer')(concatenated)

    classification_model = tf.keras.Model(inputs=model_inputs, outputs=classification)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=0.1)
    # NOTE(review): no mixed-precision policy is set in the visible cells; confirm
    # that the loss-scaling wrapper is still wanted.
    optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

    # from_logits=False because the output layer already applies softmax.
    classification_model.compile(optimizer=optimizer,
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                                 metrics=['accuracy'])

    return classification_model

In [26]:
# Load the `roberta-base` encoder, then build the six-class classifier around it.
roberta_model = TFRobertaModel.from_pretrained('roberta-base')
roberta_cl_model = create_roberta_cl_model(model=roberta_model, num_classes=6)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.embeddings.position_ids', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [27]:
# Clear session and free memory before starting the training
# NOTE(review): this runs after `roberta_cl_model` has already been built; confirm
# that resetting Keras state here is intended, rather than before model creation.
tf.keras.backend.clear_session()
# Last expression of the cell, so its return value is displayed as the cell output.
gc.collect()

23318

In [28]:
# Display model summary
# Prints each layer with its output shape, parameter count and inbound connections.
roberta_cl_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids_layer (InputLaye  [(None, 50)]                 0         []                            
 r)                                                                                               
                                                                                                  
 attention_mask_layer (Inpu  [(None, 50)]                 0         []                            
 tLayer)                                                                                          
                                                                                                  
 tf_roberta_model (TFRobert  TFBaseModelOutputWithPooli   1246456   ['input_ids_layer[0][0]',     
 aModel)                     ngAndCrossAttentions(last_   32         'attention_mask_layer[0][

In [29]:
# Validation dataset built from token ids and attention mask only, in batches of 32.
# NOTE(review): unlike `valid_dataset`, this carries no "smog_score" feature, and the
# training cell passes `valid_dataset` — confirm whether `vdataset` is still needed.

vdataset = tf.data.Dataset.from_tensor_slices(({"input_ids_layer": valid_encodings.input_ids, "attention_mask_layer": valid_encodings.attention_mask}, valid_labels))
vdataset = vdataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

In [30]:
# Train the model
# A single epoch over `train_dataset`, with `valid_dataset` used for validation metrics.
# The returned History object is kept in `roberta_cl_model_history`.
roberta_cl_model_history = roberta_cl_model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=1)



In [None]:
# Predict and evaluate the model
# `preds` holds the model's softmax outputs for every test example.
preds = roberta_cl_model.predict(test_dataset)
# Pick the index of the highest-scoring class along the last axis.
pred_labels = tf.argmax(preds, axis=-1)
# Compare predicted classes against the true test labels.
print(classification_report(test_labels, pred_labels))

