In [1]:
from collections import Counter
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [5]:
from transformers import AutoTokenizer, TFAutoModel

In [7]:
df = pd.read_csv("../efcamdat.csv")

In [8]:
df.columns

Index(['writing_id', 'learner_id', 'learner_id_categorical', 'nationality',
       'l1', 'cefr', 'cefr_numeric', 'level', 'unit', 'topic_id_original',
       'topic_id_original_categorical', 'topic_id', 'topic_id_categorical',
       'text_number_per_learner_in_task', 'topic', 'secondary_topic',
       'topic_to_keep', 'date', 'time', 'grade', 'wordcount', 'mtld', 'text',
       'text_corrected'],
      dtype='object')

In [9]:
df.shape[0]

406062

In [10]:
text = df['text']
labels = df['cefr_numeric']

In [11]:
(train_texts, test_texts, train_labels, test_labels) = train_test_split(text, labels, test_size=.1, shuffle=True)

In [12]:
len(test_texts)

40607

In [13]:
test_texts.iloc[:1000]

291489    \n\t  Crime: House theft Time: Monday 22th, Ju...
389649    \n\t  Dear Mr. Harry Martin, I have long aspir...
208666    \n\t  My name is Ivan and I'm a programmer. I ...
105479    \n\t  Hugo's birthday, Hugo's party. Come to m...
26899     \n\t  Hello teacher, How are you? My name's Mi...
                                ...                        
158628    \n\t  There are five people in my family. I'm ...
390076    \n\t  The first property is a recently renovat...
178737    \n\t  Name: Willian  Age: 29  Birthday : 30 Ma...
255783    \n\t  My name is Amedeo Da Ros. I was born in ...
277702    \n\t  Monkeys are so beautiful.they live in th...
Name: text, Length: 1000, dtype: object

In [31]:
# valid_texts = train_texts.loc[:40607]
# valid_labels = train_labels.loc[:40607]
# train_texts = train_texts.loc[40607:]
# train_labels = train_labels.loc[40607:]

#smaller 
valid_texts_s = train_texts.iloc[:1000]
valid_labels_s = train_labels.iloc[:1000]
train_texts_s = train_texts.iloc[1000:11000]
train_labels_s = train_labels.iloc[1000:11000]
test_texts_s = test_texts.iloc[:1000]
test_labels_s = test_labels.iloc[:1000]

In [161]:
m = "intfloat/multilingual-e5-large-instruct"
tokenizer = AutoTokenizer.from_pretrained(m)
model = TFAutoModel.from_pretrained(m)

All PyTorch model weights were used when initializing TFXLMRobertaModel.

All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


In [162]:
max_length = 200

In [163]:
train_encodings = tokenizer(list(train_texts_s), truncation=True, padding=True, max_length=max_length, return_tensors='tf')
valid_encodings = tokenizer(list(valid_texts_s), truncation=True, padding=True, max_length=max_length, return_tensors='tf')
test_encodings = tokenizer(list(test_texts_s), truncation=True, padding=True, max_length=max_length, return_tensors='tf')

In [164]:
train_encodings.input_ids[:1]

<tf.Tensor: shape=(1, 200), dtype=int32, numpy=
array([[     0,  64672,  62163,   1257,     99,    483,     36,     25,
           238,  21135,      5,   4687,  60899,     47,   4488,     99,
          3569,   1837,      5,   4687,   1556,  40781,     99,    427,
            36,     25,    238,  21135,      5,  20414,  60899,     47,
          5368,     99,   2289,   1837,      5,    360,     70, 105216,
             4,   2412,  39544,     90,   1910,    136,   2412,  11301,
             7,  69686,      5,   4687,   1556,  94000,     99, 159968,
             5,   4687,  60899,     47,  11958,     99,    483,     36,
            25,    238,  21135,      5,      2,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,
             1, 

In [165]:
def create_multiclass_model(model,
                             num_classes = 5,
                             hidden_size = 128,
                             dropout=0.3,
                             learning_rate=0.001):
    """
    """
    model.traimable = True

    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids_layer')
    #token_type_ids = tf.keras.layers.Input(shape=(300,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask_layer')

    model_inputs = [input_ids, attention_mask]
    model_out = model(input_ids=input_ids, attention_mask=attention_mask)

    pooler_token = model_out[1]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(pooler_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)


    classification = tf.keras.layers.Dense(num_classes, activation='softmax',name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                                 metrics='accuracy')


    return classification_model

In [166]:
test_model = create_multiclass_model(model=model, num_classes=5)

In [167]:
test_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids_layer (InputLayer)   [(None, None)]       0           []                               
                                                                                                  
 attention_mask_layer (InputLay  [(None, None)]      0           []                               
 er)                                                                                              
                                                                                                  
 tfxlm_roberta_model (TFXLMRobe  TFBaseModelOutputWi  559890432  ['input_ids_layer[0][0]',        
 rtaModel)                      thPoolingAndCrossAt               'attention_mask_layer[0][0]']   
                                tentions(last_hidde                                         

In [168]:
test_model_history = test_model.fit([train_encodings.input_ids, train_encodings.attention_mask],
                                      train_labels_s,
                                      validation_data=([valid_encodings.input_ids, valid_encodings.attention_mask],
                                      valid_labels_s),
                                      batch_size=8,
                                      epochs=5)

Epoch 1/5


2024-06-24 20:44:35.791985: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


 144/1250 [==>...........................] - ETA: 3:49:07 - loss: 1.4807 - accuracy: 0.3872

KeyboardInterrupt: 

In [41]:
len(train_labels_s)

10000

In [115]:
import os 

df2 = pd.DataFrame(columns=[['text', 'labels']])
d = "OneStopEnglishCorpus/Texts-SeparatedByReadingLevel"

for level in os.listdir(d):
    try:
        for f in os.listdir(os.path.join(d, level)):
            try:
                temp = pd.read_csv(os.path.join(d, level, f), delimiter='\t', header=None, encoding='ISO-8859-1')
                temp.columns = [['text']]
                temp['labels'] = level
                df2 = pd.concat([df2, temp])
            except:
                print(f)
    except:
        print(d)

        

.DS_Store
OneStopEnglishCorpus/Texts-SeparatedByReadingLevel


In [102]:
# d = "OneStopEnglishCorpus/Texts-Together-OneCSVperFile"
# dfs = []
# for f in os.listdir(d):
#     file_path = os.path.join(d, f)
#     if os.path.isfile(file_path) and f.endswith('.csv'):
#         try:
#             temp = pd.read_csv(file_path, encoding='ISO-8859-1')
#             temp.columns = ["Elementary", "Intermediate", "Advanced"]
#             dfs.append(temp)
#         except:
#             print(f)

# df2 = pd.concat(dfs, ignore_index=True)

pd.read_csv(os.path.join("OneStopEnglishCorpus/Texts-SeparatedByReadingLevel/Ele-Txt/Amsterdam-ele.txt"), delimiter='\t', header=None)

0    To tourists, Amsterdam still seems very libera...
1    The Mayor, Eberhard van der Laan, says his new...
2    Bartho Boer, a spokesman for the Mayor, says t...
3    People found guilty of violent harassment will...
4    One Dutch newspaper wrote that in the 19th cen...
5    They are “scum houses” not scum villages, says...
6    Police will watch the temporary accommodation,...
Name: 0, dtype: object

In [109]:
for level in os.listdir(d):
    for f in os.listdir(os.path.join(d, level)):
        print(f)

Spain-int.txt
Royal Baby-int.txt
WNL The lightweight-int.txt
Arctic mapping-int.txt
WNL Mystery Shopper-int.txt
WNL Star Wars-int.txt
WNL Four new elements-int.txt
Int-Txt
WNL Morocco-int.txt
Zero Hours-int.txt
Teff-int.txt
WNL Scientist-int.txt
WNL Are MOOCs the future-int.txt
WNL Bangladeshi organization-int.txt
Coal to challenge oil-int.txt
WNL The age of music-int.txt
WNL Google boss-int.txt
Exercise-int.txt
WNL On the trail-int.txt
WNL SeaWorld-int.txt
WNL Novel way-int.txt
Malala-int.txt
Lie detector-int.txt
WNL shark-int.txt
Syria-int.txt
Japan-int.txt
WNL New Orleans-int.txt
Pope-int.txt
WNL Google car-int.txt
WNL First high resolution images-int.txt
Billionaires-int.txt
WNL Goodbye fish and chips-int.txt
.DS_Store
WNL Calais Migrants-int.txt
WNL Extreme heat-int.txt
WNL School-int.txt
WNL Spain Robin Hood-int.txt
WNL Revolution-int.txt
WNL Vienna-int.txt
Japan menu-int.txt
Crowdfunding-int.txt
Rats-int.txt
Richard III-int.txt
Organs-int.txt
WNL Switched Babies-int.txt
WNL Scar

NotADirectoryError: [Errno 20] Not a directory: 'OneStopEnglishCorpus/Texts-SeparatedByReadingLevel/.DS_Store'

In [171]:
del model

# Trying Roberta + CNN-LSTM

In [15]:
from transformers import RobertaTokenizer, TFRobertaModel

In [32]:
rtokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [33]:
max_length = 200

In [34]:
train_encodings = rtokenizer(list(train_texts_s), truncation=True, padding=True, max_length=max_length, return_tensors='tf')
valid_encodings = rtokenizer(list(valid_texts_s), truncation=True, padding=True, max_length=max_length, return_tensors='tf')
test_encodings = rtokenizer(list(test_texts_s), truncation=True, padding=True, max_length=max_length, return_tensors='tf')

In [75]:
def create_roberta_model(model,
                         num_classes = 5,
                         dropout=0.3,
                         learning_rate=0.0001):
    """
    """
    model.traimable = True

    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids_layer')
    #token_type_ids = tf.keras.layers.Input(shape=(300,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask_layer')

    model_inputs = [input_ids, attention_mask]
    model_out = model(input_ids=input_ids, attention_mask=attention_mask)

    model_out = model_out[0]

    conv = tf.keras.layers.Conv1D(filters=256, kernel_size=3, activation='relu')(model_out)
    conv = tf.keras.layers.MaxPooling1D(pool_size=2)(conv)
    conv = tf.keras.layers.Dropout(dropout)(conv)
    lstm = tf.keras.layers.LSTM(units=256, return_sequences=False, return_state=False)(conv)
    lstm = tf.keras.layers.Dropout(dropout)(lstm)

    classification = tf.keras.layers.Dense(num_classes, activation='softmax',name='classification_layer')(lstm)

    classification_model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                                 metrics='accuracy')

    return classification_model

In [76]:
test_model2 = create_roberta_model(model=roberta_model, num_classes=5)

In [77]:
test_model2.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids_layer (InputLayer)   [(None, None)]       0           []                               
                                                                                                  
 attention_mask_layer (InputLay  [(None, None)]      0           []                               
 er)                                                                                              
                                                                                                  
 tf_roberta_model_1 (TFRobertaM  TFBaseModelOutputWi  124645632  ['input_ids_layer[0][0]',        
 odel)                          thPoolingAndCrossAt               'attention_mask_layer[0][0]']   
                                tentions(last_hidde                                         

In [78]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [None]:
test_model_history = test_model2.fit([train_encodings.input_ids, train_encodings.attention_mask],
                                      train_labels_s,
                                      validation_data=([valid_encodings.input_ids, valid_encodings.attention_mask],
                                      valid_labels_s),
                                      batch_size=8,
                                      epochs=5)

Epoch 1/5
