In [1]:
!pip install transformers
!pip install datasets



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk.tokenize import sent_tokenize
from scipy.special import softmax
from transformers import AutoTokenizer, DefaultDataCollator, TFAutoModelForSequenceClassification
from datasets import Dataset
nltk.download('punkt')


model_name = "dbmdz/bert-base-italian-xxl-cased"
model_path = "drive/MyDrive/"
train_data_path = "drive/MyDrive/train_data.csv"
test_data_path = "drive/MyDrive/test_data.txt"
predictions_path = "drive/MyDrive/predictions.csv"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
train_data = pd.read_csv(train_data_path)
X_train = train_data['text']
sent_X_train = [np.array(sent_tokenize(x)) for x in X_train]

y_train = train_data['label'].astype(int)
sent_y_train = [np.full(len(x), y_train[i]) for i, x in enumerate(X_train)]

sent_X_train = pd.DataFrame(np.squeeze((np.array(sent_X_train))))
sent_y_train = pd.DataFrame(np.squeeze((np.array(sent_y_train))))

train_df = pd.concat([X_train, y_train], axis = 1)
train_df.columns = ['text', 'label']
train_dataset = Dataset.from_pandas(train_df)

  
  if __name__ == '__main__':


In [5]:
dial_label = {
    'EML': 0,
    'NAP': 1,
    'PMS': 2,
    'FUR': 3,
    'LLD': 4,
    'LIJ': 5,
    'LMO': 6,
    'ROA_TARA': 7,
    'SCN': 8, 
    'VEC': 9,
    'SC': 10
}

test_data = []
with open(test_data_path, 'r', encoding='utf-8') as f:
     for line in f:
        sample = line.rstrip().split("\t")
        if len(sample)==2:
          label, clean = sample[0], sample[1]
          label = dial_label[label]
          test_data.append([clean, label])
        elif len(sample)==3:
          label, clean = sample[0], sample[1]+sample[2]
          label = dial_label[label]
          test_data.append([clean, label])

test_data = pd.DataFrame(test_data)
X_test = test_data.iloc[:, 0]
y_test = test_data.iloc[:, 1]
test_df = pd.concat([X_test, y_test], axis = 1)
test_df.columns = ['text', 'label']
test_dataset = Dataset.from_pandas(test_df)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(sentence):
    return tokenizer(sentence['text'], padding='max_length', truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/230k [00:00<?, ?B/s]

  0%|          | 0/246 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [9]:
data_collator = DefaultDataCollator(return_tensors="tf")

tf_train_dataset = train_dataset.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=128,
)

tf_test_dataset = test_dataset.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=128,
)

In [10]:
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=11)

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

model.fit(tf_train_dataset,
            epochs=1,
            verbose=2,
            )
  
model.save_pretrained(model_path+model_name)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-italian-xxl-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ResourceExhaustedError: ignored

In [None]:
y_pred = model.predict(tf_test_dataset)[0]
y_pred = list(map(lambda x : softmax(x), y_pred))
y_pred = np.argmax(np.array(y_pred), axis=-1)

def write_output(file_name, Y):
  f  = open(file_name, "w")
  for y in Y:
    f.write(str(y) + "\n")
  f.close()

write_output(predictions_path, y_pred)