In [1]:
# Install the Hugging Face libraries used by this notebook. Versions are not
# pinned; the recorded output of this run shows transformers 4.19.2 being resolved.
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 2.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data and output paths used in this
# notebook point into drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk.tokenize import sent_tokenize
from scipy.special import softmax
from transformers import AutoTokenizer, DefaultDataCollator, TFAutoModelForSequenceClassification
from datasets import Dataset
# Download the Punkt models that nltk's sent_tokenize relies on.
nltk.download('punkt')


# Identifier of the pretrained checkpoint loaded by the tokenizer and the model.
model_name = "dbmdz/bert-base-italian-xxl-cased"
# Prefix for the saved model; the training cell saves to model_path + model_name.
model_path = "drive/MyDrive/"
# Input/output files. NOTE(review): all paths are relative, so they only line up
# with the '/content/drive' mount when the working directory is /content — confirm.
train_data_path = "drive/MyDrive/train_data.csv"
test_data_path = "drive/MyDrive/test_data.txt"
predictions_path = "drive/MyDrive/predictions.csv"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# Load the article-level training set and expand it to sentence level: every
# sentence produced by sent_tokenize inherits the label of its source article.
train_data = pd.read_csv(train_data_path)
X_train = train_data['text']
y_train = train_data['label'].astype(int)

# One (sentence, label) pair per tokenized sentence, in article order.
sentence_rows = [
    (sentence, y_train[position])
    for position, article in enumerate(X_train)
    for sentence in sent_tokenize(article)
]

sent_X_train = pd.DataFrame([text for text, _ in sentence_rows])
sent_y_train = pd.DataFrame([label for _, label in sentence_rows])

# Side-by-side frame with the column names the tokenization and TF conversion expect.
train_df = pd.concat([sent_X_train, sent_y_train], axis=1)
train_df.columns = ['text', 'label']
train_dataset = Dataset.from_pandas(train_df)

In [5]:
# Preview the sentence-level training frame (pandas abbreviates the middle rows).
train_df

Unnamed: 0,text,label
0,'l è 'l nòm 'd un domìni genèric.,0
1,Al funsiòuna da 'l 30 'd utóber dal 2016.,0
2,'l è 'l nòm 'd un domìni genèric.,0
3,Al funsiòuna da 'l 30 'd utóber dal 2016.,0
4,'l è 'l nòm 'd un domìni genèric.,0
...,...,...
696706,"A pustis, colat dae sas poesias a sas cummèdias.",10
696707,Sa prima cummèdia sua est Sa rebelliòni de is...,10
696708,In su 2010 l'ant torrada a rapresentare.,10
696709,Sa de duas cummèdia est Sa littra de mariedda...,10


In [5]:
# Integer class id for each dialect tag. The test-file loader looks up the first
# tab-separated field of every line in this table.
# NOTE(review): ids presumably mirror the 'label' encoding in train_data.csv — confirm.
dial_label = {
    'EML': 0,
    'NAP': 1,
    'PMS': 2,
    'FUR': 3,
    'LLD': 4,
    'LIJ': 5,
    'LMO': 6,
    'ROA_TARA': 7,
    'SCN': 8, 
    'VEC': 9,
    'SC': 10
}

# Parse the tab-separated test file into [text, label_id] rows. A line with two
# fields is "<tag>\t<text>"; a line with three fields has its second and third
# fields concatenated (no separator) to form the text. Lines with any other
# number of fields are skipped.
test_rows = []
with open(test_data_path, 'r', encoding='utf-8') as handle:
    for raw_line in handle:
        fields = raw_line.rstrip().split("\t")
        if len(fields) not in (2, 3):
            continue
        tag, text_parts = fields[0], fields[1:]
        test_rows.append(["".join(text_parts), dial_label[tag]])

test_data = pd.DataFrame(test_rows)
X_test, y_test = test_data.iloc[:, 0], test_data.iloc[:, 1]
# Same two columns, renamed to what the tokenization step expects.
test_df = test_data.rename(columns={0: 'text', 1: 'label'})
test_dataset = Dataset.from_pandas(test_df)

In [15]:
# Number of single-space-separated tokens in each training article.
word_counts = [len(article.split(" ")) for article in X_train]

longest, shortest = max(word_counts), min(word_counts)
mean_length = sum(word_counts) / len(word_counts)

print(longest)
print(shortest)
print(mean_length)

9105
4
48.61321081760083


In [6]:
# Load the tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Every example is padded or truncated to exactly this many tokens.
MAX_SEQ_LENGTH = 50

def tokenize_function(sentence):
    """Tokenize a batch of examples into fixed-length encodings.

    Args:
        sentence: Batch passed in by ``Dataset.map(..., batched=True)``; its
            ``'text'`` entry holds the strings to encode.

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to ``MAX_SEQ_LENGTH`` tokens.
    """
    # padding="max_length" rather than padding=True: inside a batched map,
    # padding=True pads only to the longest sequence of that map batch, so rows
    # coming from different map batches can have different lengths and cannot be
    # stacked into one tensor when the dataset is batched later on.
    return tokenizer(
        sentence['text'],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/230k [00:00<?, ?B/s]

  0%|          | 0/697 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [17]:
# Collator that turns lists of tokenized examples into TensorFlow tensors.
data_collator = DefaultDataCollator(return_tensors="tf")

# Settings shared by the training and test pipelines.
feature_columns = ("attention_mask", "input_ids", "token_type_ids")
shared_kwargs = dict(collate_fn=data_collator, batch_size=256)

# Training batches are shuffled and carry the label column.
tf_train_dataset = train_dataset.to_tf_dataset(
    columns=list(feature_columns),
    label_cols=["label"],
    shuffle=True,
    **shared_kwargs,
)

# Test batches keep their original order and carry no labels.
tf_test_dataset = test_dataset.to_tf_dataset(
    columns=list(feature_columns),
    shuffle=False,
    **shared_kwargs,
)

In [19]:
# Pretrained encoder plus a newly initialised classification head with 11 outputs
# (the dialect label table in this notebook has 11 entries, ids 0-10).
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=11)

model.compile(
    # Fine-tuning needs a small step size: Adam's default learning rate (1e-3) is
    # far too large for updating pretrained BERT weights.
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    # The classification head returns unnormalised logits, hence from_logits=True.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.metrics.SparseCategoricalAccuracy()],
)

# Single pass over the training data; verbose=2 prints one line per epoch.
model.fit(tf_train_dataset, epochs=1, verbose=2)

# The checkpoint name is appended to model_path to form the output directory.
model.save_pretrained(model_path+model_name)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-italian-xxl-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


UnknownError: ignored

In [None]:
# Element 0 of the prediction output holds the per-class scores for each test example.
logits = model.predict(tf_test_dataset)[0]
# Row-wise softmax, then the index of the most probable class for every example.
probabilities = softmax(np.asarray(logits), axis=-1)
y_pred = np.argmax(probabilities, axis=-1)

def write_output(file_name, Y):
  """Write each item of ``Y`` to ``file_name``, one item per line.

  Args:
    file_name: Path of the file to create or overwrite.
    Y: Iterable of values; each one is converted with ``str`` before writing.
  """
  # The context manager closes the file even if a write fails part-way, and the
  # explicit encoding keeps the output independent of the platform default.
  with open(file_name, "w", encoding="utf-8") as f:
    f.writelines(str(y) + "\n" for y in Y)

# Save the predicted class ids to predictions_path, one id per line.
write_output(predictions_path, y_pred)