## Global Imports and ENV setup

In [None]:
# Colab helpers: `drive` is used to mount Google Drive, `userdata` to look up stored values by key.
from google.colab import drive, userdata

# Mount Google Drive at /content/drive; the dataset and output paths used below live under it.
drive.mount('/content/drive')
# GitHub credentials; the clone cell interpolates both values into its URL.
github_token = userdata.get('github_token')
github_username = userdata.get('github_username')

In [None]:
# Clone the CharBERT repository, authenticating with the username/token read in the setup cell.
!git clone https://$github_username:$github_token@github.com/developer-sidani/CharBERT.git

In [None]:
# Install the Python dependencies listed by the cloned CharBERT repository.
!pip install -r /content/CharBERT/requirements.txt

## Download Dataset

In [None]:
import wget
import os
import datasets
import pandas as pd
import random

# Report whether a dataset directory is already present, creating it when it is not
def check_and_create_directory(path):
    """Ensure ``path`` exists and tell the caller whether it was already there.

    Returns:
        True when ``path`` existed before the call (a notice is printed and
        nothing is created); False when the directory had to be created.
    """
    already_present = os.path.exists(path)
    if not already_present:
        os.makedirs(path, exist_ok=True)
        return False
    print(f"Dataset already exists at {path}.")
    return True

# Fetch a remote file unless the destination file is already on disk
def download_file(url, output_path):
    """Download ``url`` to ``output_path`` with ``wget``; do nothing if the file exists."""
    if os.path.exists(output_path):
        return
    wget.download(url, output_path)
# 1. Download CoNLL-2003 dataset
def download_conll2003():
    """Download the English CoNLL-2003 splits into the Drive data directory.

    Does nothing when the dataset directory already exists. Otherwise the
    train/valid/test files are fetched from the BERT-NER-CoNLL repository.
    """
    # The NER cells in this notebook pass
    # --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003, so the files must be
    # written under that same "NLP/data" root (the previous "MyDrive/data" root
    # was never read by any training cell).
    path = '/content/drive/MyDrive/NLP/data/CoNLL2003/'
    if check_and_create_directory(path):
        return

    base_url = "https://raw.githubusercontent.com/chnsh/BERT-NER-CoNLL/master/data/"
    # (remote file, local file). valid.txt is deliberately stored under two
    # names, val.txt and dev.txt, exactly as before.
    # NOTE(review): presumably different consumers expect different names — confirm.
    targets = (
        ("train.txt", "train.txt"),
        ("valid.txt", "val.txt"),
        ("valid.txt", "dev.txt"),
        ("test.txt", "test.txt"),
    )
    for remote_name, local_name in targets:
        download_file(base_url + remote_name, os.path.join(path, local_name))
    print("CoNLL-2003 dataset downloaded.")

# 2. Download and process Italian CoNLL-2003 dataset
def download_conll2003_ita():
    """Download the Italian wikineural NER data and convert it to "token tag" text files.

    The three ``.conllu`` splits are fetched into the current working
    directory, converted to ``train.txt`` / ``val.txt`` / ``test.txt`` inside
    the dataset directory, and the downloaded ``.conllu`` files are removed.
    Does nothing when the dataset directory already exists.
    """
    # The Italian NER cells read
    # --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003_ita, so the converted
    # files are written under that same "NLP/data" root.
    path = '/content/drive/MyDrive/NLP/data/CoNLL2003_ita/'
    if check_and_create_directory(path):
        return

    def conllu_to_txt(conllu_file, n_stop=100_000):
        """Convert the first ``n_stop`` lines of ``conllu_file`` into ``<name>.txt`` under ``path``."""
        name_new_file = conllu_file.split(".")[0] + ".txt"
        # Write UTF-8 explicitly: the input is decoded as UTF-8, and relying on
        # the platform default encoding for the output can fail on accented text.
        with open(os.path.join(path, name_new_file), "w", encoding="utf-8") as f_out, \
                open(conllu_file, 'r', encoding='utf-8') as f_in:
            for i, line in enumerate(f_in):
                if i >= n_stop:
                    break
                fields = line.split("\t")
                # Only three-column lines are converted; everything else is skipped.
                if len(fields) != 3:
                    continue
                index, token, tag = fields
                # A first column of '0' gets a blank line written before it
                # (presumably the first token of a sentence).
                if index == '0':
                    f_out.write("\n")
                # `tag` still ends with the line's newline, so none is added here.
                f_out.write(token + " " + tag)

    base_url = "https://raw.githubusercontent.com/Babelscape/wikineural/master/data/wikineural/it/"
    # Per-split cap on the number of input lines that get converted.
    line_limits = {
        "train.conllu": 200_000,
        "val.conllu": 100_000,
        "test.conllu": 100_000,
    }

    for conllu_name in line_limits:
        download_file(base_url + conllu_name, conllu_name)
    for conllu_name, limit in line_limits.items():
        conllu_to_txt(conllu_name, n_stop=limit)
    # The raw downloads live in the working directory and are no longer needed.
    for conllu_name in line_limits:
        os.remove(conllu_name)
    print("CoNLL-2003 Italian dataset processed.")

# 3. Download and process Wikipedia datasets
def download_wikipedia():
    """Build the English / Italian Wikipedia text files used for MLM fine-tuning.

    Does nothing when the dataset directory already exists. Otherwise it:
      1. takes the first 12000 Simple-English and 2000 Italian articles and
         stores train/val/test slices as temporary CSV files;
      2. writes the ``text`` column of the first 1/20 of each CSV's rows to a
         ``.txt`` file with the same base name;
      3. builds shuffled English+Italian ``wikil_eng_wiki_ita_*.txt`` files;
      4. deletes the temporary CSV files.
    """
    # The fine-tuning cells read /content/drive/MyDrive/NLP/data/wiki/..., so the
    # files are written under that same "NLP/data" root.
    path = '/content/drive/MyDrive/NLP/data/wiki/'
    if check_and_create_directory(path):
        return

    def merge_and_shuffle(file1_path, file2_path, output_file_path):
        """Concatenate the lines of two text files, shuffle them, and write the result."""
        with open(file1_path, 'r', encoding='utf-8') as file1, \
                open(file2_path, 'r', encoding='utf-8') as file2:
            merged_content = file1.readlines() + file2.readlines()
        random.shuffle(merged_content)

        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.writelines(merged_content)

    dataset = datasets.load_dataset("wikipedia", "20220301.simple")['train']
    dataset = dataset.select(range(12000)).to_pandas()
    train = dataset.iloc[:10000]
    val = dataset.iloc[10000:11000]
    test = dataset.iloc[11000:12000]

    train.to_csv(os.path.join(path, 'wiki_eng_train.csv'), index=False)
    val.to_csv(os.path.join(path, 'wiki_eng_val.csv'), index=False)
    test.to_csv(os.path.join(path, 'wiki_eng_test.csv'), index=False)

    # Smaller English training slice that is paired with the Italian data below.
    little_train = dataset.iloc[:3500]
    little_train.to_csv(os.path.join(path, 'wikil_eng_train.csv'), index=False)

    dataset_it = datasets.load_dataset("wikipedia", "20220301.it")['train']
    dataset_it = dataset_it.select(range(2000)).to_pandas()
    train_it = dataset_it.iloc[:1300]
    val_it = dataset_it.iloc[1300:1600]
    test_it = dataset_it.iloc[1600:2000]

    train_it.to_csv(os.path.join(path, 'wiki_ita_train.csv'), index=False)
    val_it.to_csv(os.path.join(path, 'wiki_ita_val.csv'), index=False)
    test_it.to_csv(os.path.join(path, 'wiki_ita_test.csv'), index=False)

    files = ["wiki_eng_train.csv", "wiki_eng_val.csv", "wiki_eng_test.csv", "wikil_eng_train.csv", "wiki_ita_train.csv", "wiki_ita_val.csv", "wiki_ita_test.csv"]
    for file_ in files:
        frame = pd.read_csv(os.path.join(path, file_))
        path_txt = os.path.join(path, f"{os.path.splitext(file_)[0]}.txt")
        # UTF-8 is set explicitly so non-ASCII article text does not depend on
        # the platform default encoding.
        with open(path_txt, 'w', encoding='utf-8') as f:
            # Only the first 1/20 of the rows is kept; articles are written back
            # to back with no extra separator, as before.
            # NOTE(review): confirm that the missing separator is intended.
            for row in frame['text'][:len(frame)//20]:
                f.write(row)

    merge_and_shuffle(os.path.join(path, 'wikil_eng_train.txt'), os.path.join(path, 'wiki_ita_train.txt'), os.path.join(path, 'wikil_eng_wiki_ita_train.txt'))
    merge_and_shuffle(os.path.join(path, 'wiki_eng_val.txt'), os.path.join(path, 'wiki_ita_val.txt'), os.path.join(path, 'wikil_eng_wiki_ita_val.txt'))
    merge_and_shuffle(os.path.join(path, 'wiki_eng_test.txt'), os.path.join(path, 'wiki_ita_test.txt'), os.path.join(path, 'wikil_eng_wiki_ita_test.txt'))

    # The wiki_eng_*.txt files are already written inside `path`, so no move is
    # needed. The former os.replace() from the working directory looked for
    # files that were never created there and raised FileNotFoundError before
    # the cleanup below could run.
    for f in files:
        os.remove(os.path.join(path, f))
    print("Wikipedia datasets processed.")

# 4. Download AG News dataset
def download_ag_news():
    """Write an 80/10/10 split of the shuffled AG News train set as plain-text files.

    Does nothing when the dataset directory already exists. Each output file
    (``train.txt``, ``val.txt``, ``test.txt``) holds one example text per line.
    """
    # The news fine-tuning cells read
    # /content/drive/MyDrive/NLP/data/news_domain/{train,val}.txt, so the files
    # are written under that same "NLP/data" root.
    path = '/content/drive/MyDrive/NLP/data/news_domain/'
    if check_and_create_directory(path):
        return

    # Fixed seed so the split is reproducible across runs.
    dataset = datasets.load_dataset("ag_news")['train'].shuffle(seed=42)
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))

    train_dataset = dataset.select(range(train_size))
    val_dataset = dataset.select(range(train_size, train_size + val_size))
    # The test split is whatever remains after train and val.
    test_dataset = dataset.select(range(train_size + val_size, len(dataset)))

    def write_to_file(data_split, filename):
        """Write the ``text`` field of every example in ``data_split``, one per line."""
        with open(os.path.join(path, filename), 'w', encoding="utf-8") as f:
            for example in data_split:
                f.write(example['text'] + '\n')

    write_to_file(train_dataset, 'train.txt')
    write_to_file(val_dataset, 'val.txt')
    write_to_file(test_dataset, 'test.txt')
    print("AG News dataset processed.")

# 5. Download WNUT 17 dataset
def download_wnut_17():
    """Export WNUT 17 as "token tag" text files plus a ``labels.txt`` file.

    Does nothing when the dataset directory already exists. ``train.txt`` comes
    from the train split; ``val.txt`` and ``test.txt`` are the two shards of
    the validation split.
    """
    # Every "NER on WNUT 17" cell in this notebook reads its data and labels
    # from /content/drive/MyDrive/NLP/data/legal_domain_ner/, so the files are
    # written there. The previous target (MyDrive/data/news_domain_ner/) was
    # never read by any training cell.
    path = '/content/drive/MyDrive/NLP/data/legal_domain_ner/'
    if check_and_create_directory(path):
        return

    dataset = datasets.load_dataset("wnut_17")
    val_test = dataset['validation']
    val = val_test.shard(num_shards=2, index=0)
    test = val_test.shard(num_shards=2, index=1)

    # NOTE(review): this replaces the dataset's own 'test' split with half of
    # the validation split — confirm that is intended.
    dataset['validation'] = val
    dataset['test'] = test

    # Maps an integer tag id to its string label.
    int_to_key = dataset['train'].features["ner_tags"].feature.int2str

    def write_to_file(split, filename):
        """Write ``split`` as one "token tag" pair per line, with a blank line after each example."""
        # UTF-8 is set explicitly so tokens outside the platform default
        # encoding cannot make the write fail.
        with open(os.path.join(path, filename), 'w', encoding='utf-8') as f:
            all_tokens = dataset[split]['tokens']
            all_tags = dataset[split]['ner_tags']
            for tokens, tags in zip(all_tokens, all_tags):
                for token, tag in zip(tokens, tags):
                    f.write(f'{token} {int_to_key(tag)}\n')
                f.write('\n')

    write_to_file('train', 'train.txt')
    write_to_file('validation', 'val.txt')
    write_to_file('test', 'test.txt')

    with open(os.path.join(path, 'labels.txt'), 'w', encoding='utf-8') as f:
        labels = dataset['train'].features['ner_tags'].feature.names
        # One label per line, with no newline after the last one.
        f.write('\n'.join(labels))
    print("WNUT 17 dataset processed.")

# Main function to download all datasets
def main():
    """Run every dataset download/preparation step in order."""
    download_conll2003()
    download_conll2003_ita()
    download_wikipedia()
    download_ag_news()
    download_wnut_17()

# Code executed directly (as a script or notebook cell) runs with
# __name__ == "__main__". The previous comparison against "main" was never
# true, so no dataset was ever downloaded.
if __name__ == "__main__":
    main()

## Baseline

### BERT

#### Fine-tuning on the Wikipedia dataset

In [None]:
# Masked-LM fine-tuning of bert-base-cased on the English Wikipedia text.
# The eval split is wiki_eng_val.txt: download_wikipedia() writes
# wiki_eng_{train,val,test}.txt and never creates a wiki_eng_eval.txt.
!python3 /content/CharBERT/run_lm_finetuning.py \
    --model_type bert \
    --model_name_or_path bert-base-cased \
    --do_train \
    --do_eval \
    --train_data_file /content/drive/MyDrive/NLP/data/wiki/wiki_eng_train.txt \
    --eval_data_file  /content/drive/MyDrive/NLP/data/wiki/wiki_eng_val.txt \
    --term_vocab /content/CharBERT/data/dict/term_vocab \
    --learning_rate 3e-5 \
    --num_train_epochs 1 \
    --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
    --mlm_probability 0.10 \
    --input_nraws 1000 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 4 \
    --save_steps 10000 \
    --block_size 384 \
    --mlm \
    --overwrite_output_dir \
    --output_dir  /content/drive/MyDrive/NLP/output/wiki-eng/bert_base_cased_wiki_eng
    

#### NER on CoNLL-2003

In [None]:
# Train and predict NER on CoNLL-2003, starting from the checkpoint written by
# the BERT Wikipedia MLM fine-tuning cell.
!python3 /content/CharBERT/run_ner.py --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003 \
                --model_type bert \
                --model_name_or_path /content/drive/MyDrive/NLP/output/wiki-eng/bert_base_cased_wiki_eng \
                --output_dir /content/drive/MyDrive/NLP/output/conll2003_ner \
                --num_train_epochs 1 \
                --learning_rate 3e-5 \
                --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
                --per_gpu_train_batch_size 6 \
                --do_train \
                --do_predict \
                --overwrite_output_dir

### RoBERTa

#### Fine-tuning on the Wikipedia dataset

In [None]:
# Masked-LM fine-tuning of roberta-base on the English Wikipedia text.
# The eval split is wiki_eng_val.txt: download_wikipedia() writes
# wiki_eng_{train,val,test}.txt and never creates a wiki_eng_eval.txt.
!python3 /content/CharBERT/run_lm_finetuning.py \
    --model_type roberta \
    --model_name_or_path roberta-base \
    --do_train \
    --do_eval \
    --train_data_file /content/drive/MyDrive/NLP/data/wiki/wiki_eng_train.txt \
    --eval_data_file  /content/drive/MyDrive/NLP/data/wiki/wiki_eng_val.txt \
    --term_vocab /content/CharBERT/data/dict/term_vocab \
    --learning_rate 3e-5 \
    --num_train_epochs 1 \
    --char_vocab /content/CharBERT/data/dict/roberta_char_vocab \
    --mlm_probability 0.10 \
    --input_nraws 1000 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 4 \
    --save_steps 10000 \
    --block_size 384 \
    --mlm \
    --overwrite_output_dir \
    --output_dir /content/drive/MyDrive/NLP/output/wiki-eng/robert_base_wiki_eng
    

#### NER on CoNLL-2003

In [None]:
# Train and predict NER on CoNLL-2003 from the RoBERTa Wikipedia MLM checkpoint.
# The output goes to its own directory: the baseline BERT NER cell writes to
# .../output/conll2003_ner, and sharing that path with --overwrite_output_dir
# would replace the BERT results.
!python3 /content/CharBERT/run_ner.py --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003 \
                --model_type roberta \
                --model_name_or_path /content/drive/MyDrive/NLP/output/wiki-eng/robert_base_wiki_eng \
                --output_dir /content/drive/MyDrive/NLP/output/conll2003_ner_roberta \
                --num_train_epochs 1 \
                --learning_rate 3e-5 \
                --char_vocab /content/CharBERT/data/dict/roberta_char_vocab \
                --per_gpu_train_batch_size 6 \
                --do_train \
                --do_predict \
                --overwrite_output_dir

## Domain Adaptation:

Running domain adaptation on AG News Dataset

### BERT

#### Finetune on News Dataset

In [None]:
# Masked-LM fine-tuning of bert-base-cased on the news_domain train/val text files.
!python /content/CharBERT/run_lm_finetuning.py \
    --model_type bert \
    --model_name_or_path bert-base-cased \
    --output_dir /content/drive/MyDrive/NLP/output/news/mlm_bert_base/ \
    --train_data_file /content/drive/MyDrive/NLP/data/news_domain/train.txt \
    --eval_data_file  /content/drive/MyDrive/NLP/data/news_domain/val.txt \
    --do_train \
    --do_eval \
    --term_vocab /content/CharBERT/data/dict/term_vocab \
    --learning_rate 3e-5 \
    --num_train_epochs 1 \
    --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
    --mlm_probability 0.10 \
    --input_nraws 1000 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 4 \
    --save_steps 2000 \
    --block_size 384 \
    --mlm \
    --overwrite_output_dir 


#### NER on CoNLL-2003 dataset

# Train and predict NER on CoNLL-2003 from the news-adapted BERT checkpoint (mlm_bert_base).
!python /content/CharBERT/run_ner.py --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003/ \
                            --model_type bert \
                            --model_name_or_path /content/drive/MyDrive/NLP/output/news/mlm_bert_base/ \
                            --output_dir /content/drive/MyDrive/NLP/output/news/NER_cased/conll2003 \
                            --num_train_epochs 1 \
                            --learning_rate 3e-5 \
                            --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
                            --per_gpu_train_batch_size 6 \
                            --do_train \
                            --do_predict \
                            --overwrite_output_dir \
                            --save_steps 10000

#### NER on WNUT 17 dataset

In [None]:
# Train and predict NER from the news-adapted BERT checkpoint, using the data and
# label list stored under legal_domain_ner/.
# NOTE(review): the section heading says WNUT 17 while the directory is named
# legal_domain_ner — confirm this directory holds the WNUT 17 files.
!python /content/CharBERT/run_ner.py --data_dir /content/drive/MyDrive/NLP/data/legal_domain_ner/ \
                            --model_type bert \
                            --labels /content/drive/MyDrive/NLP/data/legal_domain_ner/labels.txt \
                            --model_name_or_path /content/drive/MyDrive/NLP/output/news/mlm_bert_base/ \
                            --output_dir /content/drive/MyDrive/NLP/output/news/NER_cased/legal_domain_ner \
                            --num_train_epochs 1 \
                            --learning_rate 3e-5 \
                            --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
                            --per_gpu_train_batch_size 6 \
                            --do_train \
                            --do_predict \
                            --overwrite_output_dir \
                            --save_steps 10000

### bert-base-cased-ag-news

#### Clone the PLM

In [None]:
# Clone the bert-base-cased-ag-news model repository from the Hugging Face Hub;
# later cells load it from /content/bert-base-cased-ag-news.
!git clone https://huggingface.co/lucasresck/bert-base-cased-ag-news

#### Finetune on News Dataset

In [None]:
# Masked-LM fine-tuning of the cloned bert-base-cased-ag-news model on the news_domain text files.
!python /content/CharBERT/run_lm_finetuning.py \
    --model_type bert \
    --model_name_or_path /content/bert-base-cased-ag-news \
    --output_dir /content/drive/MyDrive/NLP/output/news/bert-base-cased-ag-news/ \
    --train_data_file /content/drive/MyDrive/NLP/data/news_domain/train.txt \
    --eval_data_file  /content/drive/MyDrive/NLP/data/news_domain/val.txt \
    --do_train \
    --do_eval \
    --term_vocab /content/CharBERT/data/dict/term_vocab \
    --learning_rate 3e-5 \
    --num_train_epochs 1 \
    --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
    --mlm_probability 0.10 \
    --input_nraws 1000 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 4 \
    --save_steps 2000 \
    --block_size 384 \
    --mlm \
    --overwrite_output_dir

#### NER on CoNLL-2003 dataset

In [None]:
# Train and predict NER on CoNLL-2003 from the fine-tuned bert-base-cased-ag-news checkpoint.
!python /content/CharBERT/run_ner.py --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003/ \
                            --model_type bert \
                            --model_name_or_path /content/drive/MyDrive/NLP/output/news/bert-base-cased-ag-news/ \
                            --output_dir /content/drive/MyDrive/NLP/output/news/NER_cased/conll2003_bert-base-cased-ag-news \
                            --num_train_epochs 1 \
                            --learning_rate 3e-5 \
                            --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
                            --per_gpu_train_batch_size 6 \
                            --do_train \
                            --do_predict \
                            --overwrite_output_dir \
                            --save_steps 10000

#### NER on WNUT 17 dataset

In [None]:
# Train and predict NER from the fine-tuned bert-base-cased-ag-news checkpoint,
# using the data and label list stored under legal_domain_ner/.
!python /content/CharBERT/run_ner.py --data_dir /content/drive/MyDrive/NLP/data/legal_domain_ner/ \
                            --model_type bert \
                            --labels /content/drive/MyDrive/NLP/data/legal_domain_ner/labels.txt \
                            --model_name_or_path /content/drive/MyDrive/NLP/output/news/bert-base-cased-ag-news/ \
                            --output_dir /content/drive/MyDrive/NLP/output/news/NER_cased/legal_domain_ner_bert-base-cased-ag-news \
                            --num_train_epochs 1 \
                            --learning_rate 3e-5 \
                            --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
                            --per_gpu_train_batch_size 6 \
                            --do_train \
                            --do_predict \
                            --overwrite_output_dir \
                            --save_steps 10000

### RoBERTa

#### Finetune on News Dataset

In [None]:
# Masked-LM fine-tuning of roberta-base on the news_domain train/val text files.
!python /content/CharBERT/run_lm_finetuning.py \
    --model_type roberta \
    --model_name_or_path roberta-base \
    --output_dir /content/drive/MyDrive/NLP/output/news/mlm_roberta_base/ \
    --train_data_file /content/drive/MyDrive/NLP/data/news_domain/train.txt \
    --eval_data_file  /content/drive/MyDrive/NLP/data/news_domain/val.txt \
    --do_train \
    --do_eval \
    --term_vocab /content/CharBERT/data/dict/term_vocab \
    --learning_rate 3e-5 \
    --num_train_epochs 1 \
    --char_vocab /content/CharBERT/data/dict/roberta_char_vocab \
    --mlm_probability 0.10 \
    --input_nraws 1000 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 4 \
    --save_steps 2000 \
    --block_size 384 \
    --mlm \
    --overwrite_output_dir

#### NER on CoNLL-2003 dataset

In [None]:
# Train and predict NER on CoNLL-2003 from the news-adapted RoBERTa checkpoint.
# The output goes to its own directory: the BERT news NER cell writes to
# .../NER_cased/conll2003, and sharing that path with --overwrite_output_dir
# would replace the BERT results.
!python /content/CharBERT/run_ner.py --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003/ \
                            --model_type roberta \
                            --model_name_or_path /content/drive/MyDrive/NLP/output/news/mlm_roberta_base/ \
                            --output_dir /content/drive/MyDrive/NLP/output/news/NER_cased/conll2003_roberta-base \
                            --num_train_epochs 1 \
                            --learning_rate 3e-5 \
                            --char_vocab /content/CharBERT/data/dict/roberta_char_vocab \
                            --per_gpu_train_batch_size 6 \
                            --do_train \
                            --do_predict \
                            --overwrite_output_dir \
                            --save_steps 10000

#### NER on WNUT 17 dataset

In [None]:
# Train and predict NER from the news-adapted RoBERTa checkpoint, using the data
# and label list stored under legal_domain_ner/.
# The output goes to its own directory: the BERT news NER cell writes to
# .../NER_cased/legal_domain_ner, and sharing that path with
# --overwrite_output_dir would replace the BERT results.
!python /content/CharBERT/run_ner.py --data_dir /content/drive/MyDrive/NLP/data/legal_domain_ner/ \
                            --model_type roberta \
                            --labels /content/drive/MyDrive/NLP/data/legal_domain_ner/labels.txt \
                            --model_name_or_path /content/drive/MyDrive/NLP/output/news/mlm_roberta_base/ \
                            --output_dir /content/drive/MyDrive/NLP/output/news/NER_cased/legal_domain_ner_roberta-base \
                            --num_train_epochs 1 \
                            --learning_rate 3e-5 \
                            --char_vocab /content/CharBERT/data/dict/roberta_char_vocab \
                            --per_gpu_train_batch_size 6 \
                            --do_train \
                            --do_predict \
                            --overwrite_output_dir \
                            --save_steps 10000

### roberta-base-ag-news

#### Clone the PLM

In [None]:
# Clone the roberta-base-ag-news model repository from the Hugging Face Hub;
# later cells load it from /content/roberta-base-ag-news.
!git clone https://huggingface.co/textattack/roberta-base-ag-news

#### Finetune on News Dataset

In [None]:
# Masked-LM fine-tuning of the cloned roberta-base-ag-news model on the news_domain text files.
!python /content/CharBERT/run_lm_finetuning.py \
    --model_type roberta \
    --model_name_or_path /content/roberta-base-ag-news \
    --output_dir /content/drive/MyDrive/NLP/output/news/roberta-base-ag-news/ \
    --train_data_file /content/drive/MyDrive/NLP/data/news_domain/train.txt \
    --eval_data_file  /content/drive/MyDrive/NLP/data/news_domain/val.txt \
    --do_train \
    --do_eval \
    --term_vocab /content/CharBERT/data/dict/term_vocab \
    --learning_rate 3e-5 \
    --num_train_epochs 1 \
    --char_vocab /content/CharBERT/data/dict/roberta_char_vocab \
    --mlm_probability 0.10 \
    --input_nraws 1000 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 4 \
    --save_steps 2000 \
    --block_size 384 \
    --mlm \
    --overwrite_output_dir

#### NER on CoNLL-2003 dataset

In [None]:
# Train and predict NER on CoNLL-2003 from the fine-tuned roberta-base-ag-news checkpoint.
!python /content/CharBERT/run_ner.py --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003/ \
                            --model_type roberta \
                            --model_name_or_path /content/drive/MyDrive/NLP/output/news/roberta-base-ag-news/ \
                            --output_dir /content/drive/MyDrive/NLP/output/news/NER_cased/conll2003_roberta-base-ag-news \
                            --num_train_epochs 1 \
                            --learning_rate 3e-5 \
                            --char_vocab /content/CharBERT/data/dict/roberta_char_vocab \
                            --per_gpu_train_batch_size 6 \
                            --do_train \
                            --do_predict \
                            --overwrite_output_dir \
                            --save_steps 10000

#### NER on WNUT 17 dataset

In [None]:
# Train and predict NER from the fine-tuned roberta-base-ag-news checkpoint,
# using the data and label list stored under legal_domain_ner/.
# --char_vocab now points into the cloned repo (/content/CharBERT/data/dict/...);
# the previous value was missing the slash between "CharBERT" and "data".
!python /content/CharBERT/run_ner.py --data_dir /content/drive/MyDrive/NLP/data/legal_domain_ner/ \
                            --model_type roberta \
                            --labels /content/drive/MyDrive/NLP/data/legal_domain_ner/labels.txt \
                            --model_name_or_path /content/drive/MyDrive/NLP/output/news/roberta-base-ag-news/ \
                            --output_dir /content/drive/MyDrive/NLP/output/news/NER_cased/legal_domain_ner_roberta-base-ag-news \
                            --num_train_epochs 1 \
                            --learning_rate 3e-5 \
                            --char_vocab /content/CharBERT/data/dict/roberta_char_vocab \
                            --per_gpu_train_batch_size 6 \
                            --do_train \
                            --do_predict \
                            --overwrite_output_dir \
                            --save_steps 10000

## Multilingual Extension

### BERT

#### Finetune on multilingual Dataset

In [None]:
# Masked-LM fine-tuning of bert-base-multilingual-cased on the merged English+Italian Wikipedia text.
!python3 /content/CharBERT/run_lm_finetuning.py \
    --model_type bert \
    --model_name_or_path bert-base-multilingual-cased \
    --do_train \
    --do_eval \
    --train_data_file /content/drive/MyDrive/NLP/data/wiki/wikil_eng_wiki_ita_train.txt \
    --eval_data_file  /content/drive/MyDrive/NLP/data/wiki/wikil_eng_wiki_ita_val.txt \
    --term_vocab /content/CharBERT/data/dict/term_vocab \
    --learning_rate 3e-5 \
    --num_train_epochs 1 \
    --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
    --mlm_probability 0.10 \
    --input_nraws 1000 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 4 \
    --save_steps 10000 \
    --block_size 384 \
    --mlm \
    --overwrite_output_dir \
    --output_dir  /content/drive/MyDrive/NLP/output/multilingual/MLM_cased/wikil_eng_wiki_ita     

#### NER on CoNLL-2003 (English) with the multilingual model

In [None]:
# Train and predict NER on English CoNLL-2003 from the multilingual MLM checkpoint.
!python3 /content/CharBERT/run_ner.py \
                    --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003 \
                    --model_type bert \
                    --model_name_or_path /content/drive/MyDrive/NLP/output/multilingual/MLM_cased/wikil_eng_wiki_ita \
                    --output_dir  /content/drive/MyDrive/NLP/output/multilingual/NER_cased/conll2003 \
                    --num_train_epochs 1 \
                    --learning_rate 3e-5 \
                    --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
                    --per_gpu_train_batch_size 6 \
                    --do_train \
                    --do_predict \
                    --overwrite_output_dir \
                    --save_steps 10000

#### NER on CoNLL-2003 (Italian) with the multilingual model

In [None]:
# Train and predict NER on the Italian CoNLL2003_ita data from the multilingual MLM checkpoint.
!python /content/CharBERT/run_ner.py \
                    --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003_ita \
                    --model_type bert \
                    --model_name_or_path /content/drive/MyDrive/NLP/output/multilingual/MLM_cased/wikil_eng_wiki_ita \
                    --output_dir  /content/drive/MyDrive/NLP/output/multilingual/NER_cased/conll2003_ita \
                    --num_train_epochs 1 \
                    --learning_rate 3e-5 \
                    --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
                    --per_gpu_train_batch_size 6 \
                    --do_train \
                    --do_predict \
                    --overwrite_output_dir \
                    --save_steps 10000

### RoBERTa

#### Finetune on multilingual Dataset

In [None]:
# Masked-LM fine-tuning on the merged English+Italian Wikipedia text with --model_type roberta.
# NOTE(review): --model_type is roberta but --model_name_or_path is the BERT
# checkpoint bert-base-multilingual-cased — confirm which model is intended here.
# NOTE(review): --output_dir is the same directory the BERT multilingual MLM cell
# writes to, and --overwrite_output_dir is set, so this run replaces that
# checkpoint — confirm that is intended.
!python3 /content/CharBERT/run_lm_finetuning.py \
    --model_type roberta \
    --model_name_or_path bert-base-multilingual-cased \
    --do_train \
    --do_eval \
    --train_data_file /content/drive/MyDrive/NLP/data/wiki/wikil_eng_wiki_ita_train.txt \
    --eval_data_file  /content/drive/MyDrive/NLP/data/wiki/wikil_eng_wiki_ita_val.txt \
    --term_vocab /content/CharBERT/data/dict/term_vocab \
    --learning_rate 3e-5 \
    --num_train_epochs 1 \
    --char_vocab /content/CharBERT/data/dict/roberta_char_vocab \
    --mlm_probability 0.10 \
    --input_nraws 1000 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 4 \
    --save_steps 10000 \
    --block_size 384 \
    --mlm \
    --overwrite_output_dir \
    --output_dir  /content/drive/MyDrive/NLP/output/multilingual/MLM_cased/wikil_eng_wiki_ita     

#### NER on CoNLL-2003 (English) with the multilingual model

In [None]:
# Train and predict NER on English CoNLL-2003 with --model_type roberta from the
# multilingual MLM checkpoint.
# The output goes to its own directory: the BERT multilingual NER cell writes to
# .../NER_cased/conll2003, and sharing that path with --overwrite_output_dir
# would replace the BERT results.
!python3 /content/CharBERT/run_ner.py \
                    --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003 \
                    --model_type roberta \
                    --model_name_or_path /content/drive/MyDrive/NLP/output/multilingual/MLM_cased/wikil_eng_wiki_ita \
                    --output_dir  /content/drive/MyDrive/NLP/output/multilingual/NER_cased/conll2003_roberta \
                    --num_train_epochs 1 \
                    --learning_rate 3e-5 \
                    --char_vocab /content/CharBERT/data/dict/roberta_char_vocab \
                    --per_gpu_train_batch_size 6 \
                    --do_train \
                    --do_predict \
                    --overwrite_output_dir \
                    --save_steps 10000

#### NER on CoNLL-2003 (Italian) with the multilingual model

In [None]:
# Train and predict NER on the Italian CoNLL2003_ita data with --model_type
# roberta from the multilingual MLM checkpoint.
# The output goes to its own directory: the BERT multilingual NER cell writes to
# .../NER_cased/conll2003_ita, and sharing that path with --overwrite_output_dir
# would replace the BERT results. The doubled slash in the model path is removed.
!python /content/CharBERT/run_ner.py \
                    --data_dir /content/drive/MyDrive/NLP/data/CoNLL2003_ita \
                    --model_type roberta \
                    --model_name_or_path /content/drive/MyDrive/NLP/output/multilingual/MLM_cased/wikil_eng_wiki_ita \
                    --output_dir  /content/drive/MyDrive/NLP/output/multilingual/NER_cased/conll2003_ita_roberta \
                    --num_train_epochs 1 \
                    --learning_rate 3e-5 \
                    --char_vocab /content/CharBERT/data/dict/roberta_char_vocab \
                    --per_gpu_train_batch_size 6 \
                    --do_train \
                    --do_predict \
                    --overwrite_output_dir \
                    --save_steps 10000

## Delete Runtime from Colab

In [None]:
# Release the Colab runtime once every cell above has finished.
from google.colab import runtime
runtime.unassign()