### Running Baseline

In [None]:
from google.colab import drive, userdata

# Mount Google Drive at /content/drive; the data and output paths used by the
# cells below all live under that mount point.
drive.mount('/content/drive')

# GitHub credentials are read from Colab's user secrets rather than being
# written into the notebook.
github_token = userdata.get('github_token')
github_username = userdata.get('github_username')

# The Weights & Biases key is optional: when the secret is not defined,
# fall back to an empty string instead of failing the cell.
try:
    wandb_key = userdata.get('wandb_key')
except userdata.SecretNotFoundError:
    wandb_key = ''

In [None]:
# Clone the CharBERT repository with the GitHub credentials loaded in the setup cell
# (IPython substitutes $github_username / $github_token before the shell runs the line).
# The clone is skipped when a CharBERT/ directory already exists, so re-running this
# cell on the same VM does not fail with "destination path already exists".
!test -d CharBERT || git clone https://$github_username:$github_token@github.com/developer-sidani/CharBERT.git

In [None]:
# Install the dependencies listed in the cloned repository's requirements.txt.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -r CharBERT/requirements.txt

### Baseline BERT

In [None]:
# Baseline: run CharBERT's run_lm_finetuning.py with --mlm on bert-base-cased,
# using the wiki-eng train/eval text files on Drive. The result is written to
# .../output/wiki-eng/bert_base_cased_wiki_eng, which the NER cell below loads.
#
# $wandb_key is substituted by IPython from the Python variable set in the setup
# cell. It is quoted because that variable falls back to '' when the secret is
# missing; unquoted, the empty value would disappear and --wandb_key would be
# followed directly by the next flag.
!python3 /content/CharBERT/run_lm_finetuning.py \
    --model_type bert \
    --model_name_or_path bert-base-cased \
    --do_train \
    --do_eval \
    --wandb_key "$wandb_key" \
    --wandb_project CharBERT \
    --wandb_run_name "bert_base_cased_wiki_eng" \
    --train_data_file "/content/drive/MyDrive/NLP/data/wiki-eng/train.txt" \
    --eval_data_file  "/content/drive/MyDrive/NLP/data/wiki-eng/eval.txt" \
    --term_vocab "/content/CharBERT/data/dict/term_vocab" \
    --learning_rate 3e-5 \
    --num_train_epochs 3 \
    --char_vocab "/content/CharBERT/data/dict/bert_char_vocab" \
    --mlm_probability 0.10 \
    --input_nraws 1000 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 4 \
    --save_steps 10000 \
    --block_size 384 \
    --mlm \
    --overwrite_output_dir \
    --output_dir  "/content/drive/MyDrive/NLP/output/wiki-eng/bert_base_cased_wiki_eng"

### Baseline NER (CoNLL2003)

In [None]:
# Baseline NER run on CoNLL2003, starting from the checkpoint directory that the
# baseline MLM cell writes (bert_base_cased_wiki_eng).
DATA_DIR = '/content/drive/MyDrive/NLP/data/CoNLL2003'
MODEL_DIR = '/content/drive/MyDrive/NLP/output/wiki-eng/bert_base_cased_wiki_eng'
OUTPUT_DIR = '/content/drive/MyDrive/NLP/output/conll2003_ner'

# Notes on the command line:
# - run_ner.py is addressed inside the cloned checkout (/content/CharBERT), like the
#   scripts and vocab files in the other cells; a bare "run_ner.py" only resolves
#   if the working directory happens to be the repository itself.
# - All Python variables use IPython's $NAME interpolation. The shell-style ${NAME}
#   form is not equivalent under IPython and is avoided here.
# - "$wandb_key" is quoted so that the empty-string fallback from the setup cell is
#   still passed as a value instead of vanishing from the command line.
!python3 /content/CharBERT/run_ner.py \
    --data_dir "$DATA_DIR" \
    --model_type bert \
    --wandb_key "$wandb_key" \
    --wandb_project CharBERT \
    --wandb_run_name "ner_conll2003" \
    --model_name_or_path "$MODEL_DIR" \
    --output_dir "$OUTPUT_DIR" \
    --num_train_epochs 3 \
    --learning_rate 3e-5 \
    --char_vocab /content/CharBERT/data/dict/bert_char_vocab \
    --per_gpu_train_batch_size 6 \
    --do_train \
    --do_predict \
    --overwrite_output_dir

## Multilingual Extension: BERT Cased

In [None]:
# Multilingual extension: the same run_lm_finetuning.py invocation as the baseline,
# but starting from bert-base-multilingual-cased and using the combined
# English + Italian wiki train/validation files. The output directory is the
# checkpoint loaded by the two multilingual NER cells below.
#
# Changes from the previous version of this cell:
# - script and vocab paths all use the absolute /content/CharBERT checkout, so the
#   command no longer mixes working-directory-relative and absolute paths;
# - "$wandb_key" is quoted so an empty fallback value is still passed as an argument;
# - the last argument no longer ends in a dangling line continuation.
!python3 /content/CharBERT/run_lm_finetuning.py \
    --model_type bert \
    --model_name_or_path bert-base-multilingual-cased \
    --do_train \
    --do_eval \
    --train_data_file "/content/drive/MyDrive/NLP/data/wiki/wikil_eng_wiki_ita_train.txt" \
    --eval_data_file  "/content/drive/MyDrive/NLP/data/wiki/wikil_eng_wiki_ita_val.txt" \
    --term_vocab "/content/CharBERT/data/dict/term_vocab" \
    --learning_rate 3e-5 \
    --num_train_epochs 3 \
    --char_vocab "/content/CharBERT/data/dict/bert_char_vocab" \
    --mlm_probability 0.10 \
    --wandb_key "$wandb_key" \
    --wandb_project CharBERT \
    --input_nraws 1000 \
    --per_gpu_train_batch_size 4 \
    --per_gpu_eval_batch_size 4 \
    --save_steps 10000 \
    --block_size 384 \
    --mlm \
    --overwrite_output_dir \
    --output_dir  "/content/drive/MyDrive/NLP/output/multilingual/MLM_cased/wikil_eng_wiki_ita"

### NER Multilingual: CoNLL2003 English

In [None]:
# NER on the English CoNLL2003 data, starting from the multilingual MLM checkpoint
# directory written by the multilingual fine-tuning cell (MLM_cased/wikil_eng_wiki_ita).
#
# The script is addressed through the absolute /content/CharBERT checkout, matching
# the --char_vocab path, and "$wandb_key" is quoted so that an empty fallback value
# is still passed as an argument rather than dropped from the command line.
!python3 /content/CharBERT/run_ner.py \
                    --data_dir "/content/drive/MyDrive/NLP/data/CoNLL2003" \
                    --model_type bert \
                    --model_name_or_path "/content/drive/MyDrive/NLP/output/multilingual/MLM_cased/wikil_eng_wiki_ita" \
                    --output_dir  "/content/drive/MyDrive/NLP/output/multilingual/NER_cased/conll2003" \
                    --num_train_epochs 3 \
                    --wandb_key "$wandb_key" \
                    --wandb_project CharBERT \
                    --learning_rate 3e-5 \
                    --char_vocab "/content/CharBERT/data/dict/bert_char_vocab" \
                    --per_gpu_train_batch_size 6 \
                    --do_train \
                    --do_predict \
                    --overwrite_output_dir \
                    --save_steps 10000

### NER Multilingual: CoNLL2003 Italian

In [None]:
# NER on the Italian CoNLL2003-format data (CoNLL2003_ita), starting from the same
# multilingual MLM checkpoint directory as the English NER run.
#
# Changes from the previous version of this cell:
# - uses python3 like every other cell, instead of whatever "python" resolves to;
# - the checkpoint path no longer contains a doubled slash ("multilingual//MLM_cased");
# - the script is addressed through the absolute /content/CharBERT checkout;
# - "$wandb_key" is quoted so an empty fallback value is still passed as an argument.
!python3 /content/CharBERT/run_ner.py \
                    --data_dir "/content/drive/MyDrive/NLP/data/CoNLL2003_ita" \
                    --model_type bert \
                    --model_name_or_path "/content/drive/MyDrive/NLP/output/multilingual/MLM_cased/wikil_eng_wiki_ita" \
                    --output_dir  "/content/drive/MyDrive/NLP/output/multilingual/NER_cased/conll2003_ita" \
                    --num_train_epochs 3 \
                    --wandb_key "$wandb_key" \
                    --wandb_project CharBERT \
                    --learning_rate 3e-5 \
                    --char_vocab "/content/CharBERT/data/dict/bert_char_vocab" \
                    --per_gpu_train_batch_size 6 \
                    --do_train \
                    --do_predict \
                    --overwrite_output_dir \
                    --save_steps 10000

In [None]:
# Final cell: calls google.colab's runtime.unassign() after all runs above.
# NOTE(review): presumably this releases the Colab VM so it does not sit idle;
# if so, anything not already written to Drive would be lost - confirm before
# relying on local files.
from google.colab import runtime
runtime.unassign()