<a href="https://colab.research.google.com/github/Chiamakac/IgboNER-Models/blob/main/Model/IgboBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Training an Igbo language model from scratch using Transformers and Tokenizers**

**1. Getting the data.**

In [None]:
# Download the raw inputs into the current working directory (`-c` resumes a partial download):
#   text.zip    - zipped text from the IGBONLP repository (ig_monoling folder)
#   ibo.txt     - corpus file from the lacuna_pos_ner repository (language_corpus/ibo)
#   config.json - config file from the IboBETA repository (moved into the IgboBert folder later)
!wget -c https://github.com/IgnatiusEzeani/IGBONLP/raw/master/ig_monoling/text.zip
!wget -c https://raw.githubusercontent.com/chiamaka249/lacuna_pos_ner/main/language_corpus/ibo/ibo.txt
!wget -c https://raw.githubusercontent.com/Chiamakac/IboBETA/main/config.json

In [None]:
#Unzip the zipped file and remove the zipped file after unzipping
import zipfile
import os


def unzip(zipfilename):
  """Extract a zip archive into a folder named after the archive.

  The target folder is the archive path with its extension removed
  ("text.zip" -> "text"). The extension is split off with
  `os.path.splitext`, so extensions of any length are handled; an archive
  without an extension is extracted into "<name>_unzipped" so the target
  never collides with the archive itself.

  Args:
    zipfilename: path of the archive to extract.

  Returns:
    The message "'<zipfilename>' unzipped!" on success, or None when the
    archive does not exist (a notice is printed in that case).
  """
  target_dir, extension = os.path.splitext(zipfilename)
  if not extension:
    target_dir = zipfilename + "_unzipped"

  try:
    with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
      zip_ref.extractall(target_dir)
  except FileNotFoundError:
    print(f"Cannot find '{zipfilename}' file")
    return None

  return f"'{zipfilename}' unzipped!"

# Extract text.zip into the "text" folder, then delete the archive.
unzip("text.zip")
!rm text.zip

In [None]:
# Move (not copy) the downloaded "ibo.txt" into the "text" folder, so that it is
# read together with the other files in that folder by the merging cell below.
import shutil
shutil.move('/content/ibo.txt', '/content/text')



'/content/text/ibo.txt'

In [None]:

# Merge every file of the corpus folder into a single "data.txt" in the
# current working directory. `os` and `shutil` are imported in earlier cells.
dir_name = "/content/text"

# os.listdir() returns entries in arbitrary order; sorting makes the merged
# corpus identical from run to run.
chunks = []
for fname in sorted(os.listdir(dir_name)):
  fname = os.path.join(dir_name, fname)
  with open(fname, "r", encoding="utf8") as datafile:
    # Every file's content is preceded by a newline.
    chunks.append("\n" + datafile.read())

# Build the text with one join instead of growing a string inside the loop.
text = "".join(chunks)

with open("data.txt", "w", encoding="utf8") as datafile:
  datafile.write(text)

# Delete exactly the folder that was read above, whatever the working directory is.
shutil.rmtree(dir_name)

**2. Import Transformers and Tokenizers, and train the tokenizer**

In [None]:
# We won't need TensorFlow here
!pip uninstall -y tensorflow

# Install `transformers` from the tip of its GitHub repository (no version is pinned,
# so the installed version depends on when the cell is run), then list what was installed.
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'

# NOTE(review): the versions below were written by hand and may be stale - compare
# them with the `pip list` output of this cell.
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1

In [None]:

%%time 
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

# Collect the training corpus: every .txt file below the working directory,
# except anything inside the "IgboBert" output folder. The tokenizer's own
# merges.txt is saved there afterwards, so an unfiltered glob would feed it
# back in as training text when this cell is re-run.
paths = sorted(
    str(x) for x in Path(".").glob("**/*.txt") if "IgboBert" not in x.parts
)
print(paths)

# Fail early instead of training a tokenizer on an empty corpus.
if not paths:
    raise FileNotFoundError("No .txt training files found under the current directory")

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",  # Beginning of sequence (BOS) or classifier (CLS) token
    "<pad>",  # Padding token
    "</s>",  # End of sequence (EOS) or separator (SEP) token
    "<unk>",  # Unknown token
    "<mask>",  # Masking token
])

['data.txt']
CPU times: user 18.9 s, sys: 1.2 s, total: 20.1 s
Wall time: 6.07 s


In [None]:
#Our tokenizer is now ready and we have two files that define our new IgboBert tokenizer( a vocab.json-which is a list of the most frequent tokens ranked by frequency and a merges.txt list of merges)
#we then save the file for later use

!mkdir IgboBert
tokenizer.save_model("IgboBert")

['IgboBert/vocab.json', 'IgboBert/merges.txt']

In [None]:
# Move the downloaded config.json into the IgboBert folder, next to the tokenizer files.
# (`shutil` is imported in an earlier cell.)
shutil.move('/content/config.json', '/content/IgboBert')

'/content/IgboBert/config.json'

**3. Initializing the Tokenizer**

Let's initialize our tokenizer from the saved files. This way we can use it just like any other tokenizer loaded with `from_pretrained`.

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the two files written by `tokenizer.save_model`.
vocab_path, merges_path = "./IgboBert/vocab.json", "./IgboBert/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)

In [None]:
# Wrap every encoding in the special tokens: "<s>" opens the sequence and
# "</s>" closes each sentence. BertProcessing takes the separator pair first,
# then the classifier pair.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

# Cut encodings that are longer than 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [None]:
# Encode a sentence pair and show its tokens, to check that the special tokens
# are placed at the start, between the two sentences and at the end.
tokenizer.encode("Simone gara ·ª•ka ·ª•nyah·ª• gu·ªç egwu ma ga-kwa taa.", "Aha ya b·ª• ifeoma.").tokens

['<s>',
 'Simone',
 'ƒ†gara',
 'ƒ†√°¬ª¬•ka',
 'ƒ†√°¬ª¬•nyah√°¬ª¬•',
 'ƒ†gu',
 '√°¬ªƒØ',
 'ƒ†egwu',
 'ƒ†ma',
 'ƒ†ga',
 '-',
 'kwa',
 'ƒ†taa',
 '.',
 '</s>',
 'Aha',
 'ƒ†ya',
 'ƒ†b√°¬ª¬•',
 'ƒ†ife',
 'oma',
 '.',
 '</s>']

In [None]:
# Check that we have a GPU (prints the NVIDIA driver / GPU status table)
!nvidia-smi

Wed Dec 15 20:56:17 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Check that PyTorch sees the GPU (the cell displays True when CUDA is usable)
import torch
torch.cuda.is_available()

True

In [None]:
# For training, we need a raw (not pre-trained) masked-language model
# (the notebook uses RobertaForMaskedLM below).
# To create that, we first need to create a RoBERTa config object to describe
# the parameters we'd like to initialize IgboBERT with.

from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,  # same vocabulary size the tokenizer was trained with
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)


In [None]:
from transformers import RobertaTokenizerFast

# Load the trained tokenizer files through the `transformers` fast tokenizer class.
# `model_max_length` is the current name of the deprecated `max_len` argument.
# The model `config` is not a tokenizer argument, so it is not passed here.
tokenizer = RobertaTokenizerFast.from_pretrained("./IgboBert", model_max_length=512)

In [None]:
# We import and initialize our RoBERTa model with a language modeling (LM) head.
# It is built from `config` only, so no pre-trained weights are loaded.

from transformers import RobertaForMaskedLM
model = RobertaForMaskedLM(config=config)

In [None]:
# Display the number of parameters of the model.
model.num_parameters()
# => 83 million parameters

83504416

In [None]:
%%time
from transformers import LineByLineTextDataset

# Build the training set from the merged corpus, one example per line, with
# examples limited to `block_size` tokens. The relative path is the location
# "data.txt" is written to by the corpus-merging cell (the working directory).
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="data.txt",
    block_size=128,
)



CPU times: user 33.3 s, sys: 1.27 s, total: 34.5 s
Wall time: 15.6 s


In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch builder for masked-language-model training: `mlm=True` switches masking
# on and `mlm_probability=0.15` is the share of tokens selected for masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

# Training settings. Checkpoints go into the same folder that already holds
# the tokenizer files.
training_args = TrainingArguments(
    output_dir="./IgboBert",
    overwrite_output_dir=True,
    num_train_epochs=5,
    # Replaces the deprecated `per_gpu_train_batch_size` argument.
    per_device_train_batch_size=64,
    save_steps=10_000,  # write a checkpoint every 10,000 steps
    save_total_limit=2,  # keep at most two checkpoints on disk
    prediction_loss_only=True,
)

# The Trainer ties together the untrained model, the masking collator and the
# line-by-line dataset built in the previous cells.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
%%time
# Run the training configured in the previous cell; `%%time` reports how long it took.
trainer.train()

In [None]:
# Save the trained model into the IgboBert folder, next to the tokenizer's
# vocab.json and merges.txt.
trainer.save_model("./IgboBert")

Saving model checkpoint to ./IgboBert
Configuration saved in ./IgboBert/config.json
Model weights saved in ./IgboBert/pytorch_model.bin


# **4. Test the Model**

We first initialize a pipeline object for the 'fill-mask' task, and then test our model as follows:



In [None]:
from transformers import pipeline

# The model weights and the tokenizer files are loaded from the same folder.
igbobert_dir = "./IgboBert"
fill_mask = pipeline("fill-mask", model=igbobert_dir, tokenizer=igbobert_dir)

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.

fill_mask("Ab·ª• m Maaz·ªã <mask>.") #= okafor/·ªåkaf·ªç

[{'score': 0.008856686763465405,
  'sequence': 'Ab·ª• m Maaz·ªã Mohammed.',
  'token': 3231,
  'token_str': ' Mohammed'},
 {'score': 0.007911812514066696,
  'sequence': 'Ab·ª• m Maaz·ªã O.',
  'token': 381,
  'token_str': ' O'},
 {'score': 0.00748562254011631,
  'sequence': 'Ab·ª• m Maaz·ªã ·ªåkaf·ªç.',
  'token': 5307,
  'token_str': ' ·ªåkaf·ªç'},
 {'score': 0.006802904885262251,
  'sequence': 'Ab·ª• m Maaz·ªã A.',
  'token': 348,
  'token_str': ' A'},
 {'score': 0.0032701685559004545,
  'sequence': 'Ab·ª• m Maaz·ªã Rutherford.',
  'token': 5113,
  'token_str': ' Rutherford'}]

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.

fill_mask("Nwaany·ªã na <mask> ji na akara.") #= eri

[{'score': 0.1661785989999771,
  'sequence': 'Nwaany·ªã na ya ji na akara.',
  'token': 289,
  'token_str': ' ya'},
 {'score': 0.11589646339416504,
  'sequence': 'Nwaany·ªã na nwaany·ªã ji na akara.',
  'token': 623,
  'token_str': ' nwaany·ªã'},
 {'score': 0.03902192786335945,
  'sequence': 'Nwaany·ªã na nwunye ji na akara.',
  'token': 724,
  'token_str': ' nwunye'},
 {'score': 0.038711633533239365,
  'sequence': 'Nwaany·ªã na nna ji na akara.',
  'token': 713,
  'token_str': ' nna'},
 {'score': 0.02920209988951683,
  'sequence': 'Nwaany·ªã na- ji na akara.',
  'token': 17,
  'token_str': '-'}]

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.

fill_mask("Chineke ga- ebibikwa nd·ªã niile na- eme ihe <mask>.") #=·ªçj·ªç·ªç

[{'score': 0.24352985620498657,
  'sequence': 'Chineke ga- ebibikwa nd·ªã niile na- eme ihe ·ªçma.',
  'token': 496,
  'token_str': ' ·ªçma'},
 {'score': 0.16800811886787415,
  'sequence': 'Chineke ga- ebibikwa nd·ªã niile na- eme ihe ·ªçj·ªç·ªç.',
  'token': 707,
  'token_str': ' ·ªçj·ªç·ªç'},
 {'score': 0.15601330995559692,
  'sequence': 'Chineke ga- ebibikwa nd·ªã niile na- eme ihe a.',
  'token': 266,
  'token_str': ' a'},
 {'score': 0.11959126591682434,
  'sequence': 'Chineke ga- ebibikwa nd·ªã niile na- eme ihe niile.',
  'token': 427,
  'token_str': ' niile'},
 {'score': 0.02550322934985161,
  'sequence': 'Chineke ga- ebibikwa nd·ªã niile na- eme ihe oriri.',
  'token': 1580,
  'token_str': ' oriri'}]

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.
fill_mask("·ªçba akw·ª•kw·ªç ·ªåkamm·ª•ta Kenneth Dike d·ªã <mask>.") #n'Awka

[{'score': 0.10755623877048492,
  'sequence': '·ªçba akw·ª•kw·ªç ·ªåkamm·ª•ta Kenneth Dike d·ªã mkpa.',
  'token': 607,
  'token_str': ' mkpa'},
 {'score': 0.07236427068710327,
  'sequence': '·ªçba akw·ª•kw·ªç ·ªåkamm·ª•ta Kenneth Dike d·ªã nso.',
  'token': 604,
  'token_str': ' nso'},
 {'score': 0.067164845764637,
  'sequence': '·ªçba akw·ª•kw·ªç ·ªåkamm·ª•ta Kenneth Dike d·ªã mma.',
  'token': 347,
  'token_str': ' mma'},
 {'score': 0.054604168981313705,
  'sequence': '·ªçba akw·ª•kw·ªç ·ªåkamm·ª•ta Kenneth Dike d·ªã iche.',
  'token': 462,
  'token_str': ' iche'},
 {'score': 0.04094060882925987,
  'sequence': '·ªçba akw·ª•kw·ªç ·ªåkamm·ª•ta Kenneth Dike d·ªã ukwuu.',
  'token': 1009,
  'token_str': ' ukwuu'}]

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.

fill_mask("Nwaany·ªã na eri <mask> na akara.") #= ji

[{'score': 0.12033308297395706,
  'sequence': 'Nwaany·ªã na eri ya na akara.',
  'token': 289,
  'token_str': ' ya'},
 {'score': 0.07522733509540558,
  'sequence': 'Nwaany·ªã na eri nri na akara.',
  'token': 870,
  'token_str': ' nri'},
 {'score': 0.0471433624625206,
  'sequence': 'Nwaany·ªã na eri ihe na akara.',
  'token': 300,
  'token_str': ' ihe'},
 {'score': 0.02259232848882675,
  'sequence': 'Nwaany·ªã na eri eri na akara.',
  'token': 957,
  'token_str': ' eri'},
 {'score': 0.01788548193871975,
  'sequence': 'Nwaany·ªã na eri nwaany·ªã na akara.',
  'token': 623,
  'token_str': ' nwaany·ªã'}]

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.

fill_mask("Gaan·ª• mee nd·ªã <mask> niile ka ha b·ª•r·ª• nd·ªã na- eso ·ª•z·ªç m  .") #= mba


[{'score': 0.3522748351097107,
  'sequence': 'Gaan·ª• mee nd·ªã a niile ka ha b·ª•r·ª• nd·ªã na- eso ·ª•z·ªç m .',
  'token': 266,
  'token_str': ' a'},
 {'score': 0.08272776752710342,
  'sequence': 'Gaan·ª• mee nd·ªã ·ªçz·ªç niile ka ha b·ª•r·ª• nd·ªã na- eso ·ª•z·ªç m .',
  'token': 434,
  'token_str': ' ·ªçz·ªç'},
 {'score': 0.06121758744120598,
  'sequence': 'Gaan·ª• mee nd·ªã ah·ª• niile ka ha b·ª•r·ª• nd·ªã na- eso ·ª•z·ªç m .',
  'token': 310,
  'token_str': ' ah·ª•'},
 {'score': 0.06120830774307251,
  'sequence': 'Gaan·ª• mee nd·ªã mmad·ª• niile ka ha b·ª•r·ª• nd·ªã na- eso ·ª•z·ªç m .',
  'token': 393,
  'token_str': ' mmad·ª•'},
 {'score': 0.04012814536690712,
  'sequence': 'Gaan·ª• mee nd·ªã Izrel niile ka ha b·ª•r·ª• nd·ªã na- eso ·ª•z·ªç m .',
  'token': 680,
  'token_str': ' Izrel'}]

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.

fill_mask("Jehova h·ªçp·ª•tara Mozis ka ·ªç b·ª•r·ª• onye nd√∫ ·ª•m·ª• <mask>.") #= Izrel


[{'score': 0.3820941150188446,
  'sequence': 'Jehova h·ªçp·ª•tara Mozis ka ·ªç b·ª•r·ª• onye nd√∫ ·ª•m·ª• Izrel.',
  'token': 680,
  'token_str': ' Izrel'},
 {'score': 0.2837800979614258,
  'sequence': 'Jehova h·ªçp·ª•tara Mozis ka ·ªç b·ª•r·ª• onye nd√∫ ·ª•m·ª• ya.',
  'token': 289,
  'token_str': ' ya'},
 {'score': 0.10724257677793503,
  'sequence': 'Jehova h·ªçp·ª•tara Mozis ka ·ªç b·ª•r·ª• onye nd√∫ ·ª•m·ª• mmad·ª•.',
  'token': 393,
  'token_str': ' mmad·ª•'},
 {'score': 0.024513551965355873,
  'sequence': 'Jehova h·ªçp·ª•tara Mozis ka ·ªç b·ª•r·ª• onye nd√∫ ·ª•m·ª• ha.',
  'token': 296,
  'token_str': ' ha'},
 {'score': 0.01360291987657547,
  'sequence': 'Jehova h·ªçp·ª•tara Mozis ka ·ªç b·ª•r·ª• onye nd√∫ ·ª•m·ª• Igbo.',
  'token': 900,
  'token_str': ' Igbo'}]

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.

fill_mask("·ª§m·ª•akw·ª•kw·ªç Chibok an·ªç·ªçla ·ª•b·ªçch·ªã 2000 n‚Äô aka <mask> Haram.") #= Boko


[{'score': 0.8887956738471985,
  'sequence': '·ª§m·ª•akw·ª•kw·ªç Chibok an·ªç·ªçla ·ª•b·ªçch·ªã 2000 n‚Äô aka Boko Haram.',
  'token': 2535,
  'token_str': ' Boko'},
 {'score': 0.007407982833683491,
  'sequence': '·ª§m·ª•akw·ª•kw·ªç Chibok an·ªç·ªçla ·ª•b·ªçch·ªã 2000 n‚Äô aka Manchester Haram.',
  'token': 3278,
  'token_str': ' Manchester'},
 {'score': 0.007324530277401209,
  'sequence': '·ª§m·ª•akw·ª•kw·ªç Chibok an·ªç·ªçla ·ª•b·ªçch·ªã 2000 n‚Äô aka Super Haram.',
  'token': 3199,
  'token_str': ' Super'},
 {'score': 0.0033930952195078135,
  'sequence': '·ª§m·ª•akw·ª•kw·ªç Chibok an·ªç·ªçla ·ª•b·ªçch·ªã 2000 n‚Äô aka ·ªãgba Haram.',
  'token': 874,
  'token_str': ' ·ªãgba'},
 {'score': 0.0025665571447461843,
  'sequence': '·ª§m·ª•akw·ª•kw·ªç Chibok an·ªç·ªçla ·ª•b·ªçch·ªã 2000 n‚Äô aka G·ªçvan·ªç Haram.',
  'token': 1692,
  'token_str': ' G·ªçvan·ªç'}]

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.

fill_mask("Nwunye G·ªçvan·ªç Ekiti steeti b·ª• Bisi Fayemi so na nd·ªã na- akwado <mask> ·ªçh·ª•r·ª• a.") #= iwu


[{'score': 0.1165534108877182,
  'sequence': 'Nwunye G·ªçvan·ªç Ekiti steeti b·ª• Bisi Fayemi so na nd·ªã na- akwado ·ªçch·ªãch·ªã ·ªçh·ª•r·ª• a.',
  'token': 719,
  'token_str': ' ·ªçch·ªãch·ªã'},
 {'score': 0.056705061346292496,
  'sequence': 'Nwunye G·ªçvan·ªç Ekiti steeti b·ª• Bisi Fayemi so na nd·ªã na- akwado ·ªçr·ª• ·ªçh·ª•r·ª• a.',
  'token': 477,
  'token_str': ' ·ªçr·ª•'},
 {'score': 0.04727887734770775,
  'sequence': 'Nwunye G·ªçvan·ªç Ekiti steeti b·ª• Bisi Fayemi so na nd·ªã na- akwado ·ªçn·ªçd·ª• ·ªçh·ª•r·ª• a.',
  'token': 1036,
  'token_str': ' ·ªçn·ªçd·ª•'},
 {'score': 0.042590655386447906,
  'sequence': 'Nwunye G·ªçvan·ªç Ekiti steeti b·ª• Bisi Fayemi so na nd·ªã na- akwado ·ª•wa ·ªçh·ª•r·ª• a.',
  'token': 556,
  'token_str': ' ·ª•wa'},
 {'score': 0.0251015517860651,
  'sequence': 'Nwunye G·ªçvan·ªç Ekiti steeti b·ª• Bisi Fayemi so na nd·ªã na- akwado obodo ·ªçh·ª•r·ª• a.',
  'token': 576,
  'token_str': ' obodo'}]

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.

fill_mask(" <mask> s·ªã ka ehiwe ·ª•l·ªçikpe p·ª•r·ª•iche maka mp·ª•.") #= Buhari


[{'score': 0.44573667645454407,
  'sequence': 'A s·ªã ka ehiwe ·ª•l·ªçikpe p·ª•r·ª•iche maka mp·ª•.',
  'token': 37,
  'token_str': 'A'},
 {'score': 0.230531707406044,
  'sequence': '·ªå s·ªã ka ehiwe ·ª•l·ªçikpe p·ª•r·ª•iche maka mp·ª•.',
  'token': 336,
  'token_str': '·ªå'},
 {'score': 0.03158653527498245,
  'sequence': 'Ha s·ªã ka ehiwe ·ª•l·ªçikpe p·ª•r·ª•iche maka mp·ª•.',
  'token': 513,
  'token_str': 'Ha'},
 {'score': 0.02112211100757122,
  'sequence': 'Igbo s·ªã ka ehiwe ·ª•l·ªçikpe p·ª•r·ª•iche maka mp·ª•.',
  'token': 3656,
  'token_str': 'Igbo'},
 {'score': 0.011359370313584805,
  'sequence': 'O s·ªã ka ehiwe ·ª•l·ªçikpe p·ª•r·ª•iche maka mp·ª•.',
  'token': 51,
  'token_str': 'O'}]

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.

fill_mask("Ala <mask>  ga- eweta ezi ·ªçn·ªçd·ª• nchekwa maka nd·ªã ch·ªçr·ªç ·ªãwebata ego n‚Äô ·ªçr·ª• ugbo.") #= Na·ªãjir·ªãa


[{'score': 0.10420944541692734,
  'sequence': 'Ala a  ga- eweta ezi ·ªçn·ªçd·ª• nchekwa maka nd·ªã ch·ªçr·ªç ·ªãwebata ego n‚Äô ·ªçr·ª• ugbo.',
  'token': 266,
  'token_str': ' a'},
 {'score': 0.07349573075771332,
  'sequence': 'Ala ah·ª•  ga- eweta ezi ·ªçn·ªçd·ª• nchekwa maka nd·ªã ch·ªçr·ªç ·ªãwebata ego n‚Äô ·ªçr·ª• ugbo.',
  'token': 310,
  'token_str': ' ah·ª•'},
 {'score': 0.030463453382253647,
  'sequence': 'Ala Igbo  ga- eweta ezi ·ªçn·ªçd·ª• nchekwa maka nd·ªã ch·ªçr·ªç ·ªãwebata ego n‚Äô ·ªçr·ª• ugbo.',
  'token': 900,
  'token_str': ' Igbo'},
 {'score': 0.02490180917084217,
  'sequence': 'Ala Nigeria  ga- eweta ezi ·ªçn·ªçd·ª• nchekwa maka nd·ªã ch·ªçr·ªç ·ªãwebata ego n‚Äô ·ªçr·ª• ugbo.',
  'token': 1570,
  'token_str': ' Nigeria'},
 {'score': 0.01187092810869217,
  'sequence': 'Ala ·ªçr·ª•  ga- eweta ezi ·ªçn·ªçd·ª• nchekwa maka nd·ªã ch·ªçr·ªç ·ªãwebata ego n‚Äô ·ªçr·ª• ugbo.',
  'token': 477,
  'token_str': ' ·ªçr·ª•'}]

In [None]:
# Ask the model for the masked word; the trailing comment records the answer the author expected.
# NOTE(review): "<mask>a" has no space between the mask token and the following "a" - confirm this is intended.

fill_mask("·ªå b·ª• <mask>a ka a na- ar·ªãa .") #= mmad·ª•

[{'score': 0.044008973985910416,
  'sequence': '·ªå b·ª•-a ka a na- ar·ªãa.',
  'token': 17,
  'token_str': '-'},
 {'score': 0.0439983569085598,
  'sequence': '·ªå b·ª• nwaa ka a na- ar·ªãa.',
  'token': 419,
  'token_str': ' nwa'},
 {'score': 0.041657522320747375,
  'sequence': '·ªå b·ª• Nwannaa ka a na- ar·ªãa.',
  'token': 1459,
  'token_str': ' Nwanna'},
 {'score': 0.018149062991142273,
  'sequence': '·ªå b·ª• ·ªçr·ªãaa ka a na- ar·ªãa.',
  'token': 956,
  'token_str': ' ·ªçr·ªãa'},
 {'score': 0.009843925014138222,
  'sequence': '·ªå b·ª• naan·ªãa ka a na- ar·ªãa.',
  'token': 769,
  'token_str': ' naan·ªã'}]

In [None]:
# Mount Google Drive at /content/gdrive (this import is only available on Colab)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Move the trained model folder into the mounted Google Drive.
# The source is the IgboBert folder used throughout the notebook; the target
# folder is the one shown in this cell's recorded output.
# (`os` and `shutil` are imported in earlier cells.)
model_dir = "/content/IgboBert"
drive_dir = "/content/gdrive/MyDrive/IBO_BETA"

# The target folder must exist so that the model is moved *into* it and keeps
# its "IgboBert" name.
os.makedirs(drive_dir, exist_ok=True)
shutil.move(model_dir, drive_dir)

'/content/gdrive/MyDrive/IBO_BETA/IgboBert'