# Capstone 3 - Training the BERiT Tokenizer and Language Model

In [70]:
import json

import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login
from sklearn.decomposition import PCA
from transformers import (
    AutoConfig,
    DataCollatorForLanguageModeling,
    RobertaForMaskedLM,
    RobertaModel,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
)

In [28]:
# Load the verse-per-row corpus; the 'Verse' column (e.g. "2 Chr 36:19") becomes the index.
df = pd.read_csv('HebrewBiblebyVerse.csv', index_col='Verse')

# Display the last five rows as a quick sanity check of the load.
df.tail()

Unnamed: 0_level_0,Text
Verse,Unnamed: 1_level_1
2 Chr 36:19,wa yiśrəpû ʔet bêt hā ʔĕlōhîm wa yənattəṣû...
2 Chr 36:20,wa yegel ha šəʔērît min ha ḥereb ʔel bābe...
2 Chr 36:21,lə mallôt dəbar yəhwāh bə pî yirməyāhû ʕa...
2 Chr 36:22,û bi šənat ʔaḥat lə kôreš melek pāras li ...
2 Chr 36:23,kō ʔāmar kôreš melek pāras kol mamləkôt ...


## Training the Tokenizer

In [50]:
# Flatten the 'Text' column into a plain Python list of verse strings.
# NOTE(review): the text mixes single and double spaces; presumably single spaces
# separate morphemes and double spaces separate words -- confirm before normalising.
verses = df['Text'].tolist()

# Show the first 11 verses.
verses[:11]

['bə rēšît  bārā  ʔĕlōhîm  ʔēt  ha šāmayim  wə ʔēt  hā ʔāreṣ',
 'wə hā ʔāreṣ  hāyətā  tōhû  wā bōhû  wə ḥōšek  ʕal  pənê  təhôm  wə rûaḥ  ʔĕlōhîm  məraḥepet  ʕal  pənê  ha māyim',
 'wa yōmer  ʔĕlōhîm  yəhî  ʔôr  wa yəhî  ʔôr',
 'wa yar  ʔĕlōhîm  ʔet  hā ʔôr  kî  ṭôb  wa yabdēl  ʔĕlōhîm  bên  hā ʔôr  û bên  ha ḥōšek',
 'wa yiqrā  ʔĕlōhîm  lā  ʔôr  yôm  wə la  ḥōšek  qārā  lāyəlā  wa yəhî  ʕereb  wa yəhî  bōqer  yôm  ʔeḥād',
 'wa yōmer  ʔĕlōhîm  yəhî  rāqîaʕ  bə tôk  ha māyim  wi yhî  mabdîl  bên  mayim  lā māyim',
 'wa yaʕaś  ʔĕlōhîm  ʔet  hā rāqîaʕ  wa yabdēl  bên  ha mayim  ʔăšer  mi taḥat  lā  rāqîaʕ  û bên  ha mayim  ʔăšer  mē ʕal  lā  rāqîaʕ  wa yəhî  kēn',
 'wa yiqrā  ʔĕlōhîm  lā  rāqîaʕ  šāmāyim  wa yəhî  ʕereb  wa yəhî  bōqer  yôm  šēnî',
 'wa yōmer  ʔĕlōhîm  yiqqāwû  ha mayim  mi taḥat  ha šāmayim  ʔel  māqôm  ʔeḥād  wə tērāʔê  ha yabbāšā  wa yəhî  kēn',
 'wa yiqrā  ʔĕlōhîm  la  yabbāšā  ʔereṣ  û lə miqwē  ha mayim  qārā  yammîm  wa yar  ʔĕlōhîm  kî  ṭôb',
 'wa yōmer  ʔĕlōhîm  

In [30]:
def get_training_corpus(dataset=None, batch_size=1000):
    """Yield the corpus in consecutive batches for tokenizer training.

    Args:
        dataset: Sequence of text samples to batch. Defaults to the
            notebook-level ``verses`` list when omitted, which is what the
            original zero-argument call did.
        batch_size: Maximum number of samples per yielded batch. The last
            batch may be shorter.

    Yields:
        Slices of ``dataset`` holding at most ``batch_size`` samples each, in
        the original order. An empty ``dataset`` yields nothing.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer. Because this
            is a generator, the error surfaces on first iteration.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if dataset is None:
        # Looked up lazily so the function can be defined before `verses` exists.
        dataset = verses
    for start_idx in range(0, len(dataset), batch_size):
        yield dataset[start_idx : start_idx + batch_size]


# A generator can be consumed only once: call get_training_corpus() again
# whenever the corpus has to be re-read.
training_corpus = get_training_corpus()

In [31]:
# Start from the pretrained roberta-base tokenizer and retrain it on this corpus.
old_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# Build a fresh generator for every run of this cell: the module-level
# `training_corpus` generator is exhausted after one pass, so re-running the cell
# with it would train on an empty corpus.
# vocab_size is the requested vocabulary size for the retrained tokenizer.
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=52000)






In [39]:
# Write the trained tokenizer's files (vocab, merges, config, ...) into the local "BERiT" directory.
tokenizer.save_pretrained("BERiT")

('BERiT/tokenizer_config.json',
 'BERiT/special_tokens_map.json',
 'BERiT/vocab.json',
 'BERiT/merges.txt',
 'BERiT/added_tokens.json',
 'BERiT/tokenizer.json')

In [40]:
# The vocabulary file contains non-ASCII token strings, so decode it explicitly as
# UTF-8 instead of relying on the platform's default encoding.
with open('BERiT/vocab.json', 'r', encoding='utf-8') as vocab_json:
    # Mapping of token string -> integer id.
    tokens = json.load(vocab_json)

In [47]:
# Preview only the 100 lowest-id entries; displaying the full vocabulary floods
# the notebook with tens of thousands of lines.
dict(sorted(tokens.items(), key=lambda item: item[1])[:100])

{'<s>': 0,
 '<pad>': 1,
 '</s>': 2,
 '<unk>': 3,
 '<mask>': 4,
 'a': 5,
 'b': 6,
 'd': 7,
 'e': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'k': 12,
 'l': 13,
 'm': 14,
 'n': 15,
 'o': 16,
 'p': 17,
 'q': 18,
 'r': 19,
 's': 20,
 't': 21,
 'u': 22,
 'w': 23,
 'y': 24,
 'z': 25,
 '¡': 26,
 '£': 27,
 '¥': 28,
 'ª': 29,
 '®': 30,
 '´': 31,
 'µ': 32,
 '¸': 33,
 '¹': 34,
 '»': 35,
 'Ã': 36,
 'Ä': 37,
 'Å': 38,
 'É': 39,
 'Ê': 40,
 'á': 41,
 'Ġ': 42,
 'ģ': 43,
 'ĥ': 44,
 'į': 45,
 'Ĵ': 46,
 'ĵ': 47,
 'Ķ': 48,
 'ķ': 49,
 'Ļ': 50,
 'Ľ': 51,
 'Ŀ': 52,
 'Ń': 53,
 'Äģ': 54,
 'ÉĻ': 55,
 'ĠÊ': 56,
 'ĠÊĶ': 57,
 'Ã®': 58,
 'Å¡': 59,
 'Ġy': 60,
 'Äĵ': 61,
 'Åį': 62,
 'Ã»': 63,
 'Ġh': 64,
 'Ã´': 65,
 'Ġw': 66,
 'Ġb': 67,
 'Ġl': 68,
 'Ãª': 69,
 'Ġm': 70,
 '¸¥': 71,
 'á¸¥': 72,
 'Äĥ': 73,
 'ĠÊķ': 74,
 'á¹': 75,
 'Êķ': 76,
 'ĠwÉĻ': 77,
 'Ġha': 78,
 'Ġk': 79,
 'ĠÊĶe': 80,
 'á¹£': 81,
 'ÊĶ': 82,
 'ĠÅ¡': 83,
 'Ã®m': 84,
 'Äģm': 85,
 'ÅĽ': 86,
 'ĠbÉĻ': 87,
 'er': 88,
 'ĠyÉĻ': 89,
 'ĠÊĶÄĥ': 90,
 'Äģh': 91,
 'Ġn': 92,
 'Ġ

In [48]:
# Encode the second verse (index 1) with the newly trained tokenizer.
encoding = tokenizer(verses[1])

# Display the resulting input_ids and attention_mask.
encoding 

{'input_ids': [0, 136, 103, 258, 42, 948, 42, 4743, 42, 300, 15109, 42, 77, 1862, 42, 146, 42, 259, 42, 4508, 42, 77, 726, 42, 307, 42, 31962, 42, 146, 42, 259, 42, 78, 1245, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Training the Language Model

In [67]:
# Wrap the verse list in a single-column Dataset; the column keeps the name 'Text'
# used by the source DataFrame.
verse_dataset = Dataset.from_dict({'Text': verses})

# NOTE(review): the original cell left both splits blank. A 90/10 train/validation
# split with a fixed seed is a placeholder -- confirm the intended split.
verse_splits = verse_dataset.train_test_split(test_size=0.1, seed=42)

# train_test_split names its held-out part 'test'; expose it under the 'valid' key
# this notebook uses.
raw_datasets = DatasetDict({'train': verse_splits['train'], 'valid': verse_splits['test']})

{'input_ids': [0, 2, 2], 'attention_mask': [1, 1, 1]}

In [None]:
def tokenize(verse):
    """Tokenize verse text with the notebook-level ``tokenizer``.

    Args:
        verse: Either raw text (a string or a list of strings) or a batch
            mapping as passed by ``Dataset.map(..., batched=True)``, in which
            case the verse strings are read from its ``'Text'`` column (the
            column name used by the source DataFrame).

    Returns:
        The tokenizer's encoding for the given text, truncated to the
        tokenizer's maximum length.
    """
    # A batched map hands over a mapping of column name -> list of values, which
    # the tokenizer cannot consume directly; pull out the text column first.
    texts = verse if isinstance(verse, (str, list)) else verse['Text']
    return tokenizer(texts, truncation=True)

In [None]:
# Apply `tokenize` across raw_datasets; batched=True passes a batch of rows per call
# rather than a single row.
tokenized_datasets = raw_datasets.map(tokenize, batched=True)

In [59]:
# Reuse the roberta-base architecture hyperparameters, but size the embedding table
# for the new tokenizer and take the special-token ids from it so that the model
# and tokenizer cannot drift apart.
# The GPT-2 style `n_ctx` argument is dropped: it is not a RoBERTa config field
# (RoBERTa's context length is `max_position_embeddings`).
config = AutoConfig.from_pretrained(
    "roberta-base",
    vocab_size=len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

47460

In [60]:
# Masked-language-model training needs the LM head: the bare RobertaModel encoder
# accepts no `labels` and returns no loss, so Trainer cannot optimise it.
# Building from the config (not from_pretrained) gives randomly initialised weights.
model = RobertaForMaskedLM(config)

AttributeError: 'RobertaTokenizerFast' object has no attribute 'bos_token_ind'

In [None]:
# Only fall back to the EOS token for padding when the tokenizer has no pad token
# of its own; the vocabulary trained above already contains a dedicated '<pad>'.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Dynamically pad each batch and mask 15% of the tokens for the MLM objective.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
# Prompt for Hugging Face Hub credentials inside the notebook.
notebook_login()

In [None]:
# NOTE(review): the original left output_dir blank. "BERiT" matches the directory
# the tokenizer was saved to -- confirm this is where checkpoints should go.
args = TrainingArguments(output_dir="BERiT")

# Train on the tokenized 'train' split and evaluate on the tokenized 'valid' split.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
)

In [None]:
# Run the training loop configured on `trainer`.
trainer.train()