# Guilianelli & Fernandez Exploration
Various model training (GPT/RNN) in EN/FR and executions
* Exploration of pretraining  - models uploaded to HuggingFace
* Exploration of perplexity / epochs
* Exploration of influence of separator between sentences

Results are generated as CSV and explored elsewhere.

**Note**: some cells are blanked out due to the use of API keys, must be turned to code / filled before executing

In [1]:
# Taking modifications to library into account
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os, sys, re 
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import itertools
import logging

from argparse import Namespace
from types import SimpleNamespace

In [3]:
%%capture
!pip install transformers
!pip install datasets
!pip install huggingface_hub

In [4]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers.data.data_collator import default_data_collator
from transformers import TextDataset, DataCollatorForLanguageModeling
from torch.nn.functional import log_softmax

In [5]:
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM

from datasets import load_dataset, Dataset, DatasetDict

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**HuggingFace parameterizing**

In [7]:
%%capture
!sudo apt-get install git-lfs

In [8]:
## FILL
%%bash
git config --global user.email # FILL
#git clone https://USER:KEY@huggingface.co/USER/MODEL_NAME

Cloning into 'gpt2-fr-paco-cheese-finetuned'...
Cloning into 'gpt2-en-maptask-finetuned'...
Cloning into 'dialogpt-maptask-finetuned'...
Cloning into 'dbddv01-gpt2-french-small_space_orfeo-cid-paco-cheese'...


In [9]:
# Save and load models from huggingface
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [73]:
!cd gpt2-fr-paco-cheese-finetuned && git lfs install && git lfs pull
!cd dbddv01-gpt2-french-small_space_orfeo-cid-paco-cheese && git lfs install && git lfs pull

!cd gpt2-en-maptask-finetuned && git lfs install && git lfs pull
!cd dialogpt-maptask-finetuned && git lfs install && git lfs pull

Updated git hooks.
Git LFS initialized.
Updated git hooks.
Git LFS initialized.
Updated git hooks.
Git LFS initialized.
Updated git hooks.
Git LFS initialized.


In [11]:
SEED = 42
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [12]:
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [13]:
from sklearn.model_selection import train_test_split

## Downloading repos from GitHub
* Mine

In [14]:
%%bash
rm -rf multimodal-itmodels
git clone https://APIKEY@github.com/Neako/multimodal-itmodels.git

Cloning into 'multimodal-itmodels'...


In [15]:
UTILS_PATH = "/content/multimodal-itmodels/utils"
sys.path.append(UTILS_PATH)

* Guilliani & Fernandez

In [16]:
!rm -rf uid-dialogue
!git clone https://github.com/dmg-illc/uid-dialogue.git
GF_PATH = "/content/uid-dialogue/conll2021/src"
sys.path.append(GF_PATH)

Cloning into 'uid-dialogue'...
remote: Enumerating objects: 70, done.[K
remote: Counting objects: 100% (70/70), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 70 (delta 24), reused 21 (delta 4), pack-reused 0[K
Unpacking objects: 100% (70/70), done.


## Parameters

In [17]:
from dataloaders import _add_to_text, create_context, create_full_context, create_context_dataset_from_df, create_context_dataset, get_perplexity_encodings, _anytype_context
from entropy_computation import sentence_predict, test_predict_entropy, batch_predict_entropy, results_to_df
from entropy_computation import batch_predict_logits_rnn, batch_predict_logits_lm, compute_perplexity
from entropy_computation import pivot_results_df

In [18]:
TOK_MODELS = {
    'FR': 'dbddv01/gpt2-french-small',
    'EN': 'gpt2',
}
CAUS_MODELS = {'untrained': {
        'FR': ['dbddv01/gpt2-french-small'],
        'EN': ['gpt2','microsoft/DialoGPT-small']
    },
    'trained':{
        'FR': ['gpt2-fr-paco-cheese-finetuned','dbddv01-gpt2-french-small_space_orfeo-cid-paco-cheese'],
        'EN': ['gpt2-en-maptask-finetuned','dialogpt-maptask-finetuned']
    }
}

In [19]:
datasets = {
    'EN': 'maptask', 'FR': 'paco-cheese'
}

original_data_folder = {
    "maptask": "/content/multimodal-itmodels/data/hcrc_maptask/maptask-v2.1.csv", 
    "paco-cheese": "/content/multimodal-itmodels/data/paco-cheese/paco-cheese.csv",
    "cid-paco-cheese": "/content/multimodal-itmodels/data/cid/paco-cheese-cid.csv",
    "cid": "/content/multimodal-itmodels/data/cid/cid-flow-ordered.csv",
    "orfeo": "/content/multimodal-itmodels/data/orfeo/orfeo.csv",
}

In [74]:
LANGUAGE = 'EN'
data_file = datasets[LANGUAGE]
original_data_path = original_data_folder[data_file]

MODE = 'trained'
MODEL_NAME = CAUS_MODELS[MODE][LANGUAGE][0]

In [75]:
BATCH_SIZE = 16
log_2 = torch.log(torch.tensor(2.))


## Data

In [79]:
tokenizer = AutoTokenizer.from_pretrained(TOK_MODELS[LANGUAGE], truncation_side='left') # padding_side='left'

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [80]:
# Adding tokens for padding
tokenizer.pad_token = tokenizer.eos_token
tokenizer.sep_token = tokenizer.eos_token
# Adding speakers
if False: # TODO: clear up issue here
    tokenizer.add_special_tokens(['<f>','<g>'])

In [81]:
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=False)

In [82]:
df = pd.read_csv(original_data_path, keep_default_na=False, na_values=[''])
df.head()

Unnamed: 0,file,speaker,theme_id,transaction_type,move_number,move_type,index,theme_index,duration,text
0,q1ec1,g,1,normal,1,ready,1,1,0.3294,okay
1,q1ec1,g,1,normal,2,instruct,2,2,3.1785,starting off we are above a caravan park
2,q1ec1,f,1,normal,1,acknowledge,3,3,0.3459,mmhmm
3,q1ec1,g,1,normal,3,instruct,4,4,9.7612,we are going to go due south straight south an...
4,q1ec1,f,1,normal,2,check,5,5,1.5487,due south and then back up again


In [83]:
file_col = 'file'
index_col = 'index'
speaker_col = 'speaker'
text_col = 'text'

In [84]:
def get_train_test_files(df:pd.DataFrame, is_full_paco_cheese:bool=False, file_col:str='file'):
    if is_full_paco_cheese:
        files = df[df.has_theme][file_col].unique()
        files_train, files_test = train_test_split(files, random_state=SEED, test_size=0.4)
        files_train = np.concatenate([files_train, df[~df.has_theme].file.unique()])
    else:
        files = df[file_col].unique()
        files_train, files_test = train_test_split(files, random_state=SEED, test_size=0.3)

    print(len(files_train), len(files_test))
    return files_train, files_test

In [85]:
files_train, files_test = get_train_test_files(df, is_full_paco_cheese = (LANGUAGE == 'FR'))

89 39


In [29]:
dataset_c, df2 = create_context_dataset(df, tokenizer, files_train, files_test, sep_token=tokenizer.eos_token, max_length=1024)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [30]:
encodings = get_perplexity_encodings(df[df.file.isin(files_test)], tokenizer)

## Fine tuning gpt2 model with context

In [77]:
%%capture
gptmodel = AutoModelForCausalLM.from_pretrained(CAUS_MODELS['untrained'][LANGUAGE][0])
gptmodel = gptmodel.to(DEVICE)

In [86]:
dataset_c, df2 = create_context_dataset(df, tokenizer, files_train, files_test, sep_token=tokenizer.eos_token, max_length=150)

In [87]:
MODEL_NAME = f"{CAUS_MODELS['untrained'][LANGUAGE][0]}-{data_file}-GF"

In [None]:
x = ' '.join(df.text.tolist()).replace('  ', ' ').split(' ')
sl = 16
n = len(x) // (2*sl)
#sx = [' '.join(x[(i-1)*sl:(i+1)*sl]) for i in range(1,n-1)]
sx = [tokenizer.eos_token.join(x[(i-1)*sl:(i+1)*sl]) for i in range(1,n-1)]
tdf = pd.DataFrame({'text':sx, 'tt': np.random.choice(2,len(sx),p=[0.7, 0.3])})

dataset_t = DatasetDict({
            'train': Dataset.from_pandas(tdf[tdf.tt == 0]),
            'test': Dataset.from_pandas(tdf[tdf.tt == 1])
        }) 
dataset_t = dataset_t.map(lambda x: tokenizer(x['text'], truncation=True, padding=True, max_length=sl), batched=True, batch_size=8)#BATCH_SIZE)

In [93]:
training_args = TrainingArguments(
    output_dir = f"./{MODEL_NAME}",
    overwrite_output_dir=True,
    num_train_epochs = 45,
    #max_steps=400,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_steps=2000,
    save_steps=20000,
    warmup_steps=100,
    prediction_loss_only=False,#True
)
trainer = Trainer(
    model = gptmodel,
    args = training_args,
    data_collator = data_collator,
    train_dataset = dataset_c['train'],
    eval_dataset = dataset_c['test']
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: theme_id, move_type, text_u, index, length, move_number, text, duration, text_input_ids, text_u_full, start_idx, text_input_ids_full, theme_index, __index_level_0__, file, speaker, transaction_type. If theme_id, move_type, text_u, index, length, move_number, text, duration, text_input_ids, text_u_full, start_idx, text_input_ids_full, theme_index, __index_level_0__, file, speaker, transaction_type are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 19151
  Num Epochs = 45
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 53865


Step,Training Loss
500,3.1502
1000,2.63
1500,2.4917
2000,2.3931
2500,2.3066
3000,2.218
3500,2.1427
4000,2.0644
4500,1.9937
5000,1.9265


In [None]:
trainer.save_model()

In [None]:
# https://huggingface.co/docs/transformers/model_sharing
trainer.push_to_hub(MODEL_NAME)

## Using model to get predictions

In [None]:
%%capture
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.to(DEVICE)

In [72]:
md_name = MODEL_NAME.lower().replace('/','-')

Final output - same parametrisation as GF

In [33]:
test_dataframe = df[df.file.isin(files_test)]

dataset_c.set_format(type='torch', columns=['input_ids', 'start_idx', 'attention_mask'])
test_dataloader = DataLoader(dataset_c['test'], collate_fn=data_collator, batch_size=1, worker_init_fn=SEED)
sent_avg_logp, tokens_logp, sent_length, sentence_tokens = test_predict_entropy(model, test_dataloader, tokenizer, DEVICE, batch_predict_logits_lm)
test_dataframe = results_to_df(test_dataframe, sent_avg_logp, tokens_logp, sent_length, 
                                out_file_name = f'{md_name}-{data_file}-fin', sentence_tokens = sentence_tokens)

Iteration: 100%|██████████| 3816/3816 [07:09<00:00,  8.89it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['normalised_h'] = sent_avg_logp
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['length'] = sent_length
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['tokens_h'] = tokens_logp
A value is trying to be set on

Experiments with fixed context length:

In [None]:
test_dataframe = df[df.file.isin(files_test)]#.apply(lambda x: x.strip().replace('  ',' ')) 
sep_token = tokenizer.eos_token
sep_tk = 'space' if sep_token == ' ' else 'eos'

for i in range(0,8):
    print(f'\n------------- CONTEXT {i} -------------')
    if i == 0:
        df['context'] = _add_to_text(df, file_col = file_col)
    else:
        df['context'] = create_context(df, context_len = i, file_col = file_col, sep_token=sep_token) # from parameters at top

    # Compute
    dataset_c = create_context_dataset_from_df(df, tokenizer, f"context", files_train, files_test, batch_size=BATCH_SIZE, max_length=1024)
    dataset_c.set_format(type='torch', columns=['input_ids', 'start_idx', 'attention_mask', '__index_level_0__'])
    test_dataloader = DataLoader(dataset_c['test'], 
                                collate_fn=data_collator, batch_size=BATCH_SIZE, worker_init_fn=SEED)
    sent_avg_logp, tokens_logp, sent_length, sentence_tokens = test_predict_entropy(model, test_dataloader, 
                                                                   tokenizer, DEVICE, batch_predict_logits_lm)
    test_dataframe = results_to_df(test_dataframe, sent_avg_logp, tokens_logp, sent_length, 
                                   out_file_name = None, sentence_tokens = sentence_tokens, column_post=f'_{sep_tk}_{i}')

In [None]:
test_dataframe.to_csv(f"{MODEL_NAME.replace('/','-')}-{data_file}-{sep_tk}-c0-8.csv",index=False)

In [36]:
test_dataframe.head(2)

Unnamed: 0,corpus,file,dyad,index,speaker,start,stop,text,theme,theme_role,...,normalised_h_space_6,length_space_6,tokens_h_space_6,sum_h_space_6,xu_h_space_6,normalised_h_space_7,length_space_7,tokens_h_space_7,sum_h_space_7,xu_h_space_7
0,cheese,Cheese-AA_OR,AAOR,0,AA,4.54,4.83993,tu as,hetero selection,g,...,10.182384,1,[-10.182384490966797],10.182384,1.448603,10.182384,1,[-10.182384490966797],10.182384,1.446912
1,cheese,Cheese-AA_OR,AAOR,1,OR,5.14,5.82492,mh ouais si tu veux,hetero selection,f,...,4.147297,7,"[-10.792579650878906, -0.26334628462791443, -4...",29.03108,0.848037,4.147297,7,"[-10.792580604553223, -0.2633461654186249, -4....",29.031081,0.849066


In [None]:
pivot_results_df(test_dataframe, post_patterns=[f'_{sep_tk}_{i}' for i in range(0,8)]).to_csv(
    f'{MODEL_NAME.replace("/","-")}-{data_file}-{sep_tk}-c0-8-p.csv',index=False)

Comparison with full context (throw in batch sizes):

In [38]:
param_conf = [
    #{'sep_token':' ', 'sep_context_sent':False, 'add_ipu_speaker_tokens':False, 'add_speaker_tokens':False},
    #{'sep_token':tokenizer.eos_token, 'sep_context_sent':False, 'add_ipu_speaker_tokens':False, 'add_speaker_tokens':False},
    {'sep_token':tokenizer.sep_token, 'sep_context_sent':True, 'add_ipu_speaker_tokens':False, 'add_speaker_tokens':False},
    #{'sep_token':' ', 'sep_context_sent':False, 'add_ipu_speaker_tokens':True, 'add_speaker_tokens':False},
    #{'sep_token':' ', 'sep_context_sent':True, 'add_ipu_speaker_tokens':False, 'add_speaker_tokens':True},
]

In [39]:
#test_dataframe = df[df.file.isin(files_test)]
#
#ptest_dataframe = []
#for param in param_conf:
#    col = f"context_{''.join([x if isinstance(x,str) else str(int(x)) for x in param.values()])}"
#    print(f'\n------------- CONCAT PARAM {col.upper()} -------------')
#    df["context"] = create_full_context(df, file_col=file_col, index_col = index_col, **param) 
#
#    for bs in [BATCH_SIZE]: # could test 1 also
#        dataset_c = create_context_dataloader(df, "context", files_train, files_test, batch_size=bs, max_length=1024)
#        dataset_c.set_format(type='torch', columns=['input_ids', 'start_idx', 'attention_mask', '__index_level_0__'])
#        test_dataloader = DataLoader(dataset_c['test'], 
#                                    collate_fn=data_collator, batch_size=bs, worker_init_fn=SEED)
#        sent_avg_logp, tokens_logp, sent_length, sentence_tokens = test_predict_entropy(model, test_dataloader, 
#                                                                        tokenizer, DEVICE, batch_predict_logits_lm)
#        test_dataframe = results_to_df(test_dataframe, sent_avg_logp, tokens_logp, sent_length, 
#                                        out_file_name = None, sentence_tokens = sentence_tokens)
#        test_dataframe['model'] = f'{md_name}-{col}—bs{bs}'
#        ptest_dataframe.append(test_dataframe.copy())
#
#pd.concat(ptest_dataframe, axis=0, ignore_index=True).to_csv(f'{md_name}-{data_file}-cf.csv',index=False)

In [None]:
test_dataframe = df[df.file.isin(files_test)]

ptest_dataframe = []
for sep_token in [' ', tokenizer.eos_token]:
    col = f"context_full_sep_{'space' if sep_token == ' ' else 'eos'}"
    print(f'\n------------- PARAM {col.upper()} -------------')

    dataset_c, df2 = create_context_dataset(df, tokenizer, files_train, files_test, sep_token=sep_token, max_length=1024)
    dataset_c.set_format(type='torch', columns=['input_ids', 'start_idx', 'attention_mask'])
    test_dataloader = DataLoader(dataset_c['test'], collate_fn=data_collator, batch_size=1, worker_init_fn=SEED)
    sent_avg_logp, tokens_logp, sent_length, sentence_tokens = test_predict_entropy(model, test_dataloader, tokenizer, DEVICE, batch_predict_logits_lm)
    test_dataframe = results_to_df(test_dataframe, sent_avg_logp, tokens_logp, sent_length, sentence_tokens = sentence_tokens, out_file_name = None)

    test_dataframe['model'] = f'{md_name}-{col}'
    ptest_dataframe.append(test_dataframe.copy())

pd.concat(ptest_dataframe, axis=0, ignore_index=True).to_csv(f'{md_name}-{data_file}-sep.csv',index=False)


------------- PARAM CONTEXT_FULL_SEP_SPACE -------------


Iteration: 100%|██████████| 3816/3816 [07:11<00:00,  8.85it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['normalised_h'] = sent_avg_logp
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['length'] = sent_length
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['tokens_h'] = tokens_logp
A value is trying to be set on


------------- PARAM CONTEXT_FULL_SEP_EOS -------------


Iteration:  87%|████████▋ | 3330/3816 [06:17<01:00,  7.97it/s]

In [None]:
pd.concat(ptest_dataframe, axis=0, ignore_index=True).head(2)

## Same with RNN

### Model/Train definition

In [None]:
class GRUModel(nn.Module):
    def __init__(self, tokenizer, embed_size=128, hidden_size=128):
        super().__init__()
        self.embed = nn.Embedding(tokenizer.vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers=1, bidirectional=False, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.decision = nn.Linear(hidden_size * 1 * 1, tokenizer.vocab_size)
        
    def forward(self, x, return_dict:bool=False):
        embed = self.embed(x)
        output, hidden = self.rnn(embed)
        drop = self.dropout(output) # utilisation de hidden ou output selon le besoin - 
        logits = self.decision(drop)#.transpose(0, 1).contiguous().view(x.size(0), -1))
        if not return_dict:
            return logits
        else:
            return {'logits': logits}

In [54]:
def perf(model, eval_dataloader, criterion = nn.CrossEntropyLoss(reduction='sum')):
    model.eval()
    val_loss = 0
    for batch, data in tqdm(enumerate(eval_dataloader)):
        x = data['input_ids'][:,:-1].to(DEVICE)
        y = data['input_ids'][:,1:].to(DEVICE)
        with torch.no_grad():
            y_pred = model(x)
            loss = criterion(y_pred.transpose(1, 2), y)
            val_loss += loss.item()
    return val_loss / np.prod(x.shape)

def train(model, train_dataloader, eval_dataloader,
          max_epoch:int=10, start_epoch:int=0, save_every:int=20,
          save_path:str=None,
          optimizer=None, train_criterion=nn.CrossEntropyLoss(), eval_criterion=nn.CrossEntropyLoss(reduction='sum')):
    if optimizer is None:
        optimizer = optim.Adam(model.parameters())#, lr=0.001)
    if (save_path is not None) and not os.path.exists(save_path):
        os.makedirs(save_path)


    for epoch in range(start_epoch, max_epoch):
        print(f"epoch {epoch}", end=" ")
        model.train()
        train_loss = 0
        for batch, data in tqdm(enumerate(train_dataloader)):
            x = data['input_ids'][:,:-1].to(DEVICE)
            y = data['input_ids'][:,1:].to(DEVICE)

            optimizer.zero_grad()
            y_pred = model(x)
            loss = train_criterion(y_pred.transpose(1, 2), y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = perf(model, eval_dataloader, eval_criterion)
        val_ppl = torch.exp(torch.Tensor([val_loss])).item()

        print({ 'epoch': epoch, 'train_loss': train_loss, 'test_loss': val_loss, 'test_ppl': val_ppl })

        if (save_path is not None) & ((epoch+1) % save_every == 0):
            # save current model
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                }, os.path.join(save_path, f'model_epoch_{epoch+1}.pt'))

In [None]:
def checkpoint_load(path:str, model, optimizer=None):
    """
    Input:
    -----------
    path: str
        path to the .pt checkpoint
    
    model: loaded model class with arguments
    optimizer: loaded optimizer class with arguments
    """
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    return model, optimizer, epoch, loss

### Training model on wikipedia, to then finetune on paco-cheese
* HuggingFace page: https://huggingface.co/datasets/wikipedia
* Models of interest here: `20220301.fr`, `20220301.en`
* How to take just a fraction: https://huggingface.co/docs/datasets/v1.11.0/splits.html

In [None]:
wikids = load_dataset("wikipedia", "20220301.fr", split=['train[:1%]']) # only contains train?
wikids = wikids[0]

In [48]:
tok_cont_kwargs = {'truncation':True, 'padding':'max_length', 'max_length':1024}
wikids = wikids.map(lambda x: tokenizer(x['text'], **tok_cont_kwargs), batched=True, batch_size=512)

  0%|          | 0/47 [00:00<?, ?ba/s]

In [49]:
rnnmodel = GRUModel(tokenizer)
rnnmodel = rnnmodel.to(DEVICE)

In [52]:
wikids.set_format(type='torch', columns=['input_ids','attention_mask'])
train_dataloader = DataLoader(wikids, collate_fn=data_collator, batch_size=1, worker_init_fn=SEED)

In [None]:
# take subset for tests
#type(wikids[:16]) # type: dict
#tmp = torch.utils.data.Subset(wikids, list(range(20)))

In [55]:
train(rnnmodel, train_dataloader, test_dataloader, max_epoch=1, save_every=1, save_path=f'./rnnmodels-fr-wiki1pt')

epoch 0 

24021it [52:44,  7.59it/s]
477it [00:29, 16.41it/s]


{'epoch': 0, 'train_loss': 97112.36849631369, 'test_loss': 3175.855364592179, 'test_ppl': inf}


In [56]:
torch.save(rnnmodel.state_dict(), './rnnmodels-wikip-epoch1.pt')
#!cp /content/drive/MyDrive/Colab\ Notebooks/rnnmodels-wikip-epoch2.pt .

### Fine Tune on MapTask

In [57]:
dataset_c.set_format(type='torch', columns=['input_ids','attention_mask'])
train_dataloader = DataLoader(dataset_c['train'], collate_fn=data_collator, batch_size=8, worker_init_fn=SEED)
test_dataloader = DataLoader(dataset_c['test'], collate_fn=data_collator, batch_size=8, worker_init_fn=SEED)

In [61]:
train(rnnmodel, train_dataloader, test_dataloader, max_epoch=4, save_every=1, save_path=f'./rnnmodels-ft-fr')

epoch 0 

1355it [02:32,  8.87it/s]
477it [00:29, 16.24it/s]


{'epoch': 0, 'train_loss': 5000.340625166893, 'test_loss': 1641.1240767603335, 'test_ppl': inf}
epoch 1 

1355it [02:32,  8.90it/s]
477it [00:29, 16.24it/s]


{'epoch': 1, 'train_loss': 4107.413211464882, 'test_loss': 1594.180188966834, 'test_ppl': inf}
epoch 2 

1355it [02:31,  8.93it/s]
477it [00:29, 16.39it/s]


{'epoch': 2, 'train_loss': 3781.5790241360664, 'test_loss': 1592.6714351073556, 'test_ppl': inf}
epoch 3 

1355it [02:31,  8.94it/s]
477it [00:29, 16.40it/s]


{'epoch': 3, 'train_loss': 3549.2667949199677, 'test_loss': 1603.6423958902774, 'test_ppl': inf}


In [None]:
torch.save(rnnmodel.state_dict(), './rnnmodels-ft-maptask-epoch10.pt')

In [None]:
!cp rnnmodels-ft-maptask-epoch10.pt /content/drive/MyDrive/Colab\ Notebooks/
#!cp rnnmodels-wikip-epoch2.pt /content/drive/MyDrive/Colab\ Notebooks/

In [None]:
!cp ./rnnmodels-ft-maptask/model_epoch_2.pt /content/drive/MyDrive/Colab\ Notebooks/rnnmodels-ft-maptask-epoch2.pt

tensor(10.8308, device='cuda:0')

### Tests on paco-cheese
Basic prediction with trained model

In [65]:
model = GRUModel(tokenizer)
model = model.to(DEVICE)
model, _, epoch, _ = checkpoint_load("/content/rnnmodels-ft-fr/model_epoch_2.pt", model)

In [66]:
test_dataframe = df[df.file.isin(files_test)]
md_name = "rnn_ft_pc2"

dataset_c.set_format(type='torch', columns=['input_ids', 'start_idx', 'attention_mask'])
test_dataloader = DataLoader(dataset_c['test'], collate_fn=data_collator, batch_size=1, worker_init_fn=SEED)
sent_avg_logp, tokens_logp, sent_length, sentence_tokens = test_predict_entropy(model, test_dataloader, tokenizer, DEVICE, batch_predict_logits_rnn)
test_dataframe = results_to_df(test_dataframe, sent_avg_logp, tokens_logp, sent_length, 
                                out_file_name = f'{md_name}-{data_file}', sentence_tokens = sentence_tokens)

Iteration: 100%|██████████| 3816/3816 [00:31<00:00, 122.38it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['normalised_h'] = sent_avg_logp
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['length'] = sent_length
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['tokens_h'] = tokens_logp
A value is trying to be set o

Experiments with context

In [None]:
test_dataframe = df[df.file.isin(files_test)]


for i in range(0,8):
    print(f'\n------------- CONTEXT {i} SENTENCES -------------')
    dataset_c = create_context_dataloader(df, ctx_len=i)
    dataset_c.set_format(type='torch', columns=['input_ids', 'start_idx', 'attention_mask', '__index_level_0__'])
    train_dataloader = DataLoader(dataset_c['train'], collate_fn=data_collator, batch_size=BATCH_SIZE, 
                                shuffle=True,  worker_init_fn=SEED)
    test_dataloader = DataLoader(dataset_c['test'], 
                                collate_fn=data_collator, batch_size=BATCH_SIZE, worker_init_fn=SEED)
    # Train
    rnnmodel = GRUModel(tokenizer)
    rnnmodel = rnnmodel.to(DEVICE)
    if not os.path.exists(f'./rnnmodels-c{i}/model_epoch_3.pt'):
        train(rnnmodel, train_dataloader, test_dataloader, max_epoch=3, save_every=1, save_path=f'./rnnmodels-c{i}')
    else:
        print('-- Loading model from checkpoint --')
        checkpoint = torch.load(f'./rnnmodels-c{i}/model_epoch_3.pt')
        rnnmodel.load_state_dict(checkpoint['model_state_dict'])
    # Test
    sent_avg_logp, tokens_logp, sent_length, sentence_tokens = test_predict_entropy(rnnmodel, test_dataloader, 
                                                                   tokenizer, DEVICE, batch_predict_logits_rnn)
    # Save
    print(f'\nChecking shapes: {test_dataframe.shape}, {len(sent_avg_logp)}')
    test_dataframe = results_to_df(test_dataframe, sent_avg_logp, tokens_logp, sent_length, 
                                   out_file_name = None, sentence_tokens = sentence_tokens, column_post=f'rnn-train-c{i}')

test_dataframe.to_csv(f'rnn-traintest-{data_file}-c0-8.csv',index=False)

In [None]:
test_dataframe.head()

In [None]:
pivot_results_df(test_dataframe, post_patterns=[f'rnn-train-c{i}' for i in range(0,8)]).to_csv(f'rnn-traintest-{data_file}c0-8-p.csv',index=False)

Same but with only the last model

In [67]:
test_dataframe = df[df.file.isin(files_test)]#.apply(lambda x: x.strip().replace('  ',' ')) 

for i in range(0,8):
    print(f'\n------------- CONTEXT {i} -------------')
    if i == 0:
        df['context'] = _add_to_text(df, file_col = file_col)
    else:
        df['context'] = create_context(df, context_len = i, file_col = file_col) # from parameters at top

    # Compute
    dataset_c = create_context_dataset_from_df(df, tokenizer, f"context", files_train, files_test, batch_size=BATCH_SIZE, max_length=1024)
    dataset_c.set_format(type='torch', columns=['input_ids', 'start_idx', 'attention_mask', '__index_level_0__'])
    test_dataloader = DataLoader(dataset_c['test'], 
                                collate_fn=data_collator, batch_size=BATCH_SIZE, worker_init_fn=SEED)
    sent_avg_logp, tokens_logp, sent_length, sentence_tokens = test_predict_entropy(model, test_dataloader, 
                                                                   tokenizer, DEVICE, batch_predict_logits_rnn)
    test_dataframe = results_to_df(test_dataframe, sent_avg_logp, tokens_logp, sent_length, 
                                   out_file_name = None, sentence_tokens = sentence_tokens, column_post=f'_{i}')


------------- CONTEXT 0 -------------


  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  batch_avg_logp.append(- sentence_logp.sum()/sentence_logp.shape[0])
Iteration: 100%|██████████| 477/477 [00:15<00:00, 29.91it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['normalised_h'] = sent_avg_logp
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['length'] = sent_length
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



------------- CONTEXT 1 -------------


  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

Iteration: 100%|██████████| 477/477 [00:17<00:00, 26.90it/s]



------------- CONTEXT 2 -------------


  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

Iteration: 100%|██████████| 477/477 [00:18<00:00, 25.81it/s]



------------- CONTEXT 3 -------------


  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

Iteration: 100%|██████████| 477/477 [00:19<00:00, 24.08it/s]



------------- CONTEXT 4 -------------


  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

Iteration: 100%|██████████| 477/477 [00:20<00:00, 23.65it/s]



------------- CONTEXT 5 -------------


  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

Iteration: 100%|██████████| 477/477 [00:21<00:00, 22.63it/s]



------------- CONTEXT 6 -------------


  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

Iteration: 100%|██████████| 477/477 [00:21<00:00, 22.64it/s]



------------- CONTEXT 7 -------------


  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/1355 [00:00<?, ?ba/s]

  0%|          | 0/477 [00:00<?, ?ba/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

  0%|          | 0/10834 [00:00<?, ?ex/s]

  0%|          | 0/3816 [00:00<?, ?ex/s]

Iteration: 100%|██████████| 477/477 [00:21<00:00, 21.80it/s]


In [68]:
test_dataframe.to_csv(f'rnn_ft_pc2-{data_file}-c0-8.csv',index=False)

In [69]:
pivot_results_df(test_dataframe, post_patterns=[f'_{i}' for i in range(0,8)]).to_csv(f'rnn_ft_pc2-{data_file}-c0-8-p.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['model'] = pat


### Perplexity
Nope

In [None]:
#rnnmodel.load_state_dict(torch.load('rnnmodels-wikip-epoch2.pt'))
rnnmodel.load_state_dict(torch.load('rnnmodels-ft-maptask-epoch3.pt'))

<All keys matched successfully>

In [None]:
encodings = get_perplexity_encodings(df, tokenizer, sep_token=' ', files_test=files_test)

In [59]:
ppl = compute_perplexity(rnnmodel, encodings, DEVICE, stride=16, max_length=1024, model_is_lm=False)
ppl

100%|██████████| 2187/2187 [02:48<00:00, 13.00it/s]


In [None]:
torch.cuda.empty_cache()

In [64]:
model = GRUModel(tokenizer)
model = model.to(DEVICE)

for path in sorted([os.path.join('/content/rnnmodels-ft-fr',x) for x in os.listdir('/content/rnnmodels-ft-fr') if '.pt' in x]):
    model, _, epoch, _ = checkpoint_load(path, model)
    ppl = compute_perplexity(model, encodings, DEVICE, stride=16, max_length=1024, model_is_lm=False)
    print(ppl, path)

100%|██████████| 2187/2187 [02:47<00:00, 13.05it/s]


88.20404815673828 /content/rnnmodels-ft-fr/model_epoch_1.pt


100%|██████████| 2187/2187 [02:47<00:00, 13.06it/s]


83.16426849365234 /content/rnnmodels-ft-fr/model_epoch_2.pt


100%|██████████| 2187/2187 [02:47<00:00, 13.03it/s]


85.24154663085938 /content/rnnmodels-ft-fr/model_epoch_3.pt


100%|██████████| 2187/2187 [02:48<00:00, 13.00it/s]

90.45860290527344 /content/rnnmodels-ft-fr/model_epoch_4.pt





## RNN 3-gram

### pretraining wikipedia

In [None]:
wiki_data = {
    'FR': '20220301.fr', 'EN': "20220301.en"
}

In [None]:
LANGUAGE = 'EN'

In [None]:
wikids = load_dataset("wikipedia", wiki_data[LANGUAGE], split=['train[:1%]']) # only contains train?
wikids = wikids[0]

Downloading and preparing dataset wikipedia/20220301.en (download: 19.18 GiB, generated: 18.88 GiB, post-processed: Unknown size, total: 38.07 GiB) to /root/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559...


Downloading:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/20.3G [00:00<?, ?B/s]

Dataset wikipedia downloaded and prepared to /root/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
df_wikipedia = pd.DataFrame(wikids)

In [None]:
df_wikipedia.head()

Unnamed: 0,id,url,title,text
0,12,https://en.wikipedia.org/wiki/Anarchism,Anarchism,Anarchism is a political philosophy and moveme...
1,25,https://en.wikipedia.org/wiki/Autism,Autism,Autism is a neurodevelopmental disorder charac...
2,39,https://en.wikipedia.org/wiki/Albedo,Albedo,Albedo (; ) is the measure of the diffuse refl...
3,290,https://en.wikipedia.org/wiki/A,A,"A, or a, is the first letter and the first vow..."
4,303,https://en.wikipedia.org/wiki/Alabama,Alabama,Alabama () is a state in the Southeastern regi...


In [None]:
wiki_sent = df_wikipedia.text.apply(lambda x: re.split('(\n|\.|\.\.\.)',x)).explode()
print(wiki_sent.shape)
wiki_sent.head()

(30399551,)


0    Anarchism is a political philosophy and moveme...
0                                                    .
0     Anarchism calls for the abolition of the stat...
0                                                    .
0     As a historically left-wing movement, placed ...
Name: text, dtype: object

In [None]:
import string
pct = string.punctuation.replace("'","")

In [None]:
# https://datagy.io/python-remove-punctuation-from-string/
#df_wikipedia.text = df_wikipedia.text.apply(lambda x: re.sub(r'[^\w\s]', '', x.replace('\n',' ')).replace('  ',' ').strip()) # also removes apostrophe
df_wikipedia.text = df_wikipedia.text.apply(lambda x: x.replace('\n',' ').replace('  ',' ').strip().translate(str.maketrans('', '', pct)))
df_wikipedia.head()

Unnamed: 0,id,url,title,text
0,3,https://fr.wikipedia.org/wiki/Antoine%20Meillet,Antoine Meillet,Paul Jules Antoine Meillet né le à Moulins All...
1,7,https://fr.wikipedia.org/wiki/Alg%C3%A8bre%20l...,Algèbre linéaire,L’algèbre linéaire est la branche des mathémat...
2,9,https://fr.wikipedia.org/wiki/Alg%C3%A8bre%20g...,Algèbre générale,L'algèbre générale ou algèbre abstraite est la...
3,10,https://fr.wikipedia.org/wiki/Algorithmique,Algorithmique,Lalgorithmique est l'étude et la production de...
4,11,https://fr.wikipedia.org/wiki/Politique%20en%2...,Politique en Argentine,L'Argentine est une république présidentielle ...


In [None]:
wiki_sent = wiki_sent.apply(lambda x: x.replace('\n',' ').replace('  ',' ').strip().translate(str.maketrans('', '', pct)))

In [None]:
re.split('(\n|\.|\.\.\.)', "\n\nBiographie \nD'origine bourbonnaise, fils")

['', '\n', '', '\n', 'Biographie ', '\n', "D'origine bourbonnaise, fils"]

In [None]:
wiki_sent = wiki_sent[wiki_sent != '']
print(wiki_sent.shape)
wiki_sent.to_csv(f'wiki_{LANGUAGE}_1pt.txt', index=False, header=False)

(10795891,)


In [None]:
nex = 40
df_wikipedia_mini = df_wikipedia.iloc[np.random.choice(range(df_wikipedia.shape[0]), nex, replace=False)]
df_wikipedia_mini.text = df_wikipedia_mini.text.apply(lambda x: ' '.join(x.split(' ')[500:1000]))
df_wikipedia_mini.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,id,url,title,text
2009,3470,https://fr.wikipedia.org/wiki/1921,1921,le « cabinet honnête » et annule les réformes ...
5846,10656,https://fr.wikipedia.org/wiki/L%C3%A9on%20Vachet,Léon Vachet,
12733,23699,https://fr.wikipedia.org/wiki/Liste%20des%20ca...,Liste des cantons de la Haute-Saône,cantons du département de la HauteSaône par ar...
23526,50205,https://fr.wikipedia.org/wiki/Jean-Claude%20Br...,Jean-Claude Brialy,de Claude Chabrol qui lui apportent la célébri...
12759,23753,https://fr.wikipedia.org/wiki/Liste%20d%27assa...,Liste d'assassins notoires,1763 Amedy Coulibaly assassin d'une policière ...


In [None]:
def line_to_ngram(df:pd.DataFrame, n:int=3, text_col:str='text', file_col:str="file"):
    df['split_text'] = df[text_col].apply(lambda x: x.split())
    df_ngram = df.explode('split_text')
    print('Tokenizing...')
    #df_ngram['text_input_ids'] = df_ngram['split_text'].apply(lambda x: tokenizer(x, truncation=True, padding=False)['input_ids'])
    df_ngram['text_input_ids'] = np.vstack(map(lambda x: tokenizer(x, truncation=True, padding=False)['input_ids'], df_ngram['split_text']))
    print('Merging...')
    prev_sentences = _anytype_context(df_ngram, context_len=n, text_col='text_input_ids', file_col=file_col)
    prev_sentences.fillna([], inplace=True)
    # columns are (normally) ordered to be joined correctly
    df_ngram['context_input_ids'] = prev_sentences.apply(lambda x: list(itertools.chain(*list(x))), axis=1)
    print('To dataset.')
    df_ngram['length'] = df_ngram['text_input_ids'].apply(len)
    df_ngram['ct_length'] = df_ngram['context_input_ids'].apply(len)
    df_ngram['start_idx'] = df_ngram.ct_length - df_ngram.length

    dataset_3 = DatasetDict({
        'train': Dataset.from_pandas(df_ngram[df_ngram.file.isin(files_train)]),
        'test': Dataset.from_pandas(df_ngram[df_ngram.file.isin(files_test)])
    })
    dataset_3.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    return dataset_3, df_ngram

In [None]:
import time

In [None]:
start_time = time.time()
dataset_3, df_ngram = line_to_ngram(df_wikipedia, n=3, file_col='url')
print(f'number of seconds: {time.time() - start_time}')

In [None]:
train_dataloader = DataLoader(dataset_3['train'], collate_fn=data_collator, batch_size=8, 
                                shuffle=True,  worker_init_fn=SEED)
test_dataloader = DataLoader(dataset_3['test'], collate_fn=data_collator, batch_size=8, worker_init_fn=SEED)

rnnmodel = GRUModel(tokenizer)
rnnmodel = rnnmodel.to(DEVICE)

In [None]:
train(rnnmodel, train_dataloader, test_dataloader, max_epoch=2, save_every=1, save_path='./rnnmodels-wiki-3')

In [None]:
torch.save(rnnmodel.state_dict(), './rnnmodels-3/rnn-model-3-2.pt')