<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/Translate_GPTj4all_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install -q transformers datasets sentencePiece ftfy

In [2]:
import os
import re
import ftfy
import torch
import spacy
import pickle
import collections
import warnings
warnings.filterwarnings("ignore")

from ftfy import fix_encoding
from tqdm.auto import tqdm
from spacy.lang.en import English

from datasets import load_dataset, Dataset
from transformers import MarianMTModel, MarianTokenizer

# ---------------------------------------------------------------------------------------

# Select the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    dev = "cuda:0"
    # The GPU name is only queried on this branch: get_device_name() raises on a
    # CPU-only host, which previously made the CPU fallback below unreachable.
    print(dev, torch.cuda.get_device_name(0))
else:
    dev = "cpu"
    print(dev)
device = torch.device(dev)


# Model
# Pretrained Marian checkpoint id used for the English -> Portuguese translation
# performed by translate() further down.
model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
# Tokenizer and model are loaded from the same checkpoint id; translate() moves the
# model to `device` before generating.
marian_tokenizer = MarianTokenizer.from_pretrained(model_name)
marian_model = MarianMTModel.from_pretrained(model_name)

cuda:0 Tesla T4


In [3]:
# Load the `nomic-ai/gpt4all-j-prompt-generations` dataset (the English source data).
data_en = load_dataset('nomic-ai/gpt4all-j-prompt-generations')
# Bare expression: display the splits, column names and row count.
data_en



  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 808812
    })
})

In [4]:
# Peek at the first raw prompt. Index the row first, then the column: this reads a
# single example instead of pulling the whole 'prompt' column just to show one value.
data_en['train'][0]['prompt']

'<p>Good morning</p>\n\n<p>I have a Wpf datagrid that is displaying an observable collection of a custom type</p>\n\n<p>I group the data using a collection view source in XAML on two seperate properties, and I have styled the groups to display as expanders.</p>\n\n<p>For clarity, as there is a lot of data I feel I have to use margins and spacing otherwise things look very cluttered.</p>\n\n<p>My problem is that with two levels of hierarchical expanders the column data is now substantially offset from the column headers meaning that they do not properly line up.</p>\n\n<p>I have tried several thing, like setting the margin of the column headers and the width (both actual and normal). However all of my attempts end up resizing the whole column so that the offset stays the same but the columns move.</p>\n\n<p>so my question:</p>\n\n<p><strong>How can I change the visible width or offset of a column header to ensure that the headers line up with the data</strong></p>\n\n<ul>\n<li>Visual St

In [5]:
# Clean HTML tags
def clean_html(text):
    """
    Strip HTML-like tags from the 'prompt' and 'response' fields of one example.

    Args:
      text: Mapping for a single example; must contain string values under
        'prompt' and 'response'. Any other keys are carried over unchanged.

    Returns:
      A new dict with the same keys, where every '<...>' span has been removed
      from 'prompt' and 'response'. The input mapping is left untouched.

    Note:
      The pattern removes any '<...>' span that contains no other '<', so
      non-tag text such as `List<String>` loses its '<String>' part as well.
    """
    # Copy first so the caller's example is not modified as a side effect.
    cleaned = dict(text)
    for field in ('prompt', 'response'):
        cleaned[field] = re.sub('<[^<]+?>', '', text[field])
    return cleaned

# Apply clean_html to every example, one example per call (batched=False),
# requesting as many worker processes as os.cpu_count() reports.
data_en = data_en.map(clean_html, batched=False, num_proc=os.cpu_count())
# Bare expression: display the cleaned dataset summary.
data_en



DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 808812
    })
})

In [6]:
# Same example after tag stripping. Row-first indexing reads one example rather
# than the whole 'prompt' column.
data_en['train'][0]['prompt']

'Good morning\n\nI have a Wpf datagrid that is displaying an observable collection of a custom type\n\nI group the data using a collection view source in XAML on two seperate properties, and I have styled the groups to display as expanders.\n\nFor clarity, as there is a lot of data I feel I have to use margins and spacing otherwise things look very cluttered.\n\nMy problem is that with two levels of hierarchical expanders the column data is now substantially offset from the column headers meaning that they do not properly line up.\n\nI have tried several thing, like setting the margin of the column headers and the width (both actual and normal). However all of my attempts end up resizing the whole column so that the offset stays the same but the columns move.\n\nso my question:\n\nHow can I change the visible width or offset of a column header to ensure that the headers line up with the data\n\n\nVisual Studio 2012\nWpf\nC#\nDataGrid\n\n\nEDIT This is what I mean\n\n\n\nEDIT 2 - MY Xam

In [7]:
# Count how many training examples come from each value of the 'source' column.
collections.Counter(data_en['train']['source'])

Counter({'pacovaldez/stackoverflow-questions': 256231,
         '': 17806,
         'unified_multi_sum': 57352,
         'nomic-ai': 203630,
         'unified_abstract_infill_output-100-000-x.jsonl': 26057,
         'unified_abstract_infill_output_0-100_000.jsonl': 26505,
         'output_unified_unifiedskg.jsonl': 19987,
         'unified_unifiedskg_instructions': 61154,
         'unified_hc3_human': 23145,
         'laion/unified_chip2': 61969,
         'unified_chip2': 54976})

In [8]:
# Blank English pipeline with only the 'sentencizer' component added; it is used
# purely to split text into sentences.
nlp = English()
nlp.add_pipe('sentencizer')
def chunkstring_spacy(text):
    """
    Split text into sentences and prefix each one for translation.

    Args:
      text: Text to be translated. Non-string values are converted with str();
        None yields an empty list.

    Returns:
      A list with one string per non-blank sentence, each prefixed with the
      '>>pt_br<< ' target-language token. Empty when there is nothing to translate.
    """
    if text is None:
        return []

    # The former `text.replace('', '')` was a no-op and has been dropped; str() is
    # now applied to the value itself so non-string inputs are actually coerced.
    doc = nlp(str(text))

    # Whitespace-only sentences carry nothing to translate, so they are skipped
    # instead of being sent to the model as a bare language token.
    return ['>>pt_br<<' + ' ' + sent.text for sent in doc.sents if sent.text.strip()]
    
def translate(aux_sent, batch_size=32):
    """
    Translate text sentence by sentence with the Marian model.

    Args:
      aux_sent: Text to be translated. It is split into sentences by
        chunkstring_spacy before translation.
      batch_size: Maximum number of sentences sent to the model at once. Bounding
        the batch keeps memory use predictable for documents with many sentences.

    Returns:
      The translated sentences joined with single spaces, or '' when the input
      produced no sentences.
    """
    max_length = 512
    num_beams = 1

    sentences = chunkstring_spacy(aux_sent)
    if not sentences:
        # Nothing to translate; the tokenizer cannot build tensors from an empty batch.
        return ''

    # Move the model to the selected device (no-op once it is already there).
    marian_model.to(device)
    marian_model.eval()

    tgt_text = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]

        # Calling the tokenizer directly replaces the deprecated
        # prepare_seq2seq_batch; padding/truncation are requested explicitly.
        tokenized_text = marian_tokenizer(batch,
                                          max_length=max_length,
                                          truncation=True,
                                          padding=True,
                                          return_tensors='pt')

        with torch.no_grad():
            # The attention mask is passed along so padded positions of shorter
            # sentences in the batch are explicitly masked out.
            translated = marian_model.generate(input_ids=tokenized_text['input_ids'].to(device),
                                               attention_mask=tokenized_text['attention_mask'].to(device),
                                               max_length=max_length,
                                               num_beams=num_beams,
                                               early_stopping=True,
                                               do_sample=False)

        tgt_text.extend(
            fix_encoding(marian_tokenizer.decode(t, skip_special_tokens=True)) for t in translated
        )

    return ' '.join(tgt_text)

In [9]:
def pickle_file(path, data=None):
    """
    Read or write a pickle file, depending on whether `data` is given.

    Args:
      path: File path to read from or write to.
      data: Object to store. When None, the file at `path` is loaded instead.

    Returns:
      The unpickled object when loading; None when writing.
    """
    loading = data is None
    # NOTE: unpickling runs arbitrary code; only load files you wrote yourself.
    with open(path, 'rb' if loading else 'wb') as stream:
        if loading:
            return pickle.load(stream)
        pickle.dump(data, stream, protocol=pickle.HIGHEST_PROTOCOL)

# Directory where translated checkpoints are written (presumably a mounted
# Google Drive folder, given the /content/drive prefix -- confirm before running).
path_save = '/content/drive/MyDrive/LLMs/data_gpt4all/gptj4all_ptbr/'
# Create it up front so the first checkpoint write cannot fail after hours of work.
os.makedirs(path_save, exist_ok=True)

def _save_checkpoint(ix, prompts, responses, sources):
    """Write one checkpoint as three pickles suffixed with the 1-based index of its last example."""
    pickle_file(path=path_save+f'prompt_{ix}', data=prompts)
    pickle_file(path=path_save+f'response_{ix}', data=responses)
    pickle_file(path=path_save+f'source_{ix}', data=sources)

prompt, response, source = [], [], []

SAVE_EVERY = 10_000 # +- 81 checkpoints in 808k samples

# The description never changes, so it is set once here rather than on every iteration.
loop = tqdm(data_en['train'], leave=True, desc="Examples Processed")
for ix, inputs in enumerate(loop, 1):
    prompt_ptbr = translate(inputs['prompt'])
    response_ptbr = translate(inputs['response'])
    prompt.append(prompt_ptbr)
    response.append(response_ptbr)
    source.append(inputs['source'])

    # The three lists grow in lockstep, so checking one length is enough.
    if len(prompt) == SAVE_EVERY:
        # Print one source/translation pair per checkpoint as a quick sanity check.
        print(f"\n{ix}: {inputs['response']}")
        print(f"\n{ix}: {response_ptbr}\n")

        _save_checkpoint(ix, prompt, response, source)
        prompt, response, source = [], [], []

# Flush the trailing partial batch: the dataset size is not a multiple of
# SAVE_EVERY, so without this the last examples would never be written.
if prompt:
    _save_checkpoint(ix, prompt, response, source)
    prompt, response, source = [], [], []

  0%|          | 0/808812 [00:00<?, ?it/s]



5: The issue seems to be with the layout parameters being set in the `getView()` method. The code is setting `RelativeLayout.LayoutParams` to the `convertView`. However, `ListView` requires `AbsListView.LayoutParams`, which is a specific subclass of `ViewGroup.LayoutParams`. 

To fix the issue, replace `RelativeLayout.LayoutParams` with `AbsListView.LayoutParams` like below:

```
AbsListView.LayoutParams params = new AbsListView.LayoutParams(ViewGroup.LayoutParams.MATCH_PARENT, Utils.getInstance().dpToPx(mContext, 260));
convertView.setLayoutParams(params);
```

Also, in the XML layout, change the parent layout `RelativeLayout` to `LinearLayout` to avoid any layout conflicts.
5: O problema parece ser que os parâmetros de layout estão sendo definidos no método `getView()`. O código está configurando "RelativeLayout". LayoutParams` para a `ConvertView`. No entanto, "ListView" requer "AbsListView". LayoutParams`, que é uma subclasse específica de `ViewGroup. LayoutParams`. Para resolver

In [10]:
import glob

def _checkpoints_by_index(prefix):
    """Map checkpoint index -> path for files named `<path_save><prefix>_<index>`."""
    found = {}
    for path in glob.glob(path_save + prefix + '_*'):
        match = re.search(r'_(\d+)$', path)
        if match:
            found[int(match.group(1))] = path
    return found

prompt_files = _checkpoints_by_index('prompt')
response_files = _checkpoints_by_index('response')
source_files = _checkpoints_by_index('source')

# Only checkpoints that have all three files are usable; report the rest instead
# of silently skipping them.
complete = sorted(prompt_files.keys() & response_files.keys() & source_files.keys())
incomplete = sorted((prompt_files.keys() | response_files.keys() | source_files.keys()) - set(complete))
if incomplete:
    print(f"Skipping incomplete checkpoints: {incomplete}")

all_prompt, all_response, all_source = [],[],[]

# Iterate in numeric order: a plain lexicographic sort of the file names would put
# 'prompt_100000' before 'prompt_20000' and scramble the original example order.
# NOTE: unpickling runs arbitrary code; only load checkpoints written by this notebook.
for index in complete:
    all_prompt += pickle_file(prompt_files[index])
    all_response += pickle_file(response_files[index])
    all_source += pickle_file(source_files[index])

# All three columns must line up row for row before building the dataset.
assert len(all_prompt) == len(all_response) == len(all_source), "load pickle FAILED"

# ----
data_ptbr = Dataset.from_dict(
    {'prompt': all_prompt, 'response': all_response, 'source': all_source}
    )

In [11]:
# Bare expression: display the column names and row count of the reassembled dataset.
data_ptbr

Dataset({
    features: ['prompt', 'response', 'source'],
    num_rows: 10
})

In [12]:
# First translated response. Row-first indexing reads one example rather than
# the whole 'response' column.
data_ptbr[0]['response']

'Uma solução possível é usar uma largura fixa para o cabeçalho GroupItem e alinhar o cabeçalho e os dados usando um DataTemplate personalizado para os cabeçalhos. Primeiro, remova as margens ou espaçamentos que possam afetar o alinhamento das colunas. Então, defina uma largura fixa para os cabeçalhos GroupItem usando a propriedade MinWidth do Expander. Em seguida, defina um DataTemplate personalizado para os cabeçalhos que inclui um TextBlock com uma largura fixa que combina com a largura da coluna. Isso garantirá que o cabeçalho e os dados estejam alinhados. Aqui está um exemplo de como o XAML para o DataTemplate poderia parecer: ``` ```` Este DataTemplate inclui dois TextBlocks, um para o nome do grupo e outro para a contagem de itens no grupo. O primeiro TextBlock tem uma largura fixa que combina com a largura da coluna. Para usar este DataTemplate para os cabeçalhos GroupItem, definir o grupo de estilo. CabeçalhoPropriedade de Template para a chave do DataTemplate: ```` `` Isso dev

In [13]:
# English original of the same response, for side-by-side comparison. Row-first
# indexing reads one example rather than the whole 'response' column.
data_en['train'][0]['response']

"One possible solution is to use a fixed width for the GroupItem header and align the header and the data using a custom DataTemplate for the headers.\n\nFirst, remove any margins or spacing that may affect the alignment of the columns. Then, define a fixed width for the GroupItem headers using the Expander's MinWidth property.\n\nNext, define a custom DataTemplate for the headers that includes a TextBlock with a fixed width that matches the width of the column. This will ensure that the header and the data are aligned.\n\nHere's an example of what the XAML for the DataTemplate could look like:\n\n```\n\n    \n        \n            \n            \n        \n        \n        \n    \n\n```\n\nThis DataTemplate includes two TextBlocks, one for the group name and another for the count of items in the group. The first TextBlock has a fixed width that matches the width of the column. \n\nTo use this DataTemplate for the GroupItem headers, set the GroupStyle.HeaderTemplate property to the ke