# Feature Transformation with Scikit-Learn In This Notebook

In this notebook, we convert raw text into feature embeddings.  This will allow us to perform natural language processing tasks.


# Understand Embeddings

* For more details on Transformers Architecture, see [Attention Is All You Need](https://arxiv.org/abs/1706.03762).

* **input_ids**: 
The id from the pre-trained vocabulary that represents the token. (Padding of 0 will be used if the # of tokens is less than max_seq_length)

* **attention_mask**: 
Specifies which tokens should pay attention to (0 or 1). Padded input_ids will have 0 in each of these vector elements.

In [2]:
import psutil

notebook_memory = psutil.virtual_memory()

if notebook_memory.total < 32 * 1024 * 1024:
    print('*******************************************')    
    print('YOU ARE NOT USING THE CORRECT INSTANCE TYPE')
    print('PLEASE CHANGE INSTANCE TYPE TO  m5.2xlarge ')
    print('*******************************************')
else:
    correct_instance_type=True
    print(notebook_memory)

svmem(total=32890294272, available=20486737920, percent=37.7, used=11931193344, free=11216105472, active=16317833216, inactive=4215857152, buffers=0, cached=9742995456, shared=1789952, slab=494268416)


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path
import csv

def _transform_to_dataset(file, 
                          output_data, 
                          train_split_percentage, 
                          validation_split_percentage, 
                          test_split_percentage, 
                          model_checkpoint, 
                          dataset_templates_name, 
                          prompt_template_name):
    print("file {}".format(file))

    # Read the file
    df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")

    df.isna().values.any()
    df = df.dropna()
    df = df.reset_index(drop=True)    
        
    # Split data    
    print("Shape of dataframe before splitting {}".format(df.shape))

    print("train split percentage {}".format(train_split_percentage))
    print("validation split percentage {}".format(validation_split_percentage))
    print("test split percentage {}".format(test_split_percentage))

    holdout_percentage = 1.00 - train_split_percentage
    print("validation holdout percentage {}".format(holdout_percentage))
    
    df_train, df_holdout = train_test_split(df, test_size=holdout_percentage)

    test_holdout_percentage = test_split_percentage / holdout_percentage
    
    print("test holdout percentage {}".format(test_holdout_percentage))
    
    df_validation, df_test = train_test_split(
        df_holdout, test_size=test_holdout_percentage)

    df_train = df_train.reset_index(drop=True)
    df_validation = df_validation.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    print("Shape of train dataframe {}".format(df_train.shape))
    print("Shape of validation dataframe {}".format(df_validation.shape))
    print("Shape of test dataframe {}".format(df_test.shape))
    
    # Convert Pandas dataframes into Datasets
    import datasets
    from datasets import Dataset

    # Create Dataset objects (Arrow PyTables) from Pandas dataframes
    dataset_train = Dataset.from_pandas(df_train).select([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
    dataset_validation = Dataset.from_pandas(df_validation).select([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
    dataset_test = Dataset.from_pandas(df_test).select([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])

    # Apply prompt  
    from promptsource.templates import DatasetTemplates
    prompt_templates = DatasetTemplates(dataset_templates_name) 
    
    for template in prompt_templates.templates.values():
        print(template.get_name())
    
    prompt = prompt_templates[prompt_template_name]
    print(prompt.answer_choices)    
    print(prompt.__dict__)
    
    dataset_train = dataset_train \
        .map(lambda row : {'prompt': 'PROMPT: ' + prompt.apply(row)[0] + '\nSTAR_RATING: ' + prompt.apply(row)[1] + '\n\n'})        
    dataset_validation = dataset_validation \
        .map(lambda row : {'prompt': 'PROMPT: ' + prompt.apply(row)[0] + '\nSTAR_RATING: ' + prompt.apply(row)[1] + '\n\n'})
    dataset_test = dataset_test \
        .map(lambda row : {'prompt': 'PROMPT: ' + prompt.apply(row)[0] + '\nSTAR_RATING: ' + prompt.apply(row)[1] + '\n\n'})
                  
    # Tokenize    
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

    text_column_name = 'prompt'

    def tokenize_function(examples):        
        tokenized = tokenizer(examples[text_column_name])
        return tokenized

    import multiprocessing

    num_cpus = multiprocessing.cpu_count()
    print('num_cpus {}'.format(num_cpus))

    # if using .tsv, the data will have `product_category`, but not `year`:  https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
    # if using .parquet, the data will have also have `year`:  https://s3.amazonaws.com/amazon-reviews-pds/readme.html
    tokenized_dataset_train = dataset_train.map(tokenize_function, batched=True, num_proc=num_cpus, remove_columns=[
        'marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category',
        'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
        'review_headline', 'review_date', 'review_body', text_column_name]) # 'year'

    tokenized_dataset_validation = dataset_validation.map(tokenize_function, batched=True, num_proc=num_cpus, remove_columns=[
        'marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category',
        'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
        'review_headline', 'review_date', 'review_body', text_column_name]) # 'year'

    tokenized_dataset_test = dataset_validation.map(tokenize_function, batched=True, num_proc=num_cpus, remove_columns=[
        'marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category',
        'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
        'review_headline', 'review_date', 'review_body', text_column_name]) # 'year'
        
    
    # Group into blocks and save to S3/disk

    block_size = 128

    def group_texts(examples):    
        # Concatenate all texts.
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    lm_dataset_train = tokenized_dataset_train.map(
        group_texts,
        batched=True,
        batch_size=10,
        num_proc=num_cpus,
    )
    print(lm_dataset_train)
    
    lm_dataset_validation = tokenized_dataset_validation.map(
       group_texts,
       batched=True,
       batch_size=10,
       num_proc=num_cpus,
    )
    print(lm_dataset_validation)
    
    lm_dataset_test = tokenized_dataset_test.map(
       group_texts,
       batched=True,
       batch_size=10,
       num_proc=num_cpus,
    )
    print(lm_dataset_test)
    
    print(tokenizer.decode(lm_dataset_train[1]["input_ids"]))
    print(tokenizer.decode(lm_dataset_validation[1]["input_ids"]))
    print(tokenizer.decode(lm_dataset_test[1]["input_ids"]))
        
    filename_without_extension = Path(Path(file).stem).stem

    os.makedirs('{}/train/'.format(output_data), exist_ok=True)
    os.makedirs('{}/validation/'.format(output_data), exist_ok=True)
    os.makedirs('{}/test/'.format(output_data), exist_ok=True)
    
    lm_dataset_train.to_parquet('{}/train/{}.parquet'.format(output_data, filename_without_extension))    
    lm_dataset_validation.to_parquet('{}/validation/{}.parquet'.format(output_data, filename_without_extension))
    lm_dataset_test.to_parquet('{}/test/{}.parquet'.format(output_data, filename_without_extension))

In [4]:
import functools
import multiprocessing
import glob
import os

def process(args):

    input_files = glob.glob("{}/*.tsv.gz".format(args.input_data))
    print(input_files)

    print("Listing contents of {}".format(args.input_data))
    dirs_input = os.listdir(args.input_data)
    for file in dirs_input:
        print(file)

    train_data = "{}/train".format(args.output_data, args.model_checkpoint)
    validation_data = "{}/validation".format(args.output_data, args.model_checkpoint)
    test_data = "{}/test".format(args.output_data, args.model_checkpoint)

    transform_to_dataset = functools.partial(
        _transform_to_dataset,
        output_data=args.output_data,
        train_split_percentage=args.train_split_percentage, 
        validation_split_percentage=args.validation_split_percentage, 
        test_split_percentage=args.test_split_percentage,
        model_checkpoint=args.model_checkpoint,
        dataset_templates_name=args.dataset_templates_name,
        prompt_template_name=args.prompt_template_name
    )

    num_cpus = multiprocessing.cpu_count()
    print("num_cpus {}".format(num_cpus))

    p = multiprocessing.Pool(num_cpus)
    p.map(transform_to_dataset, input_files)

    print("Listing contents of {}".format(args.output_data))
    dirs_output = os.listdir(args.output_data)
    for file in dirs_output:
        print(file)

    print("Listing contents of {}".format(train_data))
    dirs_output = os.listdir(train_data)
    for file in dirs_output:
        print(file)

    print("Listing contents of {}".format(validation_data))
    dirs_output = os.listdir(validation_data)
    for file in dirs_output:
        print(file)

    print("Listing contents of {}".format(test_data))
    dirs_output = os.listdir(test_data)
    for file in dirs_output:
        print(file)


In [5]:
class Args:
    input_data: str
    output_data: str
    train_split_percentage: float
    validation_split_percentage: float
    test_split_percentage: float
    model_checkpoint: str
    dataset_templates_name: str
    prompt_template_name: str

args = Args()    
    
args.model_checkpoint = 'facebook/opt-350m'
args.dataset_templates_name = 'amazon_us_reviews/Wireless_v1_00'
args.prompt_template_name = 'Given the review body return a categorical rating'
args.input_data = './data-tsv'
args.output_data = './data'
args.train_split_percentage = .80
args.validation_split_percentage = .10
args.test_split_percentage = .10

process(args)


['./data-tsv/amazon_reviews_us_Gift_Card_v1_00.tsv.gz', './data-tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', './data-tsv/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz']
Listing contents of ./data-tsv
amazon_reviews_us_Gift_Card_v1_00.tsv.gz
.ipynb_checkpoints
amazon_reviews_us_Digital_Software_v1_00.tsv.gz
amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz
num_cpus 8
file ./data-tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz
file ./data-tsv/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz
file ./data-tsv/amazon_reviews_us_Gift_Card_v1_00.tsv.gz
Shape of dataframe before splitting (149081, 15)
train split percentage 0.8
validation split percentage 0.1
test split percentage 0.1
validation holdout percentage 0.19999999999999996
test holdout percentage 0.5000000000000001
Shape of train dataframe (119264, 15)
Shape of validation dataframe (14908, 15)
Shape of test dataframe (14909, 15)
Shape of dataframe before splitting (102084, 15)
train split percentage 0.8
validatio

  0%|          | 0/15 [00:00<?, ?ex/s]

  0%|          | 0/15 [00:00<?, ?ex/s]

Generate review headline based on review body
Generate review based on rating and category
Given the review headline return a categorical rating
Generate review headline based on rating
Given the review body return a categorical rating
1 ||| 2 ||| 3 ||| 4 ||| 5
{'answer_choices': '1 ||| 2 ||| 3 ||| 4 ||| 5', 'id': 'e6a1bbde-715d-4dad-9178-e2bcfaf5c646', 'jinja': "Given the following review:\n{{review_body}}\npredict the associated rating from the following choices (1 being lowest and 5 being highest)\n- {{ answer_choices | join('\\n- ') }} \n|||\n{{answer_choices[star_rating-1]}}", 'metadata': <promptsource.templates.Template.Metadata object at 0x7fbde0304a10>, 'name': 'Given the review body return a categorical rating', 'reference': 'Given the review body, return a categorical rating. '}


  0%|          | 0/15 [00:00<?, ?ex/s]

  0%|          | 0/15 [00:00<?, ?ex/s]

  0%|          | 0/15 [00:00<?, ?ex/s]

  0%|          | 0/15 [00:00<?, ?ex/s]

Generate review headline based on review body
Generate review based on rating and category
Given the review headline return a categorical rating
Generate review headline based on rating
Given the review body return a categorical rating
1 ||| 2 ||| 3 ||| 4 ||| 5
{'answer_choices': '1 ||| 2 ||| 3 ||| 4 ||| 5', 'id': 'e6a1bbde-715d-4dad-9178-e2bcfaf5c646', 'jinja': "Given the following review:\n{{review_body}}\npredict the associated rating from the following choices (1 being lowest and 5 being highest)\n- {{ answer_choices | join('\\n- ') }} \n|||\n{{answer_choices[star_rating-1]}}", 'metadata': <promptsource.templates.Template.Metadata object at 0x7fbdd7f87990>, 'name': 'Given the review body return a categorical rating', 'reference': 'Given the review body, return a categorical rating. '}


  0%|          | 0/15 [00:00<?, ?ex/s]

  0%|          | 0/15 [00:00<?, ?ex/s]

  0%|          | 0/15 [00:00<?, ?ex/s]

num_cpus 8
num_cpus 8
num_cpus 8
                         

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

                         

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

                         

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

                                 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

        Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 15
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 14
})
                        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

          

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6
})
        Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 13
})
            Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10
})  
   

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 7
})
     

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 13
})
</s>PROMPT: Given the following review:
VERY BASIC FUN GAME AT FIRST, VERY BASIC PLAY IF YOU HAVE ANY EXP. BUT AFTER THE THIRD LEVEL OR VILLAGE IT SHOWS YOU STAGE COMPLETE THEN JUST STAYS THERE IF YOU LEAVE THE GAME YOU LOSE THAT LEVEL EXP. TRIED MULTIP TIMES OVER AND ALL IT WOULD SAY WAS LEVEL COMPLETE, FIREWORKS AND THAT IS IT  NOT WORTH THE AGGRAvATION EVEN FOR 6.99
predict the associated rating from the following choices (1 being lowest and 5 being highest)
. (not really the point of the game anyways)<br />-Personally I find it very hard to start a multiplayer game with more than one or two people. (Matches last many gaming sessions and for long periods and if you start without them their cities disappear)
predict the associated rating from the following choices (1 being lowest and 5 being highest)
- 1
- 2
- 3
- 4
- 5
STAR_RATING: 5

</s>PROMPT: Given the following review:
Having ordered this in

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 7
})
</s>PROMPT: Given the following review:
good
predict the associated rating from the following choices (1 being lowest and 5 being highest)
- 1
- 2
- 3
- 4
- 5
STAR_RATING: 5

</s>PROMPT: Given the following review:
Everything I expected except for the $10.00 credit I was supposed to earn by buying $50.00 worth of gift cards. That's why only one star.
predict the associated rating from the following choices (1 being lowest and 5 being highest)
- 1
- 2
- 3

</s>PROMPT: Given the following review:
When I want to send a gift to friends in Europe or Australia, this is the best. Always delivered instantaneously, to it's on time and always arrives, unlike objects sent by post.
predict the associated rating from the following choices (1 being lowest and 5 being highest)
- 1
- 2
- 3
- 4
- 5
STAR_RATING: 5

</s>PROMPT: Given the following review:
Love it, this gift card the amazon is awsome for my family and i,

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Listing contents of ./data
test
facebook
train
validation
Listing contents of ./data/train
amazon_reviews_us_Gift_Card_v1_00.parquet
amazon_reviews_us_Digital_Video_Games_v1_00.parquet
amazon_reviews_us_Digital_Software_v1_00.parquet
Listing contents of ./data/validation
amazon_reviews_us_Gift_Card_v1_00.parquet
amazon_reviews_us_Digital_Video_Games_v1_00.parquet
amazon_reviews_us_Digital_Software_v1_00.parquet
Listing contents of ./data/test
amazon_reviews_us_Gift_Card_v1_00.parquet
amazon_reviews_us_Digital_Video_Games_v1_00.parquet
amazon_reviews_us_Digital_Software_v1_00.parquet


In [6]:
from datasets import Dataset

reloaded_dataset_train = Dataset.from_parquet('./data/{}/train/*.parquet'.format(args.model_checkpoint))
reloaded_dataset_validation = Dataset.from_parquet('./data/{}/validation/*.parquet'.format(args.model_checkpoint))
reloaded_dataset_test = Dataset.from_parquet('./data/{}/test/*.parquet'.format(args.model_checkpoint))

Using custom data configuration default-67b3fc066ced7972
Found cached dataset parquet (/root/.cache/huggingface/datasets/parquet/default-67b3fc066ced7972/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Using custom data configuration default-dcc7bee66fca673f
Found cached dataset parquet (/root/.cache/huggingface/datasets/parquet/default-dcc7bee66fca673f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Using custom data configuration default-ccf0fa78a4b77ba6
Found cached dataset parquet (/root/.cache/huggingface/datasets/parquet/default-ccf0fa78a4b77ba6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [7]:
reloaded_dataset_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 257939
})

In [8]:
reloaded_dataset_validation

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 32340
})

In [9]:
reloaded_dataset_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 32340
})

# Release Resources

In [10]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>