[Data Exploration >>](02_data_explore.ipynb)

In [1]:
import multiprocessing
import html
import re
import yaml
from pathlib import Path
from pprint import pprint

import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

import myutilpy.data_processing as dprep

In [2]:
# Keep one core free for the notebook/OS, but never go below a single worker.
num_cores_avail = max(multiprocessing.cpu_count() - 1, 1)

In [3]:
# Experiment config to use; selects ../experiments/configs/<config_id>/main.yaml.
config_id = "mlml6_rate_pred_cls"

In [4]:
# Parse the experiment's main YAML config into `main_config`.
# The encoding is pinned so the result does not depend on the platform's
# default text encoding.
with open(f"../experiments/configs/{config_id}/main.yaml", "r", encoding="utf-8") as config_file:
    main_config = yaml.safe_load(config_file)

In [5]:
# Pull the settings this notebook needs out of the config in one pass.
# A missing key still raises KeyError, exactly as direct indexing would.
(
    dataset_checkpoint,
    dataset_checkpoint_revision,
    pt_model_checkpoint,
    pt_model_checkpoint_revision,
    dataset_id,
) = (
    main_config[key]
    for key in (
        "dataset_checkpoint",
        "dataset_checkpoint_revision",
        "pt_model_checkpoint",
        "pt_model_checkpoint_revision",
        "dataset_id",
    )
)

In [6]:
# Display the dataset identifier read from main_config.
dataset_id

'minilm_l6'

In [7]:
# Processed artifacts live in a per-dataset_id directory; the raw download
# cache uses a fixed path that does not depend on dataset_id.
root_dataset_dir = f"../data/pitchfork/{dataset_id}"
raw_data_cache_dir = "../data/pitchfork/raw/cache"  # plain string: nothing to interpolate

# Create both directories up front; exist_ok makes re-running this cell harmless.
for directory in (raw_data_cache_dir, root_dataset_dir):
    Path(directory).mkdir(parents=True, exist_ok=True)

In [8]:
# Tokenizer of the pretrained checkpoint, pinned to the configured revision.
tokenizer = AutoTokenizer.from_pretrained(pt_model_checkpoint, revision=pt_model_checkpoint_revision)

In [10]:
# Request "reviews.csv" explicitly via data_files; without it, load_dataset
# defaults to the album images instead of the review table.
# The dataset revision is pinned from the config, and downloads are cached
# under raw_data_cache_dir.
raw_datasets = load_dataset(
    dataset_checkpoint,
    revision=dataset_checkpoint_revision,
    data_files=["reviews.csv"],
    cache_dir=raw_data_cache_dir
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [11]:
# Inspect the loaded DatasetDict (splits, columns, row counts).
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
        num_rows: 25709
    })
})

# Clean raw dataset

In [12]:
# Work on the "train" split of raw_datasets from here on.
dataset = raw_datasets["train"]

In [13]:
# Pattern handed to dprep.replace_known_unk_tokens_batched during cleaning.
# NOTE(review): presumably a regex of known-problematic strings keyed by
# dataset_id; the definition lives in myutilpy.data_processing — confirm there.
blacklist_pattern = dprep.get_blacklist_pattern(dataset_id)

In [14]:
# Columns that must hold real strings (e.g. not None) before tokenization.
text_columns = ["artist", "album", "review", "reviewer"]

# Row filter delegated to dprep.detect_wrong_type with `str` as the expected type.
# NOTE(review): Dataset.filter keeps rows whose predicate is True, so despite its
# name the helper presumably returns True for rows with the *right* type — confirm.
dataset = dataset.filter(
    lambda batch: dprep.detect_wrong_type(batch, text_columns, str),
    batched=True,
    num_proc=num_cores_avail,
)

# Replace known "unk" tokens in the same columns using the blacklist pattern.
dataset = dataset.map(
    lambda batch: dprep.replace_known_unk_tokens_batched(batch, text_columns, blacklist_pattern),
    batched=True,
    num_proc=num_cores_avail,
)

Filter (num_proc=15):   0%|          | 0/25709 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/23034 [00:00<?, ? examples/s]

In [15]:
# Inspect the dataset after the filter/map cleaning steps.
dataset

Dataset({
    features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
    num_rows: 23034
})

In [16]:
# Run dprep.get_n_tokens_batched over the "review" column with the model tokenizer.
# NOTE(review): presumably this is what adds the `review_n_tokens` column — confirm.
dataset = dataset.map(
    lambda batch: dprep.get_n_tokens_batched(batch, "review", tokenizer),
    batched=True,
    num_proc=num_cores_avail,
)

# Round-trip through pandas to drop rows that are exact duplicates across all
# columns, then rebuild the Dataset from the re-indexed frame.
deduplicated_df = dataset.to_pandas().drop_duplicates().reset_index(drop=True)
dataset = Dataset.from_pandas(deduplicated_df)

Map (num_proc=15):   0%|          | 0/23034 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (709 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (607 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (721 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (888 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [17]:
# Rows selected by dprep.detect_unk_batched for the "artist"/"album" columns.
# NOTE(review): presumably flags rows where the tokenizer still emits its
# unknown token for one of these columns — confirm against the helper.
unk_check_columns = ["artist", "album"]
dataset_leftover = dataset.filter(
    lambda batch: dprep.detect_unk_batched(batch, unk_check_columns, tokenizer),
    batched=True,
)

Filter:   0%|          | 0/22063 [00:00<?, ? examples/s]

In [18]:
# Number of rows selected by the detect_unk_batched filter.
len(dataset_leftover)

16

In [19]:
# Collect the source substrings of each flagged row's review text that the
# tokenizer maps to its unknown token. The offset mapping gives the character
# span in `text` for every produced token id.
# NOTE(review): the rows were selected on "artist"/"album", but the text
# inspected here is "review" — confirm that is intended.
unk_tokens = set()
for text in dataset_leftover["review"]:
    inputs = tokenizer(text, return_offsets_mapping=True)
    # `token_id` rather than `id`, so the builtin id() is not shadowed for later cells.
    for token_id, (start, end) in zip(inputs.input_ids, inputs.offset_mapping):
        if token_id == tokenizer.unk_token_id:
            unk_tokens.add(text[start:end])

Token indices sequence length is longer than the specified maximum sequence length for this model (773 > 512). Running this sequence through the model will result in indexing errors


In [20]:
# Sort before printing: set iteration order for strings varies between kernel
# sessions, which would otherwise make this output differ on every run.
print(*sorted(unk_tokens))

狗 過 節 去 개꿈 九 詩 念 呼 象 波 卡 希 敗 重 霊 박혜진 殺 廁 佛 YTI⅃AƎЯ 與 冥 观 物 夢 來 鬼 戰 偉 玉 ؟ 浴 共 愚 蒸 奏 客 幽 轉 靈 ゾット 所 吸 音 疊 七 只 曜 隠 矮 會 乱 閃 市 駭 印


In [21]:
def _only_unk_in_names(batch):
    """Delegate to dprep.detect_only_unk_batched for the "artist" and "album" columns."""
    return dprep.detect_only_unk_batched(batch, ["artist", "album"], tokenizer)


# NOTE(review): presumably selects rows whose artist/album tokenize to nothing
# but unknown tokens — confirm against the helper.
dataset_leftover = dataset.filter(_only_unk_in_names, batched=True)

Filter:   0%|          | 0/22063 [00:00<?, ? examples/s]

In [22]:
# Inspect the rows returned by the detect_only_unk_batched filter.
dataset_leftover

Dataset({
    features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
    num_rows: 0
})

In [23]:
# Metadata-only view of the dataset: drop the review text and the columns not
# needed in the metadata table. remove_columns is the dedicated call for this
# and avoids running a no-op map over every row.
meta_dataset = dataset.remove_columns(
    ["year_released", "small_text", "album_art_url", "review"]
)

Map:   0%|          | 0/22063 [00:00<?, ? examples/s]

In [24]:
# Convert the metadata-only dataset to a pandas DataFrame.
meta_dataset_df = meta_dataset.to_pandas()

In [25]:
# Row count of the cleaned dataset before it is split.
len(dataset)

22063

In [26]:
# 70-15-15 train/validation/test split.
# 70% for training is solid for fine-tuning; 15% each for validation and test
# gives reasonably reliable overfitting estimates and final evaluation.
# A 60-20-20 split would be better with a smaller dataset or a simpler model.

# Fixed seed so the shuffled splits (and the dataset saved from them) are
# identical on every Restart & Run All.
split_seed = 42

# First, split the dataset into train (70%) and a temporary held-out set (30%).
datasets = dataset.train_test_split(test_size=0.3, seed=split_seed)

# Then split the held-out set in half to obtain the validation and test sets.
datasets_val_test = datasets.pop("test").train_test_split(test_size=0.5, seed=split_seed)
datasets["validation"] = datasets_val_test.pop("train")
datasets["test"] = datasets_val_test.pop("test")

In [27]:
# Inspect the resulting train/validation/test splits.
datasets

DatasetDict({
    train: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
        num_rows: 15444
    })
    validation: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
        num_rows: 3309
    })
    test: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
        num_rows: 3310
    })
})

In [28]:
# Persist both artifacts under root_dataset_dir: the metadata table as CSV
# (without the index column) and the split DatasetDict in its on-disk format.
meta_csv_path = f"{root_dataset_dir}/meta_df.csv"
splits_dir = f"{root_dataset_dir}/dataset"

meta_dataset_df.to_csv(meta_csv_path, index=False)
datasets.save_to_disk(splits_dir)

Saving the dataset (0/1 shards):   0%|          | 0/15444 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3309 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3310 [00:00<?, ? examples/s]