[Data Exploration >>](02_data_explore.ipynb)

In [57]:
import multiprocessing
import html
import re
import yaml
from pathlib import Path
from pprint import pprint

import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

In [2]:
# Number of worker processes for the parallel `filter`/`map` calls below:
# all reported CPU cores but one, and never fewer than one.
num_cores_avail = max(1, multiprocessing.cpu_count() - 1)

In [3]:
# Name of the experiment config directory (under ../experiments/configs/) to load.
config_id = "mlml6_rate_pred_cls"

In [4]:
# Load the experiment's main config. The encoding is passed explicitly so the
# file is decoded the same way regardless of the platform's default locale.
with open(f"../experiments/configs/{config_id}/main.yaml", "r", encoding="utf-8") as f:
    main_config = yaml.safe_load(f)

In [5]:
# Unpack the settings this notebook needs from the experiment config.
# Dataset identifier and revision passed to `load_dataset` below.
dataset_checkpoint = main_config["dataset_checkpoint"]
dataset_checkpoint_revision = main_config["dataset_checkpoint_revision"]
# Pretrained-model identifier and revision used to load the tokenizer below.
pt_model_checkpoint = main_config["pt_model_checkpoint"]
pt_model_checkpoint_revision = main_config["pt_model_checkpoint_revision"]
# Selects the output directory and the character blacklist used further down.
dataset_id = main_config["dataset_id"]

In [6]:
# Show which dataset variant the config selected.
dataset_id

'minilm_l6'

In [91]:
# Cache location for the raw download, and the output directory for the
# dataset variant selected by `dataset_id`.
raw_data_cache_dir = "../data/pitchfork/raw/cache"
root_dataset_dir = "../data/pitchfork/{}".format(dataset_id)

# Create both directories, including missing parents; existing ones are left as-is.
Path(raw_data_cache_dir).mkdir(exist_ok=True, parents=True)
Path(root_dataset_dir).mkdir(exist_ok=True, parents=True)

In [8]:
# Load the tokenizer for the configured pretrained model, pinned to the
# configured revision.
tokenizer = AutoTokenizer.from_pretrained(
    pt_model_checkpoint,
    revision=pt_model_checkpoint_revision
)

In [26]:
# Request "reviews.csv" explicitly: without `data_files` the loader would
# default to the album images instead of the review table.
# The dataset revision is pinned and the download is cached under `raw_data_cache_dir`.
raw_datasets = load_dataset(
    dataset_checkpoint,
    revision=dataset_checkpoint_revision,
    data_files=["reviews.csv"],
    cache_dir=raw_data_cache_dir
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [27]:
# Inspect the splits, columns and row count of the raw download.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
        num_rows: 25709
    })
})

# Clean raw dataset

In [73]:
# def detect_nonstring(examples):
#     is_string = []
#     for artist, album, review, reviewer in zip(examples["artist"], examples["album"], examples["review"], examples["reviewer"]):
#         is_string_ = isinstance(artist, str)
#         is_string_ &= isinstance(album, str)
#         is_string_ &= isinstance(review, str)
#         is_string_ &= isinstance(reviewer, str)
#         is_string.append(is_string_)
#     return is_string

def detect_wrong_type(examples, cols, dtype):
    """Build a per-row keep mask for a batch based on column types.

    Despite the name, an entry is ``True`` when the row is *valid*, i.e. when
    the value in every column of ``cols`` is an instance of ``dtype``. This
    makes the result directly usable as the return value of a batched
    ``Dataset.filter`` function.

    Args:
        examples: Batch mapping column name -> list of values.
        cols: Column names to check; the first one determines the row count.
        dtype: Type (or tuple of types) accepted by ``isinstance``.

    Returns:
        List of booleans, one per row of the batch.
    """
    # All columns are expected to have the same length as the first one
    n_rows = len(examples[cols[0]])
    return [
        all(isinstance(examples[col][row], dtype) for col in cols)
        for row in range(n_rows)
    ]


# def get_review_n_tokens_batched(examples):
#     inputs = tokenizer(examples["review"], truncation=False)
#     review_n_tokens = [len(inp_ids) for inp_ids in inputs.input_ids]
#     input_ids = inputs.input_ids
#     return {"review_n_tokens": review_n_tokens}

def get_n_tokens_batched(examples, text_col, tokenizer):
    """Count the tokens of every text in one column of a batch.

    Args:
        examples: Batch mapping column name -> list of values.
        text_col: Name of the text column to tokenize.
        tokenizer: Callable accepting a list of texts and ``truncation=False``
            and returning a mapping with an ``"input_ids"`` entry.

    Returns:
        Dict with a single key ``"<text_col>_n_tokens"`` holding one token
        count per row (texts are tokenized without truncation).
    """
    encoded = tokenizer(examples[text_col], truncation=False)
    token_counts = list(map(len, encoded["input_ids"]))
    return {f"{text_col}_n_tokens": token_counts}


# def detect_unk_batched(examples):
#     batch_artist_ids = tokenizer(examples["artist"]).input_ids
#     batch_album_ids = tokenizer(examples["album"]).input_ids
#     unk_markers = []
#     for artist_ids, album_ids in zip(batch_artist_ids, batch_album_ids):
#         if (tokenizer.unk_token_id in artist_ids) or (tokenizer.unk_token_id in album_ids):
#             unk_markers.append(True)
#         else:
#             unk_markers.append(False)
#     return unk_markers


def detect_unk_batched(examples, cols, tokenizer):
    """Flag the rows of a batch that contain at least one unknown token.

    Args:
        examples: Batch mapping column name -> list of texts.
        cols: Column names to tokenize; the first one determines the row count.
        tokenizer: Callable tokenizer exposing ``unk_token_id`` whose result
            has an ``input_ids`` attribute (one id list per text).

    Returns:
        List of booleans, one per row: ``True`` if the unknown-token id occurs
        in the token ids of *any* of the listed columns.
    """
    ids_by_col = {col: tokenizer(examples[col]).input_ids for col in cols}
    # All columns are expected to have the same length as the first one
    n_rows = len(examples[cols[0]])
    return [
        any(tokenizer.unk_token_id in ids_by_col[col][row] for col in cols)
        for row in range(n_rows)
    ]


def detect_only_unk_batched(examples, cols, tokenizer):
    """Flag the rows of a batch where some column consists solely of unknown tokens.

    Args:
        examples: Batch mapping column name -> list of texts.
        cols: Column names to tokenize; the first one determines the row count.
        tokenizer: Callable tokenizer exposing ``unk_token_id`` and accepting
            ``add_special_tokens``; its result has an ``input_ids`` attribute
            (one id list per text).

    Returns:
        List of booleans, one per row: ``True`` if *any* listed column has at
        least one token and *all* of its tokens are the unknown token.
    """
    # Tokenize without special tokens. With the default settings each sequence
    # is wrapped in special markers (e.g. [CLS]/[SEP] for BERT-style
    # tokenizers); those are never the unknown token, so the "all tokens are
    # unknown" test below could never be satisfied.
    batch_ids_dict = {
        col: tokenizer(examples[col], add_special_tokens=False).input_ids
        for col in cols
    }
    unk_id = tokenizer.unk_token_id

    def _is_only_unk(token_ids):
        # An empty sequence contains no unknown tokens, so it does not count
        # (`all([])` alone would be True).
        return len(token_ids) > 0 and all(token_id == unk_id for token_id in token_ids)

    # All columns are expected to have the same length as the first one
    n_rows = len(examples[cols[0]])
    return [
        any(_is_only_unk(batch_ids_dict[col][row]) for col in cols)
        for row in range(n_rows)
    ]


# Named constants for the special characters handled by the replacement step
# below (see `blacklists` and `blacklist_replace_dict`).
curly_lsquote = '\u2018'  # U+2018 LEFT SINGLE QUOTATION MARK
curly_rsquote = '\u2019'  # U+2019 RIGHT SINGLE QUOTATION MARK
curly_ldquote = '\u201C'  # U+201C LEFT DOUBLE QUOTATION MARK
curly_rdquote = '\u201D'  # U+201D RIGHT DOUBLE QUOTATION MARK
em_dash = '\u2014'  # U+2014 EM DASH
double_plus = '\u29FA'  # U+29FA DOUBLE PLUS
ellipsis = '\u2026'  # U+2026 HORIZONTAL ELLIPSIS
en_dash = '\u2013'  # U+2013 EN DASH
uml_I = "\u00CF"  # U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
delta = "\u2206"  # U+2206 INCREMENT
delta2 = "\u25B3"  # U+25B3 WHITE UP-POINTING TRIANGLE
degree = "\u02DA"  # U+02DA RING ABOVE (not U+00B0 DEGREE SIGN)
# ١ (U+0661 ARABIC-INDIC DIGIT ONE)
one = "\u0661"
# ٩ (U+0669 ARABIC-INDIC DIGIT NINE)
nine = "\u0669"
# Å written as "A" followed by U+030A COMBINING RING ABOVE
AA = "\u0041\u030A"
# ü written as "u" followed by U+0308 COMBINING DIAERESIS
ue = "\u0075\u0308"
heart = "\u2661"  # U+2661 WHITE HEART SUIT

# blacklist = [
#     curly_lsquote, curly_rsquote, curly_ldquote, curly_rdquote, em_dash, double_plus, ellipsis, en_dash, uml_I, delta
# ]

# Characters to replace before tokenization, keyed by `dataset_id`.
# Every entry must also be a key of `blacklist_replace_dict`.
blacklists = {
    "distilbert_base_multi_cased": [curly_lsquote, curly_rsquote, curly_ldquote, curly_rdquote, em_dash, double_plus, ellipsis, en_dash, uml_I, delta, AA, degree, ue],
    # `one` was previously listed twice; the duplicate alternative was redundant
    "minilm_l6": [delta2, degree, one, double_plus, nine, heart]
}

# Alternation matching any blacklisted entry of the active dataset. Entries are
# escaped so they are always matched literally, even if a regex metacharacter
# (e.g. "+" or ".") is added to a blacklist later.
blacklist_pattern = re.compile(
    "|".join(re.escape(entry) for entry in blacklists[dataset_id])
)

# Plain-text replacement for each special character. `token_replacer` indexes
# this dict directly with the matched text, so every blacklisted entry needs a
# key here.
blacklist_replace_dict = {
    curly_lsquote: "'",
    curly_rsquote: "'",
    curly_ldquote: '"',
    curly_rdquote: '"',
    em_dash: "-",
    double_plus: "++",
    ellipsis: "...",
    en_dash: "-",
    uml_I: "I",
    delta: "delta",
    AA: "AA",
    ue: "ue",
    degree: "degrees",
    delta2: "delta",
    one: "1",
    nine: "9",
    heart: "heart",
}


def token_replacer(match):
    """Return the replacement text for one regex match of a blacklisted entry.

    Intended as the ``repl`` callback of ``blacklist_pattern.sub``. Raises
    ``KeyError`` if the matched text has no entry in ``blacklist_replace_dict``.
    """
    matched_text = match.group(0)
    return blacklist_replace_dict[matched_text]


# def replace_known_unk_tokens_batched(examples):
#     artists = []
#     albums = []
#     reviews = []
#     reviewers = []
    
#     for artist, album, review, reviewer in zip(examples["artist"], examples["album"], examples["review"], examples["reviewer"]):
#         artist_ = blacklist_pattern.sub(token_replacer, html.unescape(artist))
#         album_ = blacklist_pattern.sub(token_replacer, html.unescape(album))
#         review_ = blacklist_pattern.sub(token_replacer, html.unescape(review))
#         reviewer_ = blacklist_pattern.sub(token_replacer, html.unescape(reviewer))
#         artists.append(artist_)
#         albums.append(album_)
#         reviews.append(review_)
#         reviewers.append(reviewer_)
        
#     return {"artist": artists, "album": albums, "review": reviews, "reviewer": reviewers}


def replace_known_unk_tokens_batched(examples, cols):
    """Clean the given text columns of a batch.

    Each value is HTML-unescaped and then every match of ``blacklist_pattern``
    is substituted via ``token_replacer``.

    Args:
        examples: Batch mapping column name -> list of strings.
        cols: Column names to clean; the first one determines the row count.

    Returns:
        Dict mapping each column in ``cols`` to its list of cleaned strings.
    """
    def _clean(text):
        # Unescape HTML entities first so the decoded characters can be matched too
        return blacklist_pattern.sub(token_replacer, html.unescape(text))

    # All columns are expected to have the same length as the first one
    n_rows = len(examples[cols[0]])
    return {
        col: [_clean(examples[col][row]) for row in range(n_rows)]
        for col in cols
    }

# def replace_known_unk_tokens(examples):
#     artist = blacklist_pattern.sub(token_replacer, html.unescape(example["artist"]))
#     album = blacklist_pattern.sub(token_replacer, html.unescape(example["album"]))
#     review = blacklist_pattern.sub(token_replacer, html.unescape(example["review"]))
#     reviewer = blacklist_pattern.sub(token_replacer, html.unescape(example["reviewer"]))
#     return {"artist": artist, "album": album, "review": review, "reviewer": reviewer}

def replace_known_unk_tokens(examples, cols):
    """Non-batched variant: clean the given text columns of a single example.

    Each value is HTML-unescaped and then every match of ``blacklist_pattern``
    is substituted via ``token_replacer``.

    Args:
        examples: Mapping column name -> string for one example.
        cols: Column names to clean.

    Returns:
        Dict mapping each column in ``cols`` to its cleaned string.
    """
    return {
        col: blacklist_pattern.sub(token_replacer, html.unescape(examples[col]))
        for col in cols
    }

In [74]:
# The raw download only has a "train" split; all cleaning below starts from it.
dataset = raw_datasets["train"]

In [75]:
# The artist, album, review, and reviewer columns should be strings (e.g., should not be None)
dataset = dataset.filter(
    # Keep only rows where every listed column holds a `str`
    lambda examples: detect_wrong_type(examples, ["artist", "album", "review", "reviewer"], str),
    batched=True,
    num_proc=num_cores_avail
)

# Replace known "unk" tokens
dataset = dataset.map(
    # HTML-unescape the text columns and substitute the blacklisted characters
    lambda examples: replace_known_unk_tokens_batched(examples, ["artist", "album", "review", "reviewer"]),
    batched=True,
    num_proc=num_cores_avail
)

In [76]:
# Inspect the dataset after the type filter and character replacement.
dataset

Dataset({
    features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
    num_rows: 23034
})

In [77]:
# Add a "review_n_tokens" column holding the untruncated token count of each review.
dataset = dataset.map(
    # Batched token counting for the "review" column
    lambda examples: get_n_tokens_batched(examples, "review", tokenizer),
    batched=True,
    num_proc=num_cores_avail
)

# Drop fully duplicated rows via a pandas round-trip; the index is reset so the
# rebuilt dataset gets a fresh 0..n-1 index.
dataset = Dataset.from_pandas(
    dataset.to_pandas().drop_duplicates().reset_index(drop=True)
)

Map (num_proc=15):   0%|          | 0/23034 [00:00<?, ? examples/s]

In [78]:
# Rows whose artist or album still contains an unknown token after the
# replacement step above.
dataset_leftover = dataset.filter(
    # True when the unknown-token id appears in the artist or album token ids
    lambda examples: detect_unk_batched(examples, ["artist", "album"], tokenizer),
    batched=True
)

Filter:   0%|          | 0/22063 [00:00<?, ? examples/s]

In [79]:
# Number of rows that still contain an unknown token in artist or album.
len(dataset_leftover)

16

In [80]:
# Collect the distinct substrings that the tokenizer maps to its unknown token,
# recovered from the character offsets of each unknown token id.
# NOTE(review): `dataset_leftover` was selected on unknown tokens in
# "artist"/"album", but the text scanned here is "review" — confirm this is intended.
unk_tokens = set()
for i in range(len(dataset_leftover)):
    text = dataset_leftover[i]["review"]
    inputs = tokenizer(text, return_offsets_mapping=True)
    ids = inputs.input_ids
    offsets = inputs.offset_mapping

    # Named `token_id` rather than `id` so the builtin `id()` is not shadowed
    # in the kernel's global namespace for later cells.
    for token_id, (start, end) in zip(ids, offsets):
        if token_id == tokenizer.unk_token_id:
            unk_tokens.add(text[start:end])

In [81]:
# Print the collected substrings separated by spaces (set iteration order is arbitrary).
print(*unk_tokens)

霊 重 殺 物 蒸 七 乱 疊 來 偉 鬼 幽 卡 愚 浴 曜 所 與 呼 YTI⅃AƎЯ 詩 冥 去 狗 戰 玉 敗 象 客 印 波 박혜진 會 廁 개꿈 只 念 節 靈 閃 ゾット 夢 市 轉 過 共 希 ؟ 駭 矮 音 奏 佛 九 观 隠 吸


In [82]:
# Rows for which `detect_only_unk_batched` reports that the artist or album
# consists solely of unknown tokens (this reuses the `dataset_leftover` name).
dataset_leftover = dataset.filter(
    # Batched predicate over the artist and album columns
    lambda examples: detect_only_unk_batched(examples, ["artist", "album"], tokenizer),
    batched=True
)

Filter:   0%|          | 0/22063 [00:00<?, ? examples/s]

In [83]:
# Inspect how many rows were flagged by the only-unknown filter.
dataset_leftover

Dataset({
    features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
    num_rows: 0
})

In [84]:
# Metadata view of the dataset without the four columns listed below.
# `remove_columns` drops them directly and returns a new dataset; `dataset`
# itself is unchanged. This replaces a function-less `map` call that iterated
# over every row just to remove columns.
meta_dataset = dataset.remove_columns(
    ["year_released", "small_text", "album_art_url", "review"]
)

Map:   0%|          | 0/22063 [00:00<?, ? examples/s]

In [85]:
# Convert the metadata view to a pandas DataFrame; it is written to CSV at the end.
meta_dataset_df = meta_dataset.to_pandas()

In [86]:
# Row count of the cleaned, de-duplicated dataset that is split below.
len(dataset)

22063

In [87]:
# Going with a 70-15-15 train-val-test split.
# 70% for training is solid for fine-tuning.
# 15% each for val and test for reliable overfitting estimates and testing.
# A 60-20-20 split would be better with a smaller dataset or a simpler model.

# Fixed seed so that re-running the notebook reproduces exactly the same splits.
# NOTE: introducing the seed changes row membership relative to any split that
# was saved from an earlier, unseeded run.
SPLIT_SEED = 42

# First, split the dataset into train (70%) and a temporary held-out part (30%)
datasets = dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)

# Now, split the held-out part evenly into validation and test sets
datasets_val_test = datasets.pop("test").train_test_split(test_size=0.5, seed=SPLIT_SEED)
datasets["validation"] = datasets_val_test.pop("train")
datasets["test"] = datasets_val_test.pop("test")

In [88]:
# Inspect the resulting train/validation/test splits and their sizes.
datasets

DatasetDict({
    train: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
        num_rows: 15444
    })
    validation: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
        num_rows: 3309
    })
    test: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
        num_rows: 3310
    })
})

In [92]:
# Persist the results under `root_dataset_dir`: the metadata table as CSV
# (without the pandas index) and the train/validation/test splits via `save_to_disk`.
meta_dataset_df.to_csv(f"{root_dataset_dir}/meta_df.csv", index=False)
datasets.save_to_disk(f"{root_dataset_dir}/dataset")

Saving the dataset (0/1 shards):   0%|          | 0/15444 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3309 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3310 [00:00<?, ? examples/s]