[Data exploration notebook](./pitchfork_data_explore.ipynb)

In [1]:
import multiprocessing
import html
import re
import yaml
from pathlib import Path

import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

In [2]:
# Worker count for parallel dataset operations: all reported CPU cores but one,
# with a floor of one.
num_cores_avail = max(multiprocessing.cpu_count() - 1, 1)

In [3]:
# Name of the experiment config directory; the next cell reads
# ../experiments/configs/<config_id>/main.yaml.
config_id = "mlml6_rate_pred_cls"

In [4]:
# Parse the experiment's main YAML config into `main_config`.
main_config_path = Path(f"../experiments/configs/{config_id}/main.yaml")
with main_config_path.open('r') as config_file:
    main_config = yaml.safe_load(config_file)

In [5]:
# Pull the settings this notebook uses out of the config in one step; a missing
# key raises KeyError.
(
    dataset_checkpoint,
    dataset_checkpoint_revision,
    pt_model_checkpoint,
    pt_model_checkpoint_revision,
    dataset_id,
) = (
    main_config[config_key]
    for config_key in (
        "dataset_checkpoint",
        "dataset_checkpoint_revision",
        "pt_model_checkpoint",
        "pt_model_checkpoint_revision",
        "dataset_id",
    )
)

In [6]:
# Tokenizer of the pretrained model checkpoint, pinned to the configured revision.
tokenizer = AutoTokenizer.from_pretrained(pt_model_checkpoint, revision=pt_model_checkpoint_revision)

In [7]:
# Request "reviews.csv" explicitly: per the original note, the loader otherwise
# defaults to the album images.
review_data_files = ["reviews.csv"]
raw_datasets = load_dataset(
    dataset_checkpoint, revision=dataset_checkpoint_revision, data_files=review_data_files
)

In [8]:
# Display the loaded DatasetDict to inspect its splits, columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
        num_rows: 25709
    })
})

In [9]:
# Work with the "train" split, the only split shown in the DatasetDict above.
dataset = raw_datasets["train"]

# Clean raw dataset

In [10]:
def detect_nonstring(examples):
    """Batched filter predicate: keep rows whose text columns are all strings.

    Args:
        examples: Batch mapping with "artist", "album", "review" and
            "reviewer" columns, each a sequence of values.

    Returns:
        A list with one bool per row; True only when all four values in that
        row are ``str`` instances (despite the function name, True means the
        row is fully string-valued and should be kept).
    """
    text_columns = (
        examples["artist"],
        examples["album"],
        examples["review"],
        examples["reviewer"],
    )
    return [
        all(isinstance(value, str) for value in row)
        for row in zip(*text_columns)
    ]


def get_review_n_tokens(examples):
    """Batched map function: count the tokens of each review.

    Reviews are tokenized with the module-level ``tokenizer`` and
    ``truncation=False``, so each count is the full length of the id sequence
    the tokenizer returns for that review.

    Args:
        examples: Batch mapping with a "review" column of strings.

    Returns:
        ``{"review_n_tokens": [...]}`` with one integer per review.
    """
    inputs = tokenizer(examples["review"], truncation=False)
    return {"review_n_tokens": [len(inp_ids) for inp_ids in inputs.input_ids]}


def detect_unk_batched(examples):
    """Batched filter predicate: flag rows whose artist or album has an unknown token.

    Args:
        examples: Batch mapping with "artist" and "album" columns of strings.

    Returns:
        A list with one bool per row; True when the tokenized artist or the
        tokenized album contains ``tokenizer.unk_token_id``.
    """
    artist_id_lists = tokenizer(examples["artist"]).input_ids
    album_id_lists = tokenizer(examples["album"]).input_ids
    return [
        (tokenizer.unk_token_id in artist_ids) or (tokenizer.unk_token_id in album_ids)
        for artist_ids, album_ids in zip(artist_id_lists, album_id_lists)
    ]


# Characters referenced by the blacklists below. Trailing comments give the
# Unicode name of each code point; note that a few variable names are only
# approximate (e.g. `degree` is RING ABOVE, not DEGREE SIGN U+00B0).
curly_lsquote = '\u2018'  # LEFT SINGLE QUOTATION MARK
curly_rsquote = '\u2019'  # RIGHT SINGLE QUOTATION MARK
curly_ldquote = '\u201C'  # LEFT DOUBLE QUOTATION MARK
curly_rdquote = '\u201D'  # RIGHT DOUBLE QUOTATION MARK
em_dash = '\u2014'  # EM DASH
double_plus = '\u29FA'  # DOUBLE PLUS (a single character, not "++")
ellipsis = '\u2026'  # HORIZONTAL ELLIPSIS
en_dash = '\u2013'  # EN DASH
uml_I = "\u00CF"  # LATIN CAPITAL LETTER I WITH DIAERESIS
delta = "\u2206"  # INCREMENT (not GREEK CAPITAL LETTER DELTA, U+0394)
delta2 = "\u25B3"  # WHITE UP-POINTING TRIANGLE
degree = "\u02DA"  # RING ABOVE
# ١
one = "\u0661"  # ARABIC-INDIC DIGIT ONE
# ٩
nine = "\u0669"  # ARABIC-INDIC DIGIT NINE
# Å
AA = "\u0041\u030A"  # two code points: "A" + COMBINING RING ABOVE
# ü
ue = "\u0075\u0308"  # two code points: "u" + COMBINING DIAERESIS
heart = "\u2661"  # WHITE HEART SUIT

# Characters to replace before tokenization, listed per `dataset_id` value from
# the experiment config. An unknown `dataset_id` raises KeyError below.
blacklists = {
    "distilbert_base_multi_cased": [curly_lsquote, curly_rsquote, curly_ldquote, curly_rdquote, em_dash, double_plus, ellipsis, en_dash, uml_I, delta, AA, degree, ue],
    "minilm_l6": [delta2, degree, one, double_plus, nine, heart]
}

# One alternation matching any blacklisted entry. Each entry is escaped so it is
# always matched literally, even if a later addition contains a regex
# metacharacter such as "+" or ".".
blacklist_pattern = re.compile(
    "|".join(re.escape(entry) for entry in blacklists[dataset_id])
)

# ASCII replacement for every character that appears in any of the blacklists;
# `token_replacer` looks up the matched text here, so each blacklisted entry
# must have a key in this dict.
blacklist_replace_dict = {
    curly_lsquote: "'",
    curly_rsquote: "'",
    curly_ldquote: '"',
    curly_rdquote: '"',
    em_dash: "-",
    double_plus: "++",
    ellipsis: "...",
    en_dash: "-",
    uml_I: "I",
    delta: "delta",
    AA: "AA",
    ue: "ue",
    degree: "degrees",
    delta2: "delta",
    one: "1",
    nine: "9",
    heart: "heart",
}


def token_replacer(match):
    """Replacement callback for ``blacklist_pattern.sub``.

    Args:
        match: Regex match object for one blacklisted entry.

    Returns:
        The replacement string stored in ``blacklist_replace_dict`` for the
        matched text; raises KeyError if the matched text has no entry.
    """
    matched_text = match.group(0)
    return blacklist_replace_dict[matched_text]


def replace_known_unk_tokens_batched(examples):
    """Batched map function: clean the four text columns of a batch.

    Every value is first passed through ``html.unescape`` and then has each
    match of ``blacklist_pattern`` replaced via ``token_replacer``.

    Args:
        examples: Batch mapping with "artist", "album", "review" and
            "reviewer" columns of strings.

    Returns:
        A dict with the same four keys, each holding the cleaned values in
        row order.
    """
    columns = ("artist", "album", "review", "reviewer")
    cleaned = {column: [] for column in columns}

    # Walk the batch row by row (zip stops at the shortest column).
    for row in zip(*(examples[column] for column in columns)):
        cleaned_row = [
            blacklist_pattern.sub(token_replacer, html.unescape(value))
            for value in row
        ]
        for column, cleaned_value in zip(columns, cleaned_row):
            cleaned[column].append(cleaned_value)

    return cleaned


def replace_known_unk_tokens(examples):
    """Non-batched counterpart of ``replace_known_unk_tokens_batched``.

    Each of the four text fields is passed through ``html.unescape`` and then
    has every match of ``blacklist_pattern`` replaced via ``token_replacer``.

    Args:
        examples: Mapping for a single row with string values under "artist",
            "album", "review" and "reviewer". The plural name is kept so
            existing callers are unaffected. The body previously read an
            undefined name ``example`` and raised NameError on every call.

    Returns:
        A dict with the same four keys holding the cleaned strings.
    """
    return {
        column: blacklist_pattern.sub(token_replacer, html.unescape(examples[column]))
        for column in ("artist", "album", "review", "reviewer")
    }

In [11]:
# All columns we intend to analyze must have non-missing data: keep only rows
# where artist, album, review and reviewer are all strings.
dataset = dataset.filter(detect_nonstring, batched=True)

# Unescape HTML entities and replace the known "unk" characters in those columns.
dataset = dataset.map(replace_known_unk_tokens_batched, batched=True)

In [12]:
# Display the cleaned dataset to check how many rows survived the filter.
dataset

Dataset({
    features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
    num_rows: 23034
})

In [13]:
# Add a "review_n_tokens" column, tokenizing in parallel worker processes.
dataset = dataset.map(get_review_n_tokens, batched=True, num_proc=num_cores_avail)

# Drop exact duplicate rows via a pandas round trip, renumbering the index.
deduplicated_df = dataset.to_pandas().drop_duplicates()
deduplicated_df = deduplicated_df.reset_index(drop=True)
dataset = Dataset.from_pandas(deduplicated_df)

In [14]:
# Rows whose artist or album still tokenizes to an unknown token after cleaning.
dataset_leftover = dataset.filter(detect_unk_batched, batched=True)

Filter:   0%|          | 0/22063 [00:00<?, ? examples/s]

In [15]:
# Number of rows that still contain an unknown token in artist or album.
len(dataset_leftover)

16

In [16]:
# Collect the distinct substrings of the leftover rows' review text that map to
# the tokenizer's unknown token, using the offset mapping to recover the source
# characters.
# NOTE(review): `dataset_leftover` was selected on unknown tokens in the artist
# and album columns, while this scan reads the review text. Confirm that
# "review" is the intended column.
unk_tokens = set()
for text in dataset_leftover["review"]:
    inputs = tokenizer(text, return_offsets_mapping=True)
    # Named `token_id` rather than `id` so the builtin `id()` is not shadowed
    # for the rest of the kernel session.
    for token_id, (start, end) in zip(inputs.input_ids, inputs.offset_mapping):
        if token_id == tokenizer.unk_token_id:
            unk_tokens.add(text[start:end])
unk_tokens

Token indices sequence length is longer than the specified maximum sequence length for this model (773 > 512). Running this sequence through the model will result in indexing errors


{'YTI⅃AƎЯ',
 '؟',
 'ゾット',
 '七',
 '九',
 '乱',
 '佛',
 '來',
 '偉',
 '共',
 '冥',
 '卡',
 '印',
 '去',
 '只',
 '吸',
 '呼',
 '夢',
 '奏',
 '客',
 '市',
 '希',
 '幽',
 '廁',
 '念',
 '愚',
 '戰',
 '所',
 '敗',
 '曜',
 '會',
 '殺',
 '波',
 '浴',
 '物',
 '狗',
 '玉',
 '疊',
 '矮',
 '節',
 '與',
 '蒸',
 '观',
 '詩',
 '象',
 '轉',
 '過',
 '重',
 '閃',
 '隠',
 '霊',
 '靈',
 '音',
 '駭',
 '鬼',
 '개꿈',
 '박혜진'}

In [17]:
# Metadata view of the dataset: drop the listed columns (including the full
# review text). `remove_columns` does this directly, without running an
# identity map over every row as `map()` with no function would.
meta_dataset = dataset.remove_columns(
    ["year_released", "small_text", "album_art_url", "review"]
)

Map:   0%|          | 0/22063 [00:00<?, ? examples/s]

In [18]:
# Convert the metadata columns to a pandas DataFrame; it is written to CSV in the final cell.
meta_dataset_df = meta_dataset.to_pandas()

In [19]:
# Row count after cleaning and de-duplication; this is the total that gets split below.
len(dataset)

22063

In [20]:
# Going with a 70-15-15 train-val-test split.
# 70% for training is solid for fine-tuning.
# 15% each for val and test for reliable overfitting estimates and testing.
# A 60-20-20 split would be better with a smaller dataset or a simpler model.

# Fixed seed so both shuffles, and therefore the saved splits, are reproducible
# across kernel restarts.
split_seed = 42

# First, split the dataset into train (70%) and a temporary held-out set (30%).
datasets = dataset.train_test_split(test_size=0.3, seed=split_seed)

# Now, split the held-out set evenly into validation and test sets.
datasets_val_test = datasets.pop("test").train_test_split(test_size=0.5, seed=split_seed)
datasets["validation"] = datasets_val_test.pop("train")
datasets["test"] = datasets_val_test.pop("test")

In [21]:
# Display the final DatasetDict to verify the train/validation/test row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
        num_rows: 15444
    })
    validation: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
        num_rows: 3309
    })
    test: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
        num_rows: 3310
    })
})

In [22]:
# Create the per-dataset output directory (no error if it already exists), then
# write the metadata table as CSV and the three splits to disk.
root_data_dir = f"../data/pitchfork/{dataset_id}"
output_dir = Path(root_data_dir)
output_dir.mkdir(parents=True, exist_ok=True)

meta_csv_path = f"{root_data_dir}/meta_df.csv"
splits_path = f"{root_data_dir}/dataset"
meta_dataset_df.to_csv(meta_csv_path, index=False)
datasets.save_to_disk(splits_path)

Saving the dataset (0/1 shards):   0%|          | 0/15444 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3309 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3310 [00:00<?, ? examples/s]