In [1]:
import multiprocessing
import html
import re
import yaml
from pathlib import Path

import torch
from datasets import load_dataset
from transformers import AutoTokenizer

In [2]:
# Worker count for parallel `Dataset.map` calls: all CPU cores but one,
# clamped so it never falls below a single worker.
num_cores_avail = max(multiprocessing.cpu_count() - 1, 1)

In [3]:
# Model checkpoint used for tokenization, pinned to an exact revision hash.
model_checkpoint = "distilbert-base-multilingual-cased"
model_checkpoint_revision = "9e90d6dd84b6a2e4d65e4d751baed6cd56578fd3"
# Alternative checkpoint (disabled):
#   "distilbert-base-cased" @ 0dacbb01d604f8adeeb5b87c9339e485ac40d5c0

# Dataset repository, likewise pinned to an exact revision hash.
dataset_checkpoint = "mattismegevand/pitchfork"
dataset_checkpoint_revision = "4d88fdd126d4dc1aa70ac0dbee25fd0020dd9690"

In [4]:
# Tokenizer for the configured model checkpoint, loaded at the pinned revision.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, revision=model_checkpoint_revision)

In [5]:
# `data_files` must name "reviews.csv" explicitly; otherwise the loader
# defaults to the album images stored in the same repository.
raw_datasets = load_dataset(
    dataset_checkpoint, data_files=["reviews.csv"], revision=dataset_checkpoint_revision
)

In [6]:
# Show the loaded splits with their column names and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
        num_rows: 25709
    })
})

In [7]:
# Work with the "train" split from here on.
dataset = raw_datasets["train"]

# Clean raw dataset

In [8]:
def detect_nonstring(examples):
    """Build a per-row keep-mask for a batch of examples.

    Despite the name, an entry is ``True`` when the row is fully
    string-valued, which makes the result directly usable as a batched
    ``Dataset.filter`` predicate that drops rows with non-string values.

    Args:
        examples: Batch mapping of column name -> list of values. The
            "artist", "album", "review" and "reviewer" columns are read.

    Returns:
        A list of bools, one per row: ``True`` only if all four inspected
        values in that row are ``str`` instances.
    """
    rows = zip(
        examples["artist"],
        examples["album"],
        examples["review"],
        examples["reviewer"],
    )
    return [all(isinstance(value, str) for value in row) for row in rows]


def get_review_n_tokens(examples):
    """Count the token ids produced for each review in a batch.

    Uses the notebook-level ``tokenizer``. Counts are taken from the
    returned ``input_ids`` as-is, so any special tokens the tokenizer adds
    are included in the totals.

    Args:
        examples: Batch mapping whose "review" entry is a list of strings.

    Returns:
        A dict with a single key, "review_n_tokens", holding one integer
        count per review, in input order.
    """
    # truncation=False so that long reviews report their full length.
    encoded = tokenizer(examples["review"], truncation=False)
    return {"review_n_tokens": [len(ids) for ids in encoded.input_ids]}


def unk_detect_batched(examples):
    """Mark rows whose artist or album contains the tokenizer's unknown token.

    Uses the notebook-level ``tokenizer``; suitable as a batched
    ``Dataset.filter`` predicate that keeps only the affected rows.

    Args:
        examples: Batch mapping with "artist" and "album" lists of strings.

    Returns:
        A list of bools, one per row: ``True`` when ``tokenizer.unk_token_id``
        appears in the token ids of the artist or of the album.
    """
    unk_id = tokenizer.unk_token_id
    artist_batch = tokenizer(examples["artist"]).input_ids
    album_batch = tokenizer(examples["album"]).input_ids
    return [
        unk_id in artist_ids or unk_id in album_ids
        for artist_ids, album_ids in zip(artist_batch, album_batch)
    ]


# Typographic / special characters that are swapped for plain-ASCII stand-ins
# before tokenization (presumably because the tokenizer maps them to its
# unknown token -- confirm against `unk_detect_batched` results).
curly_lsquote = '\u2018'
curly_rsquote = '\u2019'
curly_ldquote = '\u201C'
curly_rdquote = '\u201D'
em_dash = '\u2014'
double_plus = '\u29FA'
ellipsis = '\u2026'
en_dash = '\u2013'
uml_I = "\u00CF"
# NOTE: U+2206 is the INCREMENT sign, not Greek capital delta (U+0394).
delta = "\u2206"

# Single source of truth: blacklisted character -> ASCII replacement.
blacklist_replace_dict = {
    curly_lsquote: "'",
    curly_rsquote: "'",
    curly_ldquote: '"',
    curly_rdquote: '"',
    em_dash: "-",
    double_plus: "++",
    ellipsis: "...",
    en_dash: "-",
    uml_I: "I",
    delta: "delta"
}

# Derived from the mapping (same order as before) so the pattern can never
# match a character that has no replacement, which would raise KeyError in
# `token_replacer`.
blacklist = list(blacklist_replace_dict)

# Escape every entry so that a future addition containing a regex
# metacharacter is still matched literally.
blacklist_pattern = re.compile("|".join(re.escape(token) for token in blacklist))


def token_replacer(match):
    """Substitution callback that maps a matched character to its replacement.

    Args:
        match: Match object, as passed by ``blacklist_pattern.sub``.

    Returns:
        The ``blacklist_replace_dict`` entry for the full matched text.

    Raises:
        KeyError: If the matched text has no entry in
            ``blacklist_replace_dict``.
    """
    matched_text = match.group(0)
    return blacklist_replace_dict[matched_text]


def replace_known_unk_tokens_batched(examples):
    """Clean the text columns of a batch for use with batched ``Dataset.map``.

    Each value is first HTML-unescaped, then every character matched by
    ``blacklist_pattern`` is replaced via ``token_replacer``.

    Args:
        examples: Batch mapping with "artist", "album", "review" and
            "reviewer" lists of strings.

    Returns:
        A dict with those same four keys, each holding the cleaned strings
        in input order.
    """
    columns = ("artist", "album", "review", "reviewer")

    def scrub(text):
        # Decode HTML entities before substituting blacklisted characters.
        return blacklist_pattern.sub(token_replacer, html.unescape(text))

    # Walk the batch row by row, as zip() over the four columns.
    cleaned_rows = [
        [scrub(value) for value in row]
        for row in zip(*(examples[name] for name in columns))
    ]
    return {
        name: [row[position] for row in cleaned_rows]
        for position, name in enumerate(columns)
    }


def replace_known_unk_tokens(examples):
    """Clean the text fields of a single row (non-batched ``Dataset.map``).

    Per-row counterpart of ``replace_known_unk_tokens_batched``: each value
    is HTML-unescaped, then every character matched by ``blacklist_pattern``
    is replaced via ``token_replacer``.

    Args:
        examples: Mapping for ONE row, with string values under "artist",
            "album", "review" and "reviewer". The plural parameter name is
            kept for backward compatibility.

    Returns:
        A dict with those same four keys mapped to the cleaned strings.
    """
    # Bug fix: the body used to read the undefined name `example`, so every
    # call raised NameError; it now reads from the actual parameter.
    return {
        column: blacklist_pattern.sub(token_replacer, html.unescape(examples[column]))
        for column in ("artist", "album", "review", "reviewer")
    }

In [9]:
# Every column we intend to analyze must hold non-missing (string) data,
# so drop any row where artist, album, review or reviewer is not a string.
dataset = dataset.filter(detect_nonstring, batched=True)

# Replace known "unk" tokens
dataset = dataset.map(replace_known_unk_tokens_batched, batched=True)

In [10]:
# Check the columns and remaining row count after filtering and cleaning.
dataset

Dataset({
    features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
    num_rows: 23034
})

In [11]:
# Add a "review_n_tokens" column, tokenizing in parallel worker processes.
dataset = dataset.map(get_review_n_tokens, num_proc=num_cores_avail, batched=True)

In [12]:
# Rows whose artist or album still tokenizes to an unknown token after cleaning.
dataset_leftover = dataset.filter(unk_detect_batched, batched=True)

In [13]:
# How many rows still contain an unknown token?
dataset_leftover

Dataset({
    features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
    num_rows: 5
})

In [14]:
# Metadata-only view: drop the columns that are not needed in the exported
# metadata table. `remove_columns` is the dedicated API for this; calling
# `map()` without a function runs an identity pass over every row just to
# drop columns.
meta_dataset = dataset.remove_columns(
    ["year_released", "small_text", "album_art_url", "review"]
)

In [15]:
# Convert the metadata-only dataset to a pandas DataFrame for CSV export.
meta_dataset_df = meta_dataset.to_pandas()

In [16]:
# Make sure the output directory exists (no error if it already does).
Path("../data/pitchfork").mkdir(parents=True, exist_ok=True)

# Persist the metadata table as CSV (without the index column) and the full
# cleaned dataset, including the review text, via `save_to_disk`.
meta_dataset_df.to_csv("../data/pitchfork/meta_df.csv", index=False)
dataset.save_to_disk("../data/pitchfork/dataset")

Saving the dataset (0/1 shards):   0%|          | 0/23034 [00:00<?, ? examples/s]