# Preparations

## Imports

In [None]:
import polars as pl

# Set up lazy scans over the pre-processed parquet files; the queries
# built on top of these frames are only executed when collected.
playlists = pl.scan_parquet(
    '../processed_data/data_playlist_metadata.parquet',
)
playlist_tracks = pl.scan_parquet(
    '../processed_data/data_playlist_songs.parquet',
)
tracks = pl.scan_parquet(
    '../processed_data/data_song_metadata.parquet',
)

# Analysis

## Tokenization

In [None]:
def tokenize(expr: pl.Expr) -> pl.Expr:
    """Lower-case a string expression and split it into a list of tokens.

    The split happens on every single space character, so consecutive
    spaces produce empty-string tokens.
    """
    lowered = expr.str.to_lowercase()
    return lowered.str.split(' ')


def tokenize_unique(expr: pl.Expr) -> pl.Expr:
    """Tokenize a string expression, dropping empty and duplicate tokens.

    Duplicates are removed while keeping the order of first occurrence.
    """
    raw_tokens = tokenize(expr)
    non_empty = raw_tokens.list.filter(pl.element() != '')
    return non_empty.list.unique(maintain_order=True)


def tokenize_filtered(expr: pl.Expr) -> pl.Expr:
    """Tokenize a string expression and keep only unique, word-like tokens.

    Builds on ``tokenize_unique`` and additionally drops:

    * tokens that are a plain number or a numeric range such as ``"2019"``
      or ``"90-110"`` (years, BPM ranges), and
    * tokens that do not contain a single letter (e.g. ``"&"`` or ``"-"``).
    """
    return (
        tokenize_unique(expr)
        # Filter out years & BPM ranges. The letter filter below would
        # remove these as well; the explicit rule documents the intent.
        .list.filter(~pl.element().str.contains(r"^([0-9]+|[0-9]+-[0-9]+)$"))
        # Filter out tokens consisting only of non-letters. \p{L} matches a
        # letter of any script; the POSIX class [[:alpha:]] is ASCII-only in
        # the regex engine and would also discard tokens written entirely
        # in non-Latin scripts.
        .list.filter(pl.element().str.contains(r"\p{L}"))
    )

## Playlist statistics

In [None]:
# Registry of search terms, keyed by category name.
patterns: dict[str, list[str]] = dict()

In [None]:
# Terms describing the occasion a playlist was put together for.
# This is a list (not a set) so that it matches the declared
# ``dict[str, list[str]]`` type of ``patterns`` and the other pattern
# categories, and so that its iteration order is deterministic.
patterns["context"] = pattern_context = [
    # Competitions
    "comp",
    "comps",
    "competition",
    "finals",
    "heat",  # "heat 1"
    "jnj",
    "j&j",
    "prelim",
    "prelims",
    "semis",
    "spotlight",
    "spotlights",
    "strictly",

    # Classes
    "class",
    "workshop",
    "course",

    # Warmup
    "warmup",
    # NOTE: names are tokenized by splitting on single spaces, so the
    # two-word entry "warm up" can never equal a single token.
    "warm up",
    "warm-up",

    # Parties
    "party",
    "practice",
    "praktis",
    "social",
    "socialdans",
    "sosialdans",
    "soirée",

    # WCS Rally / Flashmob
    "flashmob",
    "rally",

    # Further investigation needed
    "weekly",

    # Party phases
    "closing",
]

In [None]:
# Genre keywords; commented-out entries are candidates that are not
# (yet) part of the active list.
pattern_genres = [
    # Genres that seem to lead to precise matches
    "blues",
    "funk",
    "hip",
    "hop",
    "motown",
    "r&b",
    "rock",  # e.g. "soft rock", "alternative rock", "pop rock"
    "soul",

    # Genres that may require a more detailed examination and may need
    # further information to identify WCS playlists
    # "kpop",
    # "musicals",
    # "pop",

    # Not exactly music genres, but with a very similar meaning in the
    # context of West Coast Swing
    # "acoustic",
    # "guitar",
    # "instrumental",
    # "late", "night",
    # "piano",

    # Qualifiers
    # "fast",
    # "slow",
    # "medium", # "medium fast", # imprecise

    # Terms that probably need further investigation; some of them seem
    # to define "musical feels"
    # "fusion", # imprecise
    # "groovy", # -"groovy tuesdays"?
    # "smooth",
    # "triple",
    # "step", # "step step"
    # "walk",
]
patterns["genre"] = pattern_genres

In [None]:
# Keywords that classify music by time frame rather than by genre.
pattern_epochs = [
    "contemporary",  # e.g. "slow contemporary"
    "oldies",
    "80s",
    "80's",
    "90s",
    "90's",
    "modern",  # imprecise
    ### "now",  # imprecise
    "throwback",
]
patterns["epoch"] = pattern_epochs

In [None]:
# Keywords that describe a mood rather than a musical genre.
pattern_moods = [
    "calm",
    "chill",
    "happy",
    "high",
    "energy",  # together: "high energy"
    "relax",
    "relaxing",
    "sad",
    "sexy",
    ### "warm", # -"warm up" # imprecise without further filtering
]
patterns["mood"] = pattern_moods

In [None]:
# Keywords registered under the "timing" category.
pattern_timing = [
    "shuffle",
    "smooth",
    "straight",
    "swung",
    "ternary",
]
patterns["timing"] = pattern_timing

In [None]:
# Keywords registered under the "topic" category.
pattern_topics = [
    "bumper",
    ### "connection", # needs further filtering
    "drill",
    "drills",
    "footwork",
    "lyrical",
    "musicality",
    "phrase",
    "phrasing",
    "rhythms",
]
patterns["topic"] = pattern_topics

In [None]:
# Language keywords.
# Note: names such as "French Open" or "German Open" likely need to be
#       filtered out when looking for e.g. "German [Songs]".
pattern_languages = [
    "deutsch",
    "french",
    "german",
    "spanish",
]
patterns["language"] = pattern_languages

In [None]:
# Keywords registered under the "weird" category.
pattern_weird = [
    "odd",
    "weird",
    "strange",
]
patterns["weird"] = pattern_weird

In [None]:
# Keywords registered under the "too_broad" category.
pattern_too_broad = [
    "dance",
    "love",
    "remix",
]
patterns["too_broad"] = pattern_too_broad

In [None]:
import itertools

# One row per playlist: id, name and the filtered unique terms of the name.
playlists_tokenized = playlists.select(
    pl.col('playlist.id'),
    pl.col('playlist.name'),
    tokenize_filtered(pl.col('playlist.name')).alias('unique_terms'),
)

# One row per (playlist, term) pair.
exploded_playlists_tokenized = (
    playlists_tokenized
    .explode('unique_terms')
    .rename({'unique_terms': 'term'})
)

# Per term: the number of rows carrying it plus up to 20 example playlist
# names, most frequent terms first.
tokens = (
    exploded_playlists_tokenized
    .group_by('term')
    .agg(
        pl.col('term').count().alias('playlist_count'),
        pl.col('playlist.name').head(20),
    )
    .sort('playlist_count', descending=True)
)

# Not referenced by the query below.
pattern_maybe_events_or_organizers = [
    "tp",
]

# Every term that already belongs to one of the registered categories.
_categorized_terms = list(itertools.chain.from_iterable(patterns.values()))

# Show the terms that are not categorized yet and occur at least 10 times.
(
    tokens
    .filter(~pl.col('term').is_in(_categorized_terms))
    .filter(pl.col('playlist_count') >= 10)
    .collect(engine='streaming')
)