# Preparations

## Imports

In [None]:
import polars as pl

# Lazily scan the pre-processed parquet files. Nothing is executed until a
# query built on these frames is collected.
# `playlists` is expected to provide the 'playlist.id' and 'playlist.name'
# columns that the tokenization cells select.
playlists = pl.scan_parquet('../processed_data/data_playlist_metadata.parquet')
# NOTE(review): presumably the playlist-to-song mapping; not used in the
# cells visible here.
playlist_tracks = pl.scan_parquet('../processed_data/data_playlist_songs.parquet')
# NOTE(review): presumably per-song metadata; not used in the cells visible
# here.
tracks = pl.scan_parquet('../processed_data/data_song_metadata.parquet')

# Analysis

## Tokenization

In [None]:
def tokenize(expr: pl.Expr) -> pl.Expr:
    """Lower-case a string expression and split it on single spaces.

    The result is a list-of-strings expression. Empty tokens are not removed
    here and no de-duplication happens.
    """
    lowered = expr.str.to_lowercase()
    return lowered.str.split(' ')


def tokenize_unique(expr: pl.Expr) -> pl.Expr:
    """Tokenize, drop empty tokens, and de-duplicate in first-seen order."""
    words = tokenize(expr)
    # Remove empty-string tokens before de-duplicating.
    non_empty = words.list.filter(pl.element() != '')
    return non_empty.list.unique(maintain_order=True)


def tokenize_filtered(expr: pl.Expr) -> pl.Expr:
    """Return the unique tokens of ``expr`` that look like actual words.

    Tokens consisting only of digits, or of two digit runs joined by a hyphen
    (years, BPM ranges), are dropped, as are tokens without any alphabetic
    character.
    """
    # Matches tokens that are a plain number or a "number-number" range.
    is_number_or_range = pl.element().str.contains("^([0-9]+|[0-9]+-[0-9]+)$")
    # Matches tokens containing at least one letter.
    has_letter = pl.element().str.contains("[[:alpha:]]")

    terms = tokenize_unique(expr)
    # Filter out years & BPM ranges
    terms = terms.list.filter(is_number_or_range.not_())
    # Filter out stuff consisting only of non-letters
    return terms.list.filter(has_letter)

## Playlist statistics

### Patterns

In [None]:
# Registry of known terms, keyed by category name. The cells below each add
# one category; all values are later flattened into a single exclusion list
# when scanning for terms that are not classified yet.
# Note: the "context" category is stored as a set, every other one as a list.
patterns: dict[str, "list[str] | set[str]"] = {}

#### Context & Level

In [None]:
# Terms hinting at the occasion a playlist was made for (competition, class,
# social, ...). Unlike the other categories, this one is a set, not a list.
# NOTE(review): playlist names are tokenized on single spaces, so multi-word
# entries such as "rising star" or "warm up" cannot equal a single token —
# confirm whether this collection is also matched some other way.
patterns["context"] = pattern_context = {
    # Competitions
    "comp",
    "competition",
    "comps",
    "contest", # unrelated?
    "final",
    "finals",
    "heat",  # "heat 1"
    "invitational",
    "j&j",
    "jill",
    "jnj",
    "prelim",
    "prelims",
    "semis",
    "spotlight",
    "spotlights",
    "strictly",

    # Classes
    "bootcamp",
    "class",
    "classes",
    "cours",
    "course",
    "group",  # "group class",
    "intensive",
    "kurs",
    "l1",
    "l2",
    "l3",
    "learn",  # imprecise
    "lesson",
    "lessons",
    "private",
    "privates",
    "teach", # imprecise
    "teaching",  # imprecise
    "unterricht",
    "workshop",
    "workshops",
    "ws",

    # Routines
    "proam",
    "rising", "rising star",
    "routine",
    "routines",
    "showcase",

    # Shows
    "demo",
    "show",

    # Warmup
    "warmup",
    "warmups",
    "warm up",
    "warm-up",

    # Parties
    "air",  # "open air",
    "nights",  # "social nights",
    "outdoor",
    "party",
    "practica",
    "practice",
    "practise",
    "praktis",
    "pratique",
    "social",
    "socialdans",
    "socials",
    "soiree",
    "soirée",
    "sosialdans",
    "stuggi",

    # Training?
    "training",
    "träning",
    "treino",

    # WCS Rally / Flashmob
    "flashmob",
    "rally",

    # Further investigation needed
    "choreo",
    "weekly",

    # Party phases
    "background",
    "bumper",
    "ceremony",
    "dinner",
    "opening",
    "main",
    "closing", "fin",

    # Various
    "bal",
    "camp",
    "pre-class",
    "session",
}

In [None]:
# Terms naming a skill or competition level, registered under
# patterns["level"].
patterns["level"] = pattern_levels = [
    # These can mean competition or difficulty levels
    "all-star", # "all star",
    "adv",
    "advanced",
    # The comma after "bootcamp" matters: without it Python concatenates the
    # two adjacent literals into "bootcampbeg" and "beg" is never matched.
    "bootcamp",
    "beg",
    "beginner", "friendly", "débutant",
    "beginners",
    "champs",
    "champions",
    "int",
    "inter",
    "intermediate",
    "master",
    "newbies",
    "newcomer",
    ### "nov",  # imprecise: can mean both "novice" and "november"
    "novice",

    # Difficulty levels
    "easy",
    "hard",
    "pro",
    "difficult",
]

#### Music

In [None]:
# Genre-like terms (genres, instrumentation, tempo qualifiers, "feels"),
# registered under patterns["genre"].
# NOTE(review): names are tokenized on single spaces, so "hip hop" cannot
# equal a single token; the separate "hip" and "hop" entries cover it.
patterns["genre"] = pattern_genres = [
    # The following genres seem to lead to precise matches
    "blues", "bluesy",
    "country",
    "funk",
    "hip", "hop", "hip hop", "hiphop", "hip-hop",
    "folk",
    "motown",
    "punk",
    "rap",
    "rnb", "r&b",
    "rock",  # "soft rock", "alternative rock", "pop rock",
    "soul",

    # The following genres may require more detailed examination,
    # and may need further information to identify WCS playlists
    "kpop", "k-pop",
    "musicals",
    "jazz",
    "pop",
    "trance",
    "old",  # imprecise # "old school",

    # Review these
    "indie",
    "soundtrack)",
    "lyrics",

    # These aren't exactly music genres, but have a very
    # similar meaning in the context of West Coast Swing
    "acoustic",
    "cover",
    "covers",
    "funky",
    "guitar",
    "instrumental",
    "late", "night", "nite", "latenight",
    "piano",

    # These are qualifiers
    "fast",
    "faster",
    "slow",
    "slower",
    "medium",  # "medium fast", # imprecise

    # The following terms probably need further investigation.
    # Some of them seem to define "musical feels".
    "fusion",  # imprecise
    "groovy",  # -"groovy tuesdays"?
    "smooth",
    "triple",
    "step",  # "step step"
    "walk",
]

In [None]:
# Terms that place music in a time frame or popularity bucket rather than a
# genre, registered under patterns["epoch"].
patterns["epoch"] = pattern_epochs = [
    # These classifications are based on time frames instead of genres
    "70's", "70s",
    "80s", "80's",
    "90s", "90's",
    "2000s",
    # ----------------
    "contemp", "contemporary",  # "slow contemporary",
    "modern",  # imprecise
    # "now",  # imprecise
    # ----------------
    "classics",
    "nostalgia",
    "oldies",
    "retro",
    "throwback",
    "throwbacks",
    # ----------------
    "hits",
    "bangers",
    "hit",
    "popular",
]

In [None]:
# Mood-describing terms, registered under patterns["mood"].
# Entries written side by side (e.g. "good", "vibes") are the individual
# tokens of a phrase; each is still matched as a single token.
patterns["mood"] = pattern_moods = [
    # These aren't musical genres but moods
    "calm",
    "chill",
    "happy",
    "high", "energy",  # "high energy",
    "rage",
    "relax", "relaxing",
    "sad",
    "sexy",
    # "warm", # -"warm up" # imprecise without further filtering

    # Need further investigation
    "good", "vibes",
    "good", "morning",
    "gute",
    "nice",
    "power",

    # Also need more investigation
    "beautiful",
    "chillout",
    "emotional",
    "feeling",
    "guilty",
    "hype",
    "lucky",
    "mellow",
    "moody",
    "romantic",
    "sensual",

    # More stuff
    "soft",
    "dark",  # ?

    # mood or rhythm
    "downbeat",
    "upbeat",
]

In [None]:
# Seasonal and occasion terms (birthdays, holidays, seasons), registered
# under patterns["season"].
patterns["season"] = pattern_seasons = [
    # Birthdays
    "b-day",
    "bday",
    "birthday",

    # Christmas
    "christmas",
    "noël",
    "xmas",
    "x-mas",

    # Seasons
    "autumn",
    "fall",
    "spring",  # also matches certain events
    "summer",  # also matches certain events
    "winter",  # also matches certain events

    # More
    "easter",
    "halloween",  # also matches certain events
    "holiday",
    "holidays",
    "season",
    "spooky",
    "thanksgiving",
    "valentine's",
    "valentine",
    "valentines",

    # Even more
    "beach",
    "pool",
    "silvester",
    "anniversary",
]

In [None]:
# Terms registered under patterns["timing"].
# NOTE(review): presumably these describe the rhythmic feel of the music
# (straight vs. swung) — confirm. "smooth" also appears in the genre list.
patterns["timing"] = pattern_timing = [
    "clear",
    "shuffle",
    "smooth",
    "straight",
    "swung",
    "ternary",
]

In [None]:
# Terms registered under patterns["topic"].
# NOTE(review): presumably teaching or practice topics a playlist was built
# around — confirm. Entries written side by side are tokens of one phrase.
patterns["topic"] = pattern_topics = [
    "basic",
    "basics",
    # "connection", # needs further filtering
    "drill", "drills",
    "footwork",
    "lyrical",
    "musicality", "musicalité",
    "phrase", "change", "phrasing",
    "rhythm", "rhythms",
    "structure", "bar", "count",
    "style", "styling",
    "timing",
    #
    "barre",
    "body",
    "call", "response",
    "changes",
    "critique",
    "double",
    "intro",
    "intros",
    "micro",
    "mixed",
    "motivation",
    "moves",
    "musique",
    "performance",
    "rounds",
    "single",
    "stretch",
    "switch",

    "together",
]

In [None]:
# Dance-style terms and abbreviations, registered under patterns["dance"].
# Variants with attached punctuation (e.g. "(wcs)", "wcs:") are listed
# explicitly because tokens are split on spaces only and keep punctuation.
patterns["dance"] = pattern_dances = [
    # West Coast Swing
    "wc", "wcs", "(wcs)", "wcs-", "wsc", "wcs:", "wcs.",
    "west", "coast", "swing",
    "westcoast", "westcoastswing",
    "westie:", "westie", "westies",

    # Ballroom & Latin
    "ccc",  # "cha cha", "cha cha cha",
    "cha",  # Review
    "df", "discofox",
    "lw",
    "nc2", "nc2s", "two-step",  # "nightclub twostep",
    "qs",
    "rb", "ru",  # "rumba",
    "sf",
    "tango", "argentino", "milonga",
    "ww",  # imprecise?

    # More ballroom
    "ball",
    "hochzeit",
    "tanz",
    "tanzmusik",
    "tanzparty",
    "tanzrunde",

    # Unrelated solo dancing
    "kita", "kids",
    "hh",
    "fitness",
    "pole",  # "pole dance", "pole dancing",
    "line",  # "line dance"

    # Other dance styles
    "crossover",
    "kiz", "kizomba",  # "urban kiz",
    "soltinho",  # Google says: "Brasilian couple dance, that developed from East Coast Swing and is danced to Rock, Disco and Swing."
    "swouk",
    "zouk",
]

In [None]:
# Language names (in English and in the language itself), registered under
# patterns["language"].
patterns["language"] = pattern_languages = [
    # Note: We likely want to filter out stuff like "French Open", "German Open"
    #       when we are trying to find e.g. "German [Songs]"
    "chinese",
    "english",
    "french", "française", "français",
    "german", "deutsch",
    "korean",
    "spanish",
]

#### Exploration

In [None]:
# Spelling variants of "shazam" seen in playlist names, registered under
# patterns["shazam"].
patterns["shazam"] = pattern_shazam = [
    "shazam",
    "shazams",
    "shazam-titel",
]

In [None]:
# Terms registered under patterns["explore"].
# NOTE(review): presumably these mark playlists used for collecting or
# trying out music rather than finished sets — confirm.
patterns["explore"] = pattern_exploration = [
    "dump",
    "everything",
    "experimental",
    "explore",
    "extra",
    "find",
    "finds",
    "ideas",
    "incoming",
    "inspiration",
    "interesting",
    "listen",
    "memories",
    "misc",
    "new",
    "possible",
    "potential",
    "random",
    "some",
    "something",
    "sort",
    "suggestions",
    "test",
    "trial",
    "unsorted",
]

In [None]:
# Terms marking favourites or top picks, registered under
# patterns["favorite"].
# NOTE(review): "mit star bewertet" contains spaces, so it cannot equal a
# single space-split token — confirm whether it is matched some other way.
patterns["favorite"] = pattern_favorites = [
    "fav",
    "fave",
    "faves",
    "favorite",
    "favoriter",
    "favorites",
    "favourite",
    "favourites",
    "favs",
    "liked",
    "mit star bewertet",
    "picks",
    "starred",
    "top",
    #
    "anthems",
    "ready",
    "ultimate",
]

In [None]:
# Terms flagging unusual music, registered under patterns["weird"].
patterns["weird"] = pattern_weird = [
    "odd",
    "weird",
    "strange",
]

#### Skip words

In [None]:
# Terms considered too broad to classify a playlist, registered under
# patterns["too_broad"] so they no longer show up as unclassified.
patterns["too_broad"] = pattern_too_broad = [
    "dance", "dans", "danse", "dancing",
    "drinking",
    "love",
    "remix",
    "best",  # "best of",
    "live",
    "open",
]

In [None]:
# Generic filler terms (music/playlist vocabulary, release metadata, vague
# time words), registered under patterns["too_generic"].
# Variants with attached brackets (e.g. "(deluxe", "bpm)") are listed
# explicitly because tokens are split on spaces only and keep punctuation.
patterns["too_generic"] = pattern_too_generic = [
    "(deluxe", "(deluxe)",
    "album", "release", "various", "artists",
    "alt", "backup",
    "baby",
    "battle",  # "dj battle",
    "beats",
    "bpm", "bpm:", "bpm)", "bpm]", "rpm",
    "cast",
    "collab",
    "dj", "djs", "[dj",
    "edition", "edition)",
    "friends",
    "game", "games",
    "general",
    "island",
    "jam", "jams", "jamz",  # ?
    "mini",
    "movie", "video", "motion", "picture",
    "music", "musik",
    "original", "(original",
    "part", "pt", "vol.", "volume",
    "room", "hall",
    "sélection", "shortlist", "options", "collection", "bag","jukebox",
    "set", "setlist", "played", "playlist", "playlista", "list", "lista", "mix",
    "song", "songs", "titres", "tunes", "tracks", "låtar",
    "sound", "sounds",
    "spotify",
    "stuff", "thing", "these", "shit",
    "time",
    "version", "version)",

    # Maybe indicates a party?
    "century",
    "year", "years",
    "monthly",
    "week",
    "daily", "day", "days",
    "today", "tonight",
    "afternoon",
    "first", "second", "half",
    "hour",
    "end",
]

In [None]:
# Terms to ignore outright, registered under patterns["skip"]: single
# letters, ordinals, function words in several languages, and common words
# that carry no classification value.
# A few entries repeat ("la") or also appear in other lists ("original",
# "part", "vol."); that is harmless for membership tests.
patterns["skip"] = pattern_skip = [
    "(feat.",
    "(with",
    "1h",
    "1st",
    "2nd",
    "3rd",
    "4th",
    "5th",
    "6p",
    "6th",
    "7th",
    "8th",
    "9a",
    "9th",
    "a",
    "aa",
    "about",
    "after",
    "again",
    "all",
    "am",
    "an",
    "and",
    "are",
    "as",
    "at",
    "b",
    "back",  # "back to school", "throw back",
    "be",
    "big",
    "bis",
    "bottle",
    "boy",
    "but",  # interesting
    "by",
    "c",
    "ca.",
    "can't",
    "can",
    "come",
    "d",
    "da",
    "de",
    "den",
    "die",
    "do",
    "don't",
    "drive",
    "driving",
    "du",
    "e",
    "en",
    "et",
    "f",
    "for",
    "from",
    "full",
    "für",
    "g",
    "get",
    "girl",
    "girls",
    "go",
    "god",
    "got",
    "have",
    "hd",
    "head",
    "here",
    "home",
    "i'm",
    "i",
    "if",
    "ii",
    "im",
    "in",
    "international",
    "into",
    "is",
    "it's",
    "it’s",
    "it",
    "j",
    "just",
    "k",
    "king",
    "know",
    "komplett",
    "l",
    "la",
    "la",
    "ladies",
    "lady",
    "le",
    "les",
    "let",
    "life",
    "like",
    "lil",
    "m",
    "ma",
    "make",
    "man",
    "mas",
    "me",
    "meine",
    "mes",
    "mind",
    "mine",
    "moje",
    "mom",
    "more",
    "my",
    "myself",
    "n",
    "na",
    "need",
    "next",
    "no",
    "not",  # interesting
    "o",
    "och",
    "of",
    "off",
    "oh",
    "on",
    "one",
    "only",
    "or",
    "original",
    "out",
    "på",
    "park",
    "part",
    "people",
    "play",
    "pour",
    "r",
    "ride",
    "road",
    "run",
    "s",
    "see",
    "shake",
    "so",
    "t",
    "take",
    "that",
    "the",
    "they",
    "this",
    "three",
    "till",
    "to",
    "try",
    "two",
    "u",
    "und",
    "up",
    "us",
    "utwory",
    "v",
    "van",
    "vol.",
    "w",
    "w/",
    "wanna",
    "want",
    "was",
    "way",
    "we",
    "what",
    "who",
    "will",
    "with",
    "work",
    "world",
    "x",
    "y",
    "yes",
    "you're",
    "you",
    "your",
    "z",
]

#### Months & Weekdays

In [None]:
# Weekday names and abbreviations (English and French), registered under
# patterns["weekday"]. Variants with a trailing comma are listed explicitly
# because tokens are split on spaces only and keep punctuation.
patterns["weekday"] = pattern_weekdays = [
    "mon", "mon,",
    "monday", "monday,",
    "tue", "tue,", "tues",
    "tuesday", "tuesday,",
    "tuesdays",
    "wed", "wed,",
    "wednesday", "wednesday,",
    "thu", "thu,", "thurs",
    "thursday", "thursday,",
    "fri", "fri,",
    "friday", "friday,",
    "sat", "sat,",
    "saturday", "saturday,",
    "sun", "sun,",
    "sunday", "sunday,",

    # French
    "dimanche",
    "jeudi",
    "lundi",
    "mardi",
    "samedi",
    "vendredi",
]

In [None]:
# Month names and abbreviations in English, German and French, one month
# per line, registered under patterns["month"].
patterns["month"] = pattern_months = [
    "jan", "january", "januar", "janvier",
    "feb", "february", "februar", "février",
    "mar", "march", "märz", "mars",
    "apr", "april", "avril",
    "may", "mai",
    "jun", "june", "juni", "juin",
    "jul", "july", "juli", "juillet",
    "aug", "august", "août",
    "sep", "sept", "september", "septembre",
    "oct", "october", "okt", "oktober",
    "nov", "november", "novembre",
    "dec", "december", "dez", "dezember",
]

#### DJ/Artist/Event Names

In [None]:
# Name tokens registered under patterns["dj"].
# NOTE(review): presumably DJ names or handles seen in playlist titles —
# confirm. "sara" also appears in the artist list.
patterns["dj"] = pattern_djs = [
    "alex",
    "andrzejki",
    "artur",
    "attucks",
    "beti",
    "breno",
    "david",
    "michaela",
    "dyos",
    "fuzz",
    "kia", # "dj kia",
    "lew",
    "lils",
    "lojyk's",
    "marco",
    "margies",
    "matt",
    "meech",
    "poronin",
    "psdj",
    "rick",
    "ruby",
    "sam's",
    "sara", # "dj sara",
    "shay",
    "snail]",
    "steve",
    "tein",
    "tim",
    "wah",
    "yvonne",
]

In [None]:
# Tokens of artist names, registered under patterns["artist"]. Multi-word
# names are split into their individual tokens, one name per line.
patterns["artist"] = pattern_artists = [
    # Artist names
    "bryan", "jordan",
    "coldplay",
    "ed", "sheeran",
    "greatest", "showman",
    "imagine", "dragons",
    "john", "mayer",
    "michael", "jackson",
    "mumford", "sons",
    "sam", "smith",
    "sara", "bareilles",
    "taylor", "swift",

    # Unsorted
    "amigo's",
    "billy",
    "brad",
    "chris",
    "jack",
    "james",
    "jason",
    "justin",
    "linkin",
    "prince",
    "shawn",
    "train",
    "williams",
]

In [None]:
# Tokens that may belong to event, venue, city or organizer names,
# registered under patterns["maybe_events"]. Inline comments record the
# name a token was seen in; entries marked "imprecise" also match unrelated
# names.
patterns["maybe_events"] = pattern_maybe_events_or_organizers = [
    "mcs",
    "angels",
    "antonio",
    "aw",
    "awa",
    "baltic",
    "barka",  # ?
    "bash",
    "bcsdc",
    "berlin",
    "bh",
    "boston",
    "botb",
    "brunch",
    "bsl",
    "budafest",
    "buli",
    "buma",
    "całonocna",
    "capital",  # Capital Swing
    "central",
    "chatt",
    "chicago",
    "city",  # imprecise # Westie Pink City, but also Owl City
    "classic",  # imprecise # Paris Swing Classic, Tx Classic Swing
    "club",  # imprecise # Tyrol Club of Solvay
    "code",  # imprecise?
    "collective",
    "college",
    "crush",
    "danceboston",
    "dc",
    "dcsx",
    "diego",  # Swing Diego, San Diego
    "dual",  # imprecise
    "dv",
    "elks",
    "empower",  # XPRESS EMPOWER
    "esdc",
    "farnham",
    "fest",
    "festival",
    "ff",
    "five",  # "friday five",
    "fling",
    "float",
    "fnpl",
    "fools",  # Dancing Fools
    "haifa",
    "hcs",
    "idance",  # iDance
    "infused",
    "itsallswing",
    "jj",
    "k&s",
    "keller",
    "lab",  # "Dance Lab",
    "ladc",
    "liberty",
    "madjam",  # "MADjam",
    "madness",
    "mamalist",
    "mj",
    "mwcsc",
    "mwf",
    "nordic",  # Nordic Open
    "nye",
    "nzo",
    "osaka",
    "paris",
    "phoenix",
    "pink",
    "potsdam",
    "proswingdjs",
    "push",
    "rc",
    "red",
    "roc",  # Wild Westies ROC
    "rocket",
    "rose",
    "rtb",
    "rx",
    "ryan",
    "san",
    "sea",
    "seattle",
    "shakedown",
    "silbando",
    "sofia",
    "ss",
    "ssdc",
    "st",
    "stanford",
    "street",
    "strength",  # imprecise
    "studio",
    "summit",
    "sundance",
    "swingesota",
    "swingout",  # SwingOut
    "swingsation",
    "swingside",
    "swingtacular",
    "swingtzerland",
    "switchx",  # SwitchXperience
    "synergy",
    "syracuse",  # Syracuse Socials
    "tap",  # The After Party
    "tb",
    "tds",
    "tp",
    "tsp",
    "ucswing",
    "uk",
    "ulm",
    "uptown",
    "usa",
    "wasda",
    "wcs@home",
    "wcsa",
    "wcw",
    "westiebos",
    "westival",
    "westy",
    "white",
    "whs",
    "wicked",
    "wild",
    "wotp",  # Westie On The Promenade
    "xchange",
    "xpress",  # XPRESS CLASSIC, XPRESS CONNECT
    "żaczek",
    "zf", "konobueno",
    "zonawcs",
]

### Scanning for patterns

In [None]:
import itertools

# One row per playlist: id, name, and the filtered unique tokens of the name.
playlists_tokenized = playlists.select(
    pl.col('playlist.id'),
    pl.col('playlist.name'),
    tokenize_filtered(pl.col('playlist.name')).alias('unique_terms'),
)

# One row per (playlist, term) pair.
exploded_playlists_tokenized = (
    playlists_tokenized
    .explode('unique_terms')
    .rename({'unique_terms': 'term'})
)

# Per term: how many playlists contain it, plus up to 20 example playlist
# names, ordered from most to least frequent.
tokens = (
    exploded_playlists_tokenized
    .group_by('term')
    .agg(
        pl.col('term').count().alias('playlist_count'),
        pl.col('playlist.name').head(20),
    )
    .sort('playlist_count', descending=True)
)

# Leftover terms without a better category, registered under
# patterns["misc"].
# NOTE(review): " / " contains spaces, so it cannot equal a token produced
# by splitting on spaces — confirm whether it is matched some other way.
patterns["misc"] = pattern_misc = [
    # These may warrant further investigation
    "beat",
    "danceability",
    "drops",
    "flow",
    "level",
    "tempo",
    "low",
    "high",
    " / ",

    # Sorting
    "decreasing",
    "increasing",
    "level",
    "ordered",
    "similar",
    "sorted",

    # Probably not WCS
    "tik", "tok",
]

# Terms deliberately left out of every category. They are not stored in
# `patterns`; the report below filters them out separately so they stop
# showing up as unclassified.
patterns_new_exclude = [
    "down",  # "slow down", "steady down beat, slow",
    "first",  # "first half", "first hour",
    "school",  # "old school", "middle school",

    "white",
    "rabbit",
    "rebels",  # unrelated
    "fun",  # imprecise
    "star",  # imprecise
    "warm",  # imprecise

    # context 2
    "solo",
    "ballet",

    "new",
    "neu",
    "now",
    "hot",
    "current",
]

# Working list of terms that have been seen but not yet sorted into a
# category. They are not stored in `patterns`; the report below filters
# them out separately.
patterns_new = [
    "med",
    "feel",
    "feels",
    "groove",
    "mood",
    "connect",
    "intro",
    "cool",
    "soundtrack",
    "vibe",
    "sing",
    "pre",
    "freestyle",
    "weekend",
    "classical",
    "mid",
    "musical",
    "creativity",
    "kids",
    "super",  # "super fast",
    "very",
    "last",
    "first",
    "movement",
    "karaoke",
    "break",
    "early",
    "epic",
    "requests",
    "breaks",
    "long",
    "other",
    "min",
    "fire",
    "sa",
    "maybe",
    "when",
    "little",
    "bad",
    "too",
    "vs",
    "better",
    "half",
    "made",
    "going",
    "before",
    "along",
    "start",
    "crazy",
    "right",
    "water",
    "shower",
    "magic",
    "never",
    "ok",
    "really",
    "different",
    "dreams",
    "remember",
    "taste",
    "let's",
    "fantasy",
    "still",
    "welcome",
    "than",
    "boom",
    "keep",
    "same",
    "away",
    "should",
    "inspired",
    "pretty",
    "non",
    "mit",
    "getting",
    "heart",
    "gold",
    "real",
    "check",
    "close",
]

# Report the frequent terms that are not accounted for yet. The value of this
# expression is the collected result (shown as the cell output).
(
    tokens
    .filter(
        # Drop terms that already belong to a category in `patterns`.
        pl.col('term').is_in(list(itertools.chain.from_iterable(patterns.values()))).not_(),
        # Drop terms parked in the two working lists of this cell.
        pl.col('term').is_in(patterns_new_exclude).not_(),
        pl.col('term').is_in(patterns_new).not_(),
        # Keep only terms that occur in at least 20 playlists.
        pl.col('playlist_count') >= 20,
    )
    # .with_columns(pl.col('playlist.name').list.join("|"))
    # .sink_csv('out.csv', engine='streaming')
    .collect(engine='streaming')
)