# Preparations

## Imports

In [None]:
import polars as pl

# Lazily scan the preprocessed parquet files; each name is bound to a
# LazyFrame, so no data is read until a query is collected.
playlists, playlist_tracks, tracks = (
    pl.scan_parquet(parquet_path)
    for parquet_path in (
        '../processed_data/data_playlist_metadata.parquet',
        '../processed_data/data_playlist_songs.parquet',
        '../processed_data/data_song_metadata.parquet',
    )
)

# Analysis

## Tokenization

In [None]:
def tokenize(expr: pl.Expr) -> pl.Expr:
    """Lowercase a string expression and split it on single spaces.

    Runs of consecutive spaces produce empty-string tokens; they are not
    removed here.
    """
    lowered = expr.str.to_lowercase()
    return lowered.str.split(' ')


def tokenize_unique(expr: pl.Expr) -> pl.Expr:
    """Tokenize ``expr`` and keep every non-empty token once.

    Duplicates are dropped while the order of first occurrence is kept.
    """
    raw_tokens = tokenize(expr)
    # Splitting on a single space leaves '' entries for repeated spaces.
    non_empty = raw_tokens.list.filter(pl.element() != '')
    return non_empty.list.unique(maintain_order=True)


def tokenize_filtered(expr: pl.Expr) -> pl.Expr:
    """Return the unique tokens of ``expr`` that look like actual words.

    On top of ``tokenize_unique`` this drops:

    * tokens that are a plain number or a ``number-number`` range
      (typically years and BPM ranges), and
    * tokens that do not contain a single letter.

    Letters are matched with the Unicode class ``\\p{L}``. The POSIX class
    ``[[:alpha:]]`` is ASCII-only in the regex engine used by polars, so it
    would discard tokens written entirely in non-Latin scripts.
    """
    return (
        tokenize_unique(expr)
        # Filter out years & BPM ranges
        .list.filter(~pl.element().str.contains(r"^([0-9]+|[0-9]+-[0-9]+)$"))
        # Filter out stuff consisting only of non-letters (any script)
        .list.filter(pl.element().str.contains(r"\p{L}"))
    )

## Playlist statistics

In [None]:
# Attach the filtered, de-duplicated terms of each playlist name.
playlists_tokenized = playlists.select(
    'playlist.id',
    'playlist.name',
    unique_terms=pl.col('playlist.name').pipe(tokenize_filtered),
)

# One row per (playlist, term) pair.
exploded_playlists_tokenized = (
    playlists_tokenized
    .explode('unique_terms')
    .rename({'unique_terms': 'term'})
)

# Per term: the number of rows carrying it (terms are unique within a
# playlist, so this is a playlist count) plus up to 20 example names.
term_occurrences = pl.col('term').count().alias('playlist_count')
example_names = pl.col('playlist.name').head(20)

tokens = (
    exploded_playlists_tokenized
    .group_by('term')
    .agg(term_occurrences, example_names)
    .sort('playlist_count', descending=True)
)

# Only show terms that occur in at least 100 playlists.
tokens.filter(pl.col('playlist_count') >= 100).collect(engine='streaming')