# Parameters

In [None]:
# Toggle for the intermediate-result previews: cells that end in
# `... if show_intermediate_results else None` only collect and display
# their table when this is True.
show_intermediate_results = True

# Preparations

## Imports

In [None]:
import polars as pl

# Lazy query over the playlist metadata file; data is only materialized
# by the `.collect(...)` calls made on `playlists` or on queries derived from it.
playlists = pl.scan_parquet('../processed_data/data_playlist_metadata.parquet')

## Playlist Source Data

In [3]:
# Materialize the complete source table (streaming engine) so it is displayed as the cell output.
playlists.collect(engine='streaming')

# Analysis

Step 1: Tokenize the playlist names by splitting on whitespace.

We currently turn every word into its own separate keyword term.
As a later optimization, it might make sense to treat words that most often
occur together (e.g. `late night`) as a single term to make the output more useful.

In [None]:
# Break every lower-cased playlist name into its distinct terms.
#
# `str.extract_all(r'\S+')` collects each run of non-whitespace characters,
# so any whitespace (blanks, tabs, newlines, repeated separators) delimits
# terms, not just the literal space character. It never yields empty strings,
# so no follow-up filtering of '' entries is required.
# `list.unique(maintain_order=True)` keeps one occurrence of each term per
# playlist, in order of first appearance.
playlists_tokenized = playlists.select(
    pl.col('playlist.id'),
    pl.col('playlist.name'),
    pl.col('playlist.name')
    .str.to_lowercase()
    .str.extract_all(r'\S+')
    .list.unique(maintain_order=True)
    .alias('unique_terms'),
)

# Notebook display: only collect and show the table when previews are enabled.
playlists_tokenized.collect(engine='streaming') if show_intermediate_results else None

Step 2: Aggregate over playlist terms

In [None]:
# Unnest the per-playlist term lists into one row per (playlist, term) pair
# and give the unnested column a singular name.
exploded_playlists_tokenized = (
    playlists_tokenized
    .explode('unique_terms')
    .rename({'unique_terms': 'term'})
)

# Preview of the first 100 rows, shown only when previews are enabled.
(
    exploded_playlists_tokenized.limit(100).collect(engine='streaming')
    if show_intermediate_results
    else None
)

In [None]:
# Per-term frequency table: the number of (non-null) occurrences of each term
# across the exploded rows, most frequent terms first.
tokens = (
    exploded_playlists_tokenized
    .group_by('term')
    .agg(playlist_count=pl.col('term').count())
    .sort('playlist_count', descending=True)
)

Review query plan for potential performance/memory problems:

In [None]:
# Draw the optimized physical plan of the `tokens` query for the streaming engine.
tokens.show_graph(plan_stage='physical', engine='streaming', optimized=True)

In [None]:
# Show only the terms whose count is at least 100.
tokens.filter(pl.col('playlist_count') >= 100).collect(engine='streaming')

In [None]:
# Optional export, disabled by default: uncomment the line below to stream all
# terms with a count of at least 20 into `playlist_keywords.csv`.
# tokens.filter(pl.col('playlist_count').ge(20)).sink_csv('playlist_keywords.csv', engine='streaming')

The following discoveries were made when manually reviewing the CSV data:

- Also split on and remove common punctuation (`(`, `)`, `[`, `]`, `:`, `#`, etc.)
- Remove certain common words that do not provide any information:
  - on
  - by
  - with
  - at
  - and
  - a
  - I
  - ...
- Unify `90's`/`90s` etc.
- Unify `bday`/`birthday`/`b-day` etc.
- Check correlations between consecutive words