In [None]:
import numpy as np
import pandas as pd

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None  # default='warn'
import seaborn as sns

# Apply seaborn's 'white' theme to the plots drawn in this notebook.
sns.set_theme(style='white')

#### setup db connection

In [None]:
from sqlalchemy import create_engine


def postgresql_engine(user, pwd, host, port, dbname):
    """Create a SQLAlchemy engine for a PostgreSQL database.

    Parameters
    ----------
    user, pwd : str
        Database credentials. They are percent-encoded before being placed in the
        URL so that characters such as '@', ':' or '/' cannot corrupt it.
    host : str
        Database host name.
    port : str or int
        Database port.
    dbname : str
        Name of the database to connect to.

    Returns
    -------
    The engine returned by ``create_engine`` (created with ``echo=False``).
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote

    # Need psycopg2-binary package
    # 'postgresql://' is the dialect name SQLAlchemy recognises; the old
    # 'postgres://' alias is rejected by SQLAlchemy 1.4 and later.
    db_url = 'postgresql://{user}:{pwd}@{host}:{port}/{dbname}'.format(
        user=quote(user, safe=''),
        pwd=quote(pwd, safe=''),
        host=host,
        port=port,
        dbname=dbname,
    )
    sql_engine = create_engine(db_url, echo=False)
    return sql_engine

In [None]:
# DB username & password
import getpass

# Explicit prompts: with getpass' default prompt both inputs ask for "Password: ",
# which makes it impossible to tell which value is being requested.
# Both values are still read without echo, exactly as before.
username = getpass.getpass(prompt='DB username: ')
password = getpass.getpass(prompt='DB password: ')

In [None]:
# misc db parameters
database = 'musiclab'  # database name on the server
port = '5432'  # kept as a string
url = 'adds-postgres-dev.cfgztrijqgvp.us-east-1.rds.amazonaws.com'  # database host name

In [None]:
# create DB engine
# Keyword arguments make the mapping explicit: `url` holds the host name and
# `database` the database name.
engine = postgresql_engine(
    user=username,
    pwd=password,
    host=url,
    port=port,
    dbname=database,
)

#### read in raw features from postgres DB

In [None]:
# read raw features table
# The release-year filter inside the query is commented out (SQL `--`), so every row is fetched.
query_raw_features = '''
Select *
from adds_temp.ebw_raw_features_h1 as eh
--where extract (year from eh.song_release_date) >= 2020
'''

# Open a connection and a transaction on it; both context managers are exited
# (inner transaction first) as soon as the read finishes.
with engine.connect() as conn, conn.begin():
    df_raw_features = pd.read_sql(sql=query_raw_features, con=conn)

In [None]:
# write to pickle file
# df_raw_features.to_pickle('ebw_df_raw_features_h1.pkl')

In [None]:
# read in pickle file
# Rebinds `df_raw_features`, replacing whatever the database read cell loaded.
# NOTE(review): the cell that writes this pickle is commented out, so the file must
# already exist on disk for this cell to succeed.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df_raw_features = pd.read_pickle('ebw_df_raw_features_h1.pkl')

In [None]:
# explicit cast for date related fields
# The two `ddl_over_thresh_dt_*` columns are included because later cells subtract
# them from each other and compare them with `week_dt`; casting them here
# guarantees those operations run on datetime64 values. Casting a column that is
# already datetime64 leaves it unchanged.
date_cols = ['week_dt', 'song_release_date', 'hit_tag_date', 'first_spin_date', 'last_spin_date', 'ftq_date',
             'last_callout_date', 'first_callout_date', 'ddl_over_thresh_dt_first', 'ddl_over_thresh_dt_second']

df_raw_features[date_cols] = df_raw_features[date_cols].apply(pd.to_datetime)

In [None]:
# sort data
# Order rows by song, then station, then week. The existing index labels are kept
# (no reset), only the row order changes.
df_raw_features = df_raw_features.sort_values(by=['mediabase_id', 'call_letters', 'week_dt'])

#### calculated fields

In [None]:
def _weeks_between(later, earlier):
    """Build a callable returning ``df[later] - df[earlier]`` expressed in (fractional) weeks."""
    return lambda df: (df[later] - df[earlier]) / np.timedelta64(1, 'W')


def _ratio(numerator, denominator):
    """Build a callable returning ``df[numerator] / df[denominator]``."""
    return lambda df: df[numerator] / df[denominator]


# Define a dictionary of computed columns to create
# Each value is a callable that receives the frame and returns the new column.
computed_cols = {
    # running total of station spins within each (song, station) pair, in current row order
    'cuml_song_station_spins_non_on': lambda df: (
        df.groupby(['mediabase_id', 'call_letters'])['song_station_spins_non_on'].cumsum()
    ),
    'weeks_since_first_spins': _weeks_between('week_dt', 'first_spin_date'),
    # the only week-difference that is rounded to a whole number
    'weeks_since_release': lambda df: np.round(_weeks_between('week_dt', 'song_release_date')(df)),
    'weeks_since_hit': _weeks_between('week_dt', 'hit_tag_date'),
    'weeks_bw_ftq_first_spins': _weeks_between('ftq_date', 'first_spin_date'),
    'weeks_bw_top_quintiles': _weeks_between('hit_tag_date', 'ftq_date'),
    'weeks_bw_ddl_thresh_cross': _weeks_between('ddl_over_thresh_dt_second', 'ddl_over_thresh_dt_first'),
    'market_spins_propn': _ratio('song_station_spins_non_on', 'song_market_spins_non_on'),
    'artist_spins_propn': _ratio('song_station_spins_non_on', 'artist_station_spins_non_on'),
}

# `assign` accepts callables and evaluates each one against the frame; none of the
# columns above reads another computed column, so evaluation order does not matter.
df_raw_features = df_raw_features.assign(**computed_cols)

In [None]:
# Rows whose week is exactly the hit tag date, reduced to the week and the
# cumulative spins reached in that week, keyed by (song, station).
mask = df_raw_features['week_dt'].eq(df_raw_features['hit_tag_date'])
sub_df = (
    df_raw_features
    .loc[mask, ['mediabase_id', 'call_letters', 'week_dt', 'cuml_song_station_spins_non_on']]
    .set_index(['mediabase_id', 'call_letters'])
)

# Left-join those values onto every row of the same pair; the overlapping column
# names coming from `sub_df` receive the '_at_hit' suffix.
df_raw_features = df_raw_features.join(
    sub_df,
    on=['mediabase_id', 'call_letters'],
    how='left',
    rsuffix='_at_hit',
)

In [None]:
# Rows whose week is exactly the ftq date, reduced to the week and the
# cumulative spins reached in that week, keyed by (song, station).
mask = df_raw_features['week_dt'].eq(df_raw_features['ftq_date'])
sub_df = (
    df_raw_features
    .loc[mask, ['mediabase_id', 'call_letters', 'week_dt', 'cuml_song_station_spins_non_on']]
    .set_index(['mediabase_id', 'call_letters'])
)

# Left-join those values onto every row of the same pair; the overlapping column
# names coming from `sub_df` receive the '_at_ftq' suffix.
df_raw_features = df_raw_features.join(
    sub_df,
    on=['mediabase_id', 'call_letters'],
    how='left',
    rsuffix='_at_ftq',
)

In [None]:
# Rows whose week is exactly ddl_over_thresh_dt_first, reduced to the week and the
# cumulative spins reached in that week, keyed by (song, station).
mask = df_raw_features['week_dt'].eq(df_raw_features['ddl_over_thresh_dt_first'])
sub_df = (
    df_raw_features
    .loc[mask, ['mediabase_id', 'call_letters', 'week_dt', 'cuml_song_station_spins_non_on']]
    .set_index(['mediabase_id', 'call_letters'])
)

# Left-join those values onto every row of the same pair; the overlapping column
# names coming from `sub_df` receive the '_at_pre_burnout' suffix.
df_raw_features = df_raw_features.join(
    sub_df,
    on=['mediabase_id', 'call_letters'],
    how='left',
    rsuffix='_at_pre_burnout',
)

In [None]:
# Rows whose week is exactly ddl_over_thresh_dt_second, reduced to the week and the
# cumulative spins reached in that week, keyed by (song, station).
mask = df_raw_features['week_dt'].eq(df_raw_features['ddl_over_thresh_dt_second'])
sub_df = (
    df_raw_features
    .loc[mask, ['mediabase_id', 'call_letters', 'week_dt', 'cuml_song_station_spins_non_on']]
    .set_index(['mediabase_id', 'call_letters'])
)

# Left-join those values onto every row of the same pair; the overlapping column
# names coming from `sub_df` receive the '_at_burnout' suffix.
df_raw_features = df_raw_features.join(
    sub_df,
    on=['mediabase_id', 'call_letters'],
    how='left',
    rsuffix='_at_burnout',
)

In [None]:
# Cumulative spins at the hit week; used by both differences below.
spins_at_hit = df_raw_features['cuml_song_station_spins_non_on_at_hit']

# measures spins to burnout after song turns into a hit
df_raw_features['spins_to_burnout_from_hit'] = (
    df_raw_features['cuml_song_station_spins_non_on_at_burnout'].sub(spins_at_hit)
)

# measure spins to hit from ftq
df_raw_features['spins_to_hit_from_ftq'] = spins_at_hit.sub(
    df_raw_features['cuml_song_station_spins_non_on_at_ftq']
)

In [None]:
# Bucket the cumulative spins at the hit week into 250-spin wide intervals.
hit_spins_bucket_width = 250
hit_spins = df_raw_features['cuml_song_station_spins_non_on_at_hit']

# np.arange excludes its stop value, so the stop is pushed one bucket beyond the
# maximum; otherwise every value above the last multiple of 250 (including the
# maximum itself) would fall outside all bins and be bucketed as NaN.
# The intervals are right-closed, so a value of exactly 0 is still left unbucketed.
bucket_edges = np.arange(0, hit_spins.max() + hit_spins_bucket_width, hit_spins_bucket_width)

df_raw_features['hit_spins_bucket'] = pd.cut(hit_spins, bucket_edges)

In [None]:
# Display the bucket column created in the previous cell.
df_raw_features['hit_spins_bucket']

#### extract unique hit information (spins/dates)

In [None]:
# de-dupe raw features to obtain unique hit info
# The column groups below are concatenated in this order to form the tracker's columns.
id_cols = ['mediabase_id', 'call_letters']
date_cols = [
    'song_release_date', 'song_release_year', 'ftq_date', 'hit_tag_date',
    'ddl_over_thresh_dt_first', 'ddl_over_thresh_dt_second',
]
weeks_cols = ['weeks_bw_ftq_first_spins', 'weeks_bw_top_quintiles']
spins_cols = [
    'cuml_song_station_spins_non_on_at_ftq',
    'cuml_song_station_spins_non_on_at_hit',
    'cuml_song_station_spins_non_on_at_pre_burnout',
    'cuml_song_station_spins_non_on_at_burnout',
    'spins_to_hit_from_ftq',
    'spins_to_burnout_from_hit',
]

# create dataframe with unique hit information
# Rows are de-duplicated on the full set of selected columns, not on the id columns alone.
hit_info_cols = [*id_cols, *date_cols, *weeks_cols, *spins_cols]
df_hits_tracker = df_raw_features[hit_info_cols].drop_duplicates()

In [None]:
# Display the de-duplicated hit information.
df_hits_tracker

In [None]:
def _burnout_spins_summary(year_group):
    """Summarise `spins_to_burnout_from_hit` for one release-year group.

    Returns a list: [row count, 5% quantile, 95% quantile (both ignoring NaN),
    number of rows above 2500 spins, share of rows above 2500 spins].
    """
    spins = year_group['spins_to_burnout_from_hit']
    n_rows = len(year_group)
    n_over_2500 = (spins > 2500).sum()
    return [
        n_rows,
        np.nanquantile(spins, 0.05),
        np.nanquantile(spins, 0.95),
        n_over_2500,
        n_over_2500 / n_rows,
    ]


# determine spins cutoff for end of burn monitoring
df_hits_tracker.groupby(['song_release_year']).apply(_burnout_spins_summary)

In [None]:
# Show the tracker rows with fewer than 150 spins between hit and burnout
# (rows with a missing value do not match).
mask = df_hits_tracker['spins_to_burnout_from_hit'].lt(150)
df_hits_tracker.loc[mask]

Based on the above information, about 2-3% of the songs which experienced burnout in 2020 and 2021 took more than 2500 spins after turning into a hit. A cutoff of 2500 spins therefore seems reasonable for the end of burnout monitoring.

In [None]:
# define censoring flag
burn_end_spins_cutoff = 2500

# True only when a second threshold-crossing date exists AND the spins from hit to
# burnout are not above the cutoff (a missing spins value counts as "not above").
# NOTE(review): despite the name, True marks rows where burnout was observed within
# the cutoff — confirm that downstream code expects this polarity.
has_second_cross = df_hits_tracker['ddl_over_thresh_dt_second'].notna()
beyond_cutoff = df_hits_tracker['spins_to_burnout_from_hit'] > burn_end_spins_cutoff
df_hits_tracker['censoring_flg'] = has_second_cross & ~beyond_cutoff

In [None]:
# Count tracker rows (non-null `mediabase_id`) per release year and censoring flag value.
df_hits_tracker.groupby(['song_release_year', 'censoring_flg'])['mediabase_id'].count()

#### feature engineering

In [None]:
# Window lengths, in rows per (song, station) group, used by the rolling look-back features.
rolling_weeks = [
    1,
    4,
    8,
    13,
    26,
]

##### pop score related columns

In [None]:
# Columns whose name contains one of the score tags, excluding any "weeks_" column.
score_tags = ('_pop', '_ddl', '_fav')
pop_cols = [
    name
    for name in df_raw_features.columns
    if 'weeks_' not in name and any(tag in name for tag in score_tags)
]

In [None]:
# Display the selected score columns.
pop_cols

##### rolling statistics look back

In [None]:
# Rolling look-back statistics (min / max / mean / std) over the previous `win_len`
# rows of each (song, station) group, excluding the current row.
#
# The shift and the forward fill are applied per group. Shifting the concatenated
# rolling result as a whole would hand the last value of one group to the first row
# of the next group, and a global forward fill would carry values across groups in
# the same way. The first row of every group therefore has no prior value (NaN).
group_keys = ['mediabase_id', 'call_letters']

for win_len in rolling_weeks:
    for col in pop_cols:
        col_name = f"{col}_prior_{win_len}wk"
        roll_col = df_raw_features.groupby(group_keys)[col].rolling(window=win_len, min_periods=0)
        for stat in ('min', 'max', 'mean', 'std'):
            # Result is indexed by (mediabase_id, call_letters, original row label).
            rolled = getattr(roll_col, stat)()
            # Move every value one row down inside its own group ("prior" window).
            prior = rolled.groupby(level=[0, 1]).shift(1)
            # Fill gaps (e.g. std of a single observation) from earlier rows of the same group only.
            prior = prior.groupby(level=[0, 1]).ffill()
            # Drop the group levels so the assignment aligns on the original row labels.
            df_raw_features[f"{col_name}_{stat}"] = prior.droplevel([0, 1])

In [None]:
import matplotlib.pyplot as plt

# Visual check for a single (song, station) pair: weekly score as points, prior-window means as lines.
mask = (df_raw_features['mediabase_id'] == 1086587) & (df_raw_features['call_letters'] == 'KHTS-FM')
sel_cols = ['song_format_pop','song_format_pop_prior_4wk_mean', 'song_format_pop_prior_13wk_mean', 'song_format_pop_prior_26wk_mean', 'song_format_pop_prior_8wk_mean', 'song_format_pop_prior_1wk_mean']

pair_rows = df_raw_features.loc[mask]
weeks = pair_rows['week_dt']

# First selected column is drawn as a scatter, the remaining ones as lines in list order.
score_col, *rolling_mean_cols = sel_cols
plt.scatter(weeks, pair_rows[score_col], label=score_col)
for rolling_mean_col in rolling_mean_cols:
    plt.plot(weeks, pair_rows[rolling_mean_col], label=rolling_mean_col)
plt.legend()

##### rolling stats from ftq to hit


In [None]:
# Per (song, station) min / max / mean / std of every score column, restricted to the
# weeks between the ftq week and the hit week (both inclusive).
#
# The window depends only on the date columns, so it is selected once and all columns
# are aggregated and joined in a single pass instead of once per column.
group_keys = ['mediabase_id', 'call_letters']

if len(pop_cols) > 0:
    mask = (df_raw_features['week_dt'] >= df_raw_features['week_dt_at_ftq']) & (
            df_raw_features['week_dt'] <= df_raw_features['week_dt_at_hit'])
    df_sub = df_raw_features.loc[mask]

    # Columns of the result are (source column, statistic) pairs, ordered column by column.
    df_temp = df_sub.groupby(group_keys)[list(pop_cols)].agg(['min', 'max', 'mean', 'std'])
    df_temp.columns = [f"{col}_ftq_to_hit_{stat}" for col, stat in df_temp.columns]

    # Broadcast the per-pair statistics back onto every row of the pair.
    df_raw_features = df_raw_features.join(df_temp, on=group_keys, how='left')

##### spins related information

In [None]:
# Columns whose name contains '_spins', minus cumulative, bucket, proportion and week-count columns.
excluded_tags = ('cuml_', '_bucket', '_propn', 'weeks_')
spins_cols = [
    name
    for name in df_raw_features.columns
    if '_spins' in name and not any(tag in name for tag in excluded_tags)
]

In [None]:
# Display the selected spins columns.
spins_cols

##### rolling statistics look back

In [None]:
# Rolling look-back statistics (min / max / mean / std) over the previous `win_len`
# rows of each (song, station) group, excluding the current row.
#
# The shift and the forward fill are applied per group. Shifting the concatenated
# rolling result as a whole would hand the last value of one group to the first row
# of the next group, and a global forward fill would carry values across groups in
# the same way. The first row of every group therefore has no prior value (NaN).
group_keys = ['mediabase_id', 'call_letters']

for win_len in rolling_weeks:
    for col in spins_cols:
        col_name = f"{col}_prior_{win_len}wk"
        roll_col = df_raw_features.groupby(group_keys)[col].rolling(window=win_len, min_periods=0)
        for stat in ('min', 'max', 'mean', 'std'):
            # Result is indexed by (mediabase_id, call_letters, original row label).
            rolled = getattr(roll_col, stat)()
            # Move every value one row down inside its own group ("prior" window).
            prior = rolled.groupby(level=[0, 1]).shift(1)
            # Fill gaps (e.g. std of a single observation) from earlier rows of the same group only.
            prior = prior.groupby(level=[0, 1]).ffill()
            # Drop the group levels so the assignment aligns on the original row labels.
            df_raw_features[f"{col_name}_{stat}"] = prior.droplevel([0, 1])

##### rolling stats from ftq to hit


In [None]:
# Per (song, station) min / max / mean / std of every spins column, restricted to the
# weeks between the ftq week and the hit week (both inclusive).
#
# The window depends only on the date columns, so it is selected once and all columns
# are aggregated and joined in a single pass instead of once per column.
group_keys = ['mediabase_id', 'call_letters']

if len(spins_cols) > 0:
    mask = (df_raw_features['week_dt'] >= df_raw_features['week_dt_at_ftq']) & (
            df_raw_features['week_dt'] <= df_raw_features['week_dt_at_hit'])
    df_sub = df_raw_features.loc[mask]

    # Columns of the result are (source column, statistic) pairs, ordered column by column.
    df_temp = df_sub.groupby(group_keys)[list(spins_cols)].agg(['min', 'max', 'mean', 'std'])
    df_temp.columns = [f"{col}_ftq_to_hit_{stat}" for col, stat in df_temp.columns]

    # Broadcast the per-pair statistics back onto every row of the pair.
    df_raw_features = df_raw_features.join(df_temp, on=group_keys, how='left')

##### stream related data

In [None]:
# Columns whose name contains the '_unv' tag, in frame order.
stream_cols = [name for name in df_raw_features.columns if name.find('_unv') != -1]

In [None]:
# Display the selected '_unv' columns.
stream_cols

##### rolling statistics look back

In [None]:
# Rolling look-back statistics (min / max / mean / std) over the previous `win_len`
# rows of each (song, station) group, excluding the current row.
#
# The shift and the forward fill are applied per group. Shifting the concatenated
# rolling result as a whole would hand the last value of one group to the first row
# of the next group, and a global forward fill would carry values across groups in
# the same way. The first row of every group therefore has no prior value (NaN).
group_keys = ['mediabase_id', 'call_letters']

for win_len in rolling_weeks:
    for col in stream_cols:
        col_name = f"{col}_prior_{win_len}wk"
        roll_col = df_raw_features.groupby(group_keys)[col].rolling(window=win_len, min_periods=0)
        for stat in ('min', 'max', 'mean', 'std'):
            # Result is indexed by (mediabase_id, call_letters, original row label).
            rolled = getattr(roll_col, stat)()
            # Move every value one row down inside its own group ("prior" window).
            prior = rolled.groupby(level=[0, 1]).shift(1)
            # Fill gaps (e.g. std of a single observation) from earlier rows of the same group only.
            prior = prior.groupby(level=[0, 1]).ffill()
            # Drop the group levels so the assignment aligns on the original row labels.
            df_raw_features[f"{col_name}_{stat}"] = prior.droplevel([0, 1])

##### rolling stats from ftq to hit


In [None]:
# Per (song, station) min / max / mean / std of every '_unv' column, restricted to the
# weeks between the ftq week and the hit week (both inclusive).
#
# The window depends only on the date columns, so it is selected once and all columns
# are aggregated and joined in a single pass instead of once per column.
group_keys = ['mediabase_id', 'call_letters']

if len(stream_cols) > 0:
    mask = (df_raw_features['week_dt'] >= df_raw_features['week_dt_at_ftq']) & (
            df_raw_features['week_dt'] <= df_raw_features['week_dt_at_hit'])
    df_sub = df_raw_features.loc[mask]

    # Columns of the result are (source column, statistic) pairs, ordered column by column.
    df_temp = df_sub.groupby(group_keys)[list(stream_cols)].agg(['min', 'max', 'mean', 'std'])
    df_temp.columns = [f"{col}_ftq_to_hit_{stat}" for col, stat in df_temp.columns]

    # Broadcast the per-pair statistics back onto every row of the pair.
    df_raw_features = df_raw_features.join(df_temp, on=group_keys, how='left')

##### proportion related data

In [None]:
# Columns whose name contains 'propn', in frame order.
propn_cols = [name for name in df_raw_features.columns if name.find('propn') != -1]

In [None]:
# Display the selected proportion columns.
propn_cols

##### rolling statistics look back

In [None]:
# Rolling look-back statistics (min / max / mean / std) over the previous `win_len`
# rows of each (song, station) group, excluding the current row.
#
# The shift and the forward fill are applied per group. Shifting the concatenated
# rolling result as a whole would hand the last value of one group to the first row
# of the next group, and a global forward fill would carry values across groups in
# the same way. The first row of every group therefore has no prior value (NaN).
group_keys = ['mediabase_id', 'call_letters']

for win_len in rolling_weeks:
    for col in propn_cols:
        col_name = f"{col}_prior_{win_len}wk"
        roll_col = df_raw_features.groupby(group_keys)[col].rolling(window=win_len, min_periods=0)
        for stat in ('min', 'max', 'mean', 'std'):
            # Result is indexed by (mediabase_id, call_letters, original row label).
            rolled = getattr(roll_col, stat)()
            # Move every value one row down inside its own group ("prior" window).
            prior = rolled.groupby(level=[0, 1]).shift(1)
            # Fill gaps (e.g. std of a single observation) from earlier rows of the same group only.
            prior = prior.groupby(level=[0, 1]).ffill()
            # Drop the group levels so the assignment aligns on the original row labels.
            df_raw_features[f"{col_name}_{stat}"] = prior.droplevel([0, 1])

##### rolling stats from ftq to hit


In [None]:
# Per (song, station) min / max / mean / std of every proportion column, restricted to
# the weeks between the ftq week and the hit week (both inclusive).
#
# The window depends only on the date columns, so it is selected once and all columns
# are aggregated and joined in a single pass instead of once per column.
group_keys = ['mediabase_id', 'call_letters']

if len(propn_cols) > 0:
    mask = (df_raw_features['week_dt'] >= df_raw_features['week_dt_at_ftq']) & (
            df_raw_features['week_dt'] <= df_raw_features['week_dt_at_hit'])
    df_sub = df_raw_features.loc[mask]

    # Columns of the result are (source column, statistic) pairs, ordered column by column.
    df_temp = df_sub.groupby(group_keys)[list(propn_cols)].agg(['min', 'max', 'mean', 'std'])
    df_temp.columns = [f"{col}_ftq_to_hit_{stat}" for col, stat in df_temp.columns]

    # Broadcast the per-pair statistics back onto every row of the pair.
    df_raw_features = df_raw_features.join(df_temp, on=group_keys, how='left')

In [None]:
# propagate censoring flag back to raw features
join_cols = ['mediabase_id', 'call_letters']

# Flag keyed by (song, station); the left join copies it onto every raw row of the pair.
df_sub = df_hits_tracker.set_index(keys=join_cols).loc[:, 'censoring_flg']
df_raw_features = df_raw_features.join(other=df_sub, on=join_cols, how='left')

In [None]:
# Checkpoint: write the hit tracker to a pickle file in the working directory.
df_hits_tracker.to_pickle('ebw_hits_tracker_h1.pkl')

In [None]:
# Checkpoint: write the engineered feature frame to a pickle file in the working directory.
df_raw_features.to_pickle('ebw_temp_features_h1.pkl')

In [None]:
# Reload both checkpoints written by the two preceding cells.
# NOTE(review): the repeated pandas import suggests this cell is meant to run on a
# fresh kernel without re-running the feature engineering — confirm.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
import pandas as pd
df_raw_features = pd.read_pickle('ebw_temp_features_h1.pkl')
df_hits_tracker = pd.read_pickle('ebw_hits_tracker_h1.pkl')

In [None]:
# Box plot of spins between hit and burnout, one observation per (song, station) pair.
# De-duplicating on the pair keeps one row per pair; de-duplicating on the values
# themselves would merge different pairs that happen to share the same spin count
# and distort the distribution.
spins_per_pair = df_raw_features.drop_duplicates(subset=['mediabase_id', 'call_letters'])[
    'spins_to_burnout_from_hit']
spins_per_pair.plot(kind='box')