In [2]:
# import packages
import pandas as pd
import pickle as pkl
import os
import numpy as np

In [112]:
from sqlalchemy import create_engine

def postgresql_engine(user, pwd, host, port, dbname):
    # Need pyycopg2-binary package
    sql_engine = create_engine('postgres://' + user + ':' + pwd + '@' + host + ':' + port + '/' + dbname, echo=False)
    return sql_engine

In [113]:
# DB username and password
import getpass

user = getpass.getpass()
pwd = getpass.getpass()

In [115]:
# misc db parameters
host= 'adds-postgres-dev.cfgztrijqgvp.us-east-1.rds.amazonaws.com'
dbname= 'musiclab'
port= '5432'

In [5]:
# get callout research for songs released in the past 2 years
data_query_train = '''
Select *
from adds_temp.demo_rr_features_h1 as rdfh
where pop_all is not null
'''

In [6]:
filter_rules = '''and song_weeks_since_last_spins <=13
and ((song_last_test_co_weeks <=26)
or (song_last_test_omt_weeks <=104)
or format_code in ('u4','l2'))
and (((station_test_1_plus=0 and station_test_1_id=1) or station_test_1_plus>0) or format_code in ('u4','l2'))
and (format_code<>'h1' or station_id<>3323403)
and (format_code<>'c1' or station_id<>3322825)
and (format_code<>'a2' or station_id<>3322799 or gcr<>'G')
and taa_quintile is not null
and gcr_adj is not null
'''

In [7]:
print(data_query_train + filter_rules)


Select *
from adds_temp.demo_rr_features_h1 as rdfh
where pop_all is not null
and song_weeks_since_last_spins <=13
and ((song_last_test_co_weeks <=26)
or (song_last_test_omt_weeks <=104)
or format_code in ('u4','l2'))
and (((station_test_1_plus=0 and station_test_1_id=1) or station_test_1_plus>0) or format_code in ('u4','l2'))
and (format_code<>'h1' or station_id<>3323403)
and (format_code<>'c1' or station_id<>3322825)
and (format_code<>'a2' or station_id<>3322799 or gcr<>'G')
and taa_quintile is not null
and gcr_adj is not null



In [8]:
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as con:
    with con.connect():
        df_train = pd.read_sql(data_query_train + filter_rules, con=con)

In [9]:
#write to pkl file
df_train.to_pickle('df_train_2022_11_05_2020_11_28.pkl')

In [3]:
# read from pickle file
df_train = pd.read_pickle('df_train_2022_11_05_2020_11_28.pkl')

In [4]:
df_train.sort_values(by=['station_id', 'mediabase_id', 'breakout_name', 'week_dt', ], inplace=True)

In [5]:
# define demo segments and categories
breakout_category = {'*Core*': 'Core-Cume', '*Old*': 'Age', '*Young*': 'Age', 'Total': 'Total', 'White': 'Race',
                     'Non-Core': 'Core-Cume',
                     'Hispanic': 'Race', 'AA': 'Race', 'F': 'Gender', 'M': 'Gender', 'WAO': 'Race',
                     'F (25-29)': 'Gender', 'F (20-24)': 'Gender', 'F (18-29)': 'Gender', 'F (17-29)': 'Gender',
                     'F (20-23)': 'Gender', 'F (18-39)' : 'Gender',
                     'F (16-24)': 'Gender', 'F (30-34)': 'Gender', 'F (18-34)': 'Gender', 'F (24-29)': 'Gender',
                     'F (17-19)': 'Gender', 'F (15-26)': 'Gender', 'F (15-19)': 'Gender', 'F (15-24)': 'Gender',
                     'F (18-24)': 'Gender', 'F (20-29)': 'Gender', 'F (25-34)': 'Gender', 'F (Other)': 'Gender'}

breakout_map = {'*Core*': 'Core', '*Old*': 'Old', '*Young*': 'Young', 'Total': 'Total', 'White': 'White',
                'Non-Core': 'Non-Core',
                'Hispanic': 'Hispanic', 'AA': 'AA', 'F': 'Female', 'M': 'Male', 'WAO': 'White', 'F (25-29)': 'Female',
                'F (20-24)': 'Female', 'F (18-29)': 'Female', 'F (17-29)': 'Female', 'F (20-23)': 'Female',
                'F (16-24)': 'Female', 'F (30-34)': 'Female', 'F (18-34)': 'Female', 'F (24-29)': 'Female',
                'F (17-19)': 'Female', 'F (18-39)' : 'Female',
                'F (15-26)': 'Female', 'F (15-19)': 'Female', 'F (15-24)': 'Female',
                'F (18-24)': 'Female_(18-24)', 'F (20-29)': 'Female', 'F (25-34)': 'Female',
                'F (Other)': 'Female_Other'}

In [6]:
# create segment and category fields
df_train['segment'] = df_train['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))
df_train['demo_category'] = df_train['breakout_name'].apply(func=(lambda x: breakout_category[x] if x in breakout_category.keys() else None))

In [7]:
# drop misc female breakouts
drop_idx = df_train[(df_train['segment'] == 'Female') | (pd.isna(df_train['segment']))].index
df_train.drop(drop_idx, inplace=True)

In [9]:
# Drop songs with just a single score in the past 2 years
df_train_week_ct = pd.DataFrame(df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])['week_dt'].count())
drop_idx = df_train.join(df_train_week_ct[df_train_week_ct['week_dt'] == 1], on=['station_id', 'mediabase_id', 'breakout_id'], how='right', rsuffix='_r').index
df_train.drop(index=drop_idx, inplace=True)

In [10]:
df_train.groupby(['demo_category', 'segment', 'taa_quintile'])['mediabase_id'].count()

demo_category  segment         taa_quintile
Age            Old             1                2252
                               2                6295
                               3                6168
                               4                8090
                               5               21146
               Young           1                2252
                               2                6295
                               3                6173
                               4                8088
                               5               21143
Core-Cume      Core            1                2251
                               2                6293
                               3                6169
                               4                8089
                               5               21150
               Non-Core        1                2242
                               2                6266
                               3                6147
  

### Investigate columns with missing data

In [11]:
len(df_train[pd.isna(df_train['mediabase_id'])])

0

In [12]:
len(df_train[df_train['gcr'] == df_train['gcr_adj']])/len(df_train)

1.0

In [17]:
len(df_train.columns)

108

In [7]:
df_train.shape

(614649, 108)

In [8]:
df_train.groupby(['station_test_1_plus', 'station_test_1_id'])['mediabase_id'].count()

station_test_1_plus  station_test_1_id
0                    1                     58773
1                    0                    497747
                     1                     58129
Name: mediabase_id, dtype: int64

##### isolate numeric and categorical columns

In [13]:
# constants
num_cols_like = ['artist_count', 'feat_artist', 'feat_artist_song', 'mscore', 'spins','pop_prior',
                 'pop_artist_prior', 'song_age_weeks', 'song_last_test']
cat_cols_like = ['Market_Name', 'taa_quintile', 'segment', 'gcr', 'gcr_adj', 'omt_co_flag']
target = ['pop_all']
id_cols = ['mediabase_id', 'station_id', 'week_dt', 'breakout_id', 'breakout_name', 'demo_category', 'pop_co', 'pop_omt', 'gcr']

exclude_cols_like = ['date','song_last_test_any_weeks','song_last_test_co_weeks', 'song_last_test_omt_weeks',
                     'std_pop_prior', 'std_pop_artist_prior']#,'_unv']#,'univ_spins', 'market_spins']

In [14]:
id_cols = id_cols
target_col = target
exclude_cols = df_train.columns[df_train.columns.str.contains('|'.join(exclude_cols_like), regex=True)]

cat_cols = list(set(df_train.columns[df_train.columns.str.contains('|'.join(cat_cols_like), regex=True)]) - set(
    id_cols) - set(exclude_cols))

num_cols = list(set(df_train.select_dtypes(exclude=['object', 'datetime64']).columns) & set(
df_train.columns[(df_train.columns.str.contains('|'.join(num_cols_like), regex=True))]) - set(id_cols) - set(cat_cols) - set(exclude_cols))

feature_cols = list(set(list(num_cols) + list(cat_cols)))

In [15]:
cat_cols

['gcr_adj', 'Market_Name', 'taa_quintile', 'omt_co_flag', 'segment']

##### check missingness by different feature types

In [34]:
# check missingness by instance (station-song combination) for categorical variables
cols_avl_cat = df_train.groupby(['station_id', 'mediabase_id'])[cat_cols].agg(lambda x: 1 - sum(pd.isnull(x))/(1.0*len(x))).reset_index()

In [35]:
cols_avl_cat.describe()

Unnamed: 0,station_id,mediabase_id,taa_quintile,gcr_adj,omt_co_flag,Market_Name,segment
count,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0
mean,3344216.0,2450651.0,1.0,1.0,1.0,1.0,1.0
std,174230.3,394312.8,0.0,0.0,0.0,0.0,0.0
min,3321797.0,1085550.0,1.0,1.0,1.0,1.0,1.0
25%,3322025.0,2355676.0,1.0,1.0,1.0,1.0,1.0
50%,3322828.0,2583267.0,1.0,1.0,1.0,1.0,1.0
75%,3323410.0,2720314.0,1.0,1.0,1.0,1.0,1.0
max,4762077.0,2848773.0,1.0,1.0,1.0,1.0,1.0


In [24]:
df_train[(df_train['station_id'] == 3321797) & (df_train['mediabase_id'] == 1086587) & (pd.isna(df_train['segment']))]

Unnamed: 0,mediabase_id,station_id,week_dt,artist_id,format_code,FirstLast,SongTitle,Market_Name,song_release_date,breakout_id,...,mean_pop_artist_prior,std_pop_artist_prior,count_pop_artist_prior,max_pop_artist_prior_unv,min_pop_artist_prior_unv,mean_pop_artist_prior_unv,count_pop_artist_prior_unv,mr_pop_artist_prior_unv,segment,demo_category
2125,1086587,3321797,2022-07-31,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,,,,,,,,,,
2128,1086587,3321797,2022-08-21,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,64.0,,1.0,64.0,64.0,64.0,1.0,64.0,,
2132,1086587,3321797,2022-09-04,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.0,9.899495,2.0,78.0,64.0,68.666667,3.0,78.0,,
2133,1086587,3321797,2022-09-18,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.333333,7.023769,3.0,78.0,64.0,70.0,6.0,72.0,,
2136,1086587,3321797,2022-10-02,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,72.0,5.887841,4.0,78.0,64.0,70.8,10.0,74.0,,
2138,1086587,3321797,2022-10-16,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,70.6,5.98331,5.0,78.0,64.0,70.733333,15.0,65.0,,
2139,1086587,3321797,2022-10-30,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.5,5.787918,6.0,78.0,64.0,70.952381,21.0,76.0,,
2141,1086587,3321797,2022-11-27,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,72.714286,6.183696,7.0,80.0,64.0,71.392857,28.0,80.0,,
2087,1086587,3321797,2022-07-31,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428762,...,,,,,,,,,,
2090,1086587,3321797,2022-08-21,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428762,...,59.0,,1.0,59.0,59.0,59.0,1.0,59.0,,


In [72]:
# Investigate taa_quintile
# df_temp = df_train[pd.isna(df_train['taa_quintile'])][['station_id', 'mediabase_id', 'week_dt']]
# len(df_temp)
# df_temp.groupby(['week_dt']).agg({'mediabase_id':len}).sort_values(by=['mediabase_id'], ascending=False)
# df_temp.groupby(['mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False)
# df_temp.groupby(['station_id', 'mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False)
# idx = (df_train['station_id'] == 3323400) & (df_train['mediabase_id'] == 2591543)
# df_train.loc[idx].groupby(['week_dt'])['mediabase_id'].count()
# idx = (df_train['station_id'] == 3323400) & (df_train['mediabase_id'] == 2591543) & (pd.isna(df_train['taa_quintile'])) & (df_train['breakout_name'] == 'Total')
# df_train.loc[idx]

In [149]:
# Investigate gcr and gcr_adj
# df_temp = df_train[(pd.isna(df_train['gcr_adj'])) & (~pd.isna(df_train['gcr']))][['station_id', 'mediabase_id', 'week_dt']]
# len(df_temp)
# print(df_temp.groupby(['week_dt']).agg({'mediabase_id':len}).sort_values(by=['mediabase_id'], ascending=False))
# print(df_temp.groupby(['mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False))
# print(df_temp.groupby(['station_id', 'mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False))
# idx = (df_train['station_id'] == 3322002) & (df_train['mediabase_id'] == 2294907)
# df_train.loc[idx].groupby(['week_dt'])['mediabase_id'].count()
# idx = (df_train['station_id'] == 3323404) & (df_train['mediabase_id'] == 2629560) & (pd.isna(df_train['gcr_adj'])) & (df_train['breakout_name'] == 'Total')
# df_train.loc[idx]
# [np.min(df_temp['week_dt']), np.max(df_temp['week_dt'])]

[datetime.date(2020, 11, 22), datetime.date(2021, 1, 24)]

###### Investigate Numeric Columns

In [16]:
# Investigate Numeric Columns
num_cols_spins = [col for col in num_cols if 'spins' in col]
num_cols_pop = [col for col in num_cols if 'pop' in col]
num_cols_other = list(set(num_cols) - set(num_cols_spins) - set(num_cols_pop))

In [17]:
[len(num_cols), len(num_cols_spins), len(num_cols_pop), len(num_cols_other)]

[75, 52, 19, 4]

In [18]:
num_cols_other

['artist_count', 'feat_artist', 'song_age_weeks', 'feat_artist_song']

In [19]:
num_cols_pop

['min_pop_prior',
 'mean_pop_prior_unv',
 'min_pop_artist_prior_unv',
 'count_pop_artist_prior_unv',
 'max_pop_artist_prior_unv',
 'mean_pop_artist_prior_unv',
 'mr_pop_prior_unv',
 'max_pop_prior',
 'mr_pop_prior',
 'count_pop_artist_prior',
 'max_pop_artist_prior',
 'min_pop_artist_prior',
 'mr_pop_artist_prior_unv',
 'mean_pop_artist_prior',
 'med_pop_prior',
 'min_pop_prior_unv',
 'count_pop_prior_unv',
 'mean_pop_prior',
 'max_pop_prior_unv']

In [20]:
df_train.shape

(377628, 110)

In [68]:
print(581054 + 33595)

614649


In [21]:
# Backfill pop based data
df_train[num_cols_pop] = df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_pop].bfill()

In [22]:
# Fill missing perc spin diffs with 1.0
num_cols_spins_perc = [i for i in num_cols_spins if (('perc_diff_' in i) or ('per_diff' in i))]
df_train[num_cols_spins_perc] = df_train[num_cols_spins_perc].transform(lambda x: x.fillna(1.0))

In [23]:
# Backfill non perc diff spins diff
num_cols_spins_nonperc = list(set(num_cols_spins) - set(num_cols_spins_perc))
df_train[num_cols_spins_nonperc] = df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_spins_nonperc].bfill()

In [43]:
num_cols_spins_nonperc

['avg_market_spins_prior',
 'avg_station_artist_spins_prior',
 'mr_market_spins_spins',
 'mr_song_univ_spins_prior',
 'market_artist_spins',
 'song_univ_spins',
 'avg_artist_univ_spins_prior',
 'total_market_artist_spins_prior',
 'total_station_artist_spins_prior',
 'song_market_weeks_since_first_spins',
 'diff_spins_song_market_prior',
 'song_weeks_since_last_spins',
 'station_spins',
 'mr_artist_univ_spins',
 'diff_market_spins_spins_prior',
 'avg_song_univ_spins_prior',
 'artist_weeks_since_first_spins',
 'total_market_spins_prior',
 'diff_artist_univ_spins_prior',
 'diff_spins_song_station_prior',
 'mr_spins_artist_station_prior',
 'format_spins',
 'artist_station_weeks_since_first_spins',
 'artist_univ_spins',
 'song_weeks_since_first_spins',
 'diff_market_artist_spins_prior',
 'diff_spins_artist_station_prior',
 'spins_non_on',
 'total_spins_song_station_prior',
 'diff_song_univ_spins_prior',
 'total_song_univ_spins_prior',
 'avg_spins_song_station_prior',
 'spins_total',
 'avg_m

In [24]:
# check missingness by instance (station-song combination) for categorical variables
cols_avl_num = df_train.groupby(['station_id', 'mediabase_id', 'breakout_name'])[num_cols].agg(lambda x: 1 - sum(pd.isnull(x))/(1.0*len(x))).reset_index()

In [25]:
cols_avl_num = cols_avl_num.join(pd.DataFrame(df_train.groupby(['station_id', 'mediabase_id', 'breakout_name']).count()['week_dt']), on=['station_id', 'mediabase_id', 'breakout_name'], rsuffix='_r')

In [26]:
cols_avl_num[num_cols].mean().reset_index().sort_values(by=[0])

Unnamed: 0,index,0
0,min_pop_prior,1.0
53,med_pop_prior,1.0
52,song_weeks_since_first_spins,1.0
51,station_spins,1.0
50,per_diff_market_spins_spins_prior,1.0
...,...,...
22,total_station_artist_spins_prior,1.0
21,max_pop_prior,1.0
20,artist_univ_spins,1.0
18,format_spins,1.0


In [48]:
idx = df_train[cat_cols + num_cols].dropna(axis=1).index
df_train.loc[idx].shape

(377628, 110)

In [49]:
df_train.shape

(377628, 110)

In [93]:
cols_avl_num[(cols_avl_num['song_last_test_co_weeks'] < 1) & (cols_avl_num['breakout_name'] == 'Total')][['station_id', 'mediabase_id', 'breakout_name', 'week_dt']]

Unnamed: 0,station_id,mediabase_id,breakout_name,week_dt
2788,3321799,1388281,Total,2
2812,3321799,1640575,Total,2
7090,3322002,1243640,Total,2
7100,3322002,1249237,Total,2
7134,3322002,1261285,Total,3
...,...,...,...,...
56105,3323602,2397182,Total,2
56124,3323602,2422489,Total,3
56130,3323602,2422949,Total,2
56179,3323602,2445876,Total,3


In [95]:
df_train[(df_train['station_id'] == 3323602) & (df_train['mediabase_id'] == 2348056) & (df_train['breakout_name'] == 'Total')][['station_id', 'mediabase_id', 'breakout_name', 'week_dt', 'song_last_test_any_weeks','song_last_test_co_weeks', 'song_last_test_omt_weeks']]

Unnamed: 0,station_id,mediabase_id,breakout_name,week_dt,song_last_test_any_weeks,song_last_test_co_weeks,song_last_test_omt_weeks
64814,3323602,2348056,Total,2020-11-22,0.0,,0.0
64815,3323602,2348056,Total,2021-02-07,0.0,0.0,1.0
64816,3323602,2348056,Total,2021-09-26,0.0,8.0,0.0
64817,3323602,2348056,Total,2022-08-21,0.0,0.0,1.0


### Prep Data and create train/test splits

In [60]:
id_cols

['mediabase_id',
 'station_id',
 'week_dt',
 'breakout_id',
 'breakout_name',
 'demo_category',
 'pop_co',
 'pop_omt',
 'gcr']

In [27]:
# Extract train data
[np.min(df_train['week_dt']),pd.to_datetime(np.min(df_train['week_dt'])) + np.timedelta64(2,'Y'), np.max(df_train['week_dt'])]
scoring_date = pd.to_datetime('2022-11-22')
train_idx = df_train['week_dt'] < scoring_date.date()

df_train_final = df_train.loc[train_idx][id_cols + feature_cols + target_col]
X_train = pd.get_dummies(df_train_final[feature_cols], columns=cat_cols)
y_train = df_train_final[target]

In [28]:
demo_cats = list(set(breakout_category.values()) - set(['Total']))

In [30]:
X_train.shape

(369153, 125)

In [32]:
# imports for model training
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.metrics import make_scorer, mean_pinball_loss

In [34]:
low_alpha = 0.05
high_alpha = 0.95

param_grid = dict(
    learning_rate=[.2, .1, .05],
    n_estimators=[5, 10, 15],
    max_depth=[2, 4, 6],
    min_samples_leaf=[5, 10, 20],
    min_samples_split=[5, 10, 20]
)

n_iter = 50
n_splits = 5

In [49]:
import time
demo_cols_all = [col for col in X_train.columns if 'segment_' in col]
best_scores = {}
best_estimators = {}

for cat in demo_cats:
    tic = time.perf_counter()
    idx = (df_train_final['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_train_final.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_train.columns) - set(demo_cols_excl))

    # create features and target
    X = X_train.loc[idx][feature_cols_cat]
    y = y_train.loc[idx]

    # quantile regressor

    # gradient boosted quantile regressor
    group_kfold = GroupKFold(n_splits=n_splits)

    # train model for upper threshold given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    neg_mean_pinball_loss_high = make_scorer(
        mean_pinball_loss,
        alpha=high_alpha,
        greater_is_better=False,  # maximize the negative loss
    )

    model_high_thresh = GradientBoostingRegressor(loss="quantile", alpha=high_alpha,
                                                  random_state=0)

    rs_high_thresh = RandomizedSearchCV(
        model_high_thresh,
        param_grid,
        n_iter=n_iter,
        scoring=neg_mean_pinball_loss_high,
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_high_thresh.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for upper wobble threshold completed")

    # train model for lower threshold given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    neg_mean_pinball_loss_low = make_scorer(
        mean_pinball_loss,
        alpha=low_alpha,
        greater_is_better=False,  # maximize the negative loss
    )

    model_low_thresh = GradientBoostingRegressor(loss="quantile", alpha=low_alpha,
                                                 random_state=0)

    rs_low_thresh = RandomizedSearchCV(
        model_low_thresh,
        param_grid,
        n_iter=n_iter,
        scoring=neg_mean_pinball_loss_low,
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_low_thresh.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for lower wobble threshold completed")

    # train model for mean pop score given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    model_mean = GradientBoostingRegressor(loss="squared_error")

    rs_mean = RandomizedSearchCV(
        model_mean,
        param_grid,
        n_iter=n_iter,
        scoring='neg_mean_absolute_error',
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_mean.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for mean pop completed")

    toc = time.perf_counter()
    time_elapsed = toc-tic
    print('Total time elapsed for ' + cat + ': ' + '%.2f'%time_elapsed)

    best_scores[cat] = [rs_low_thresh.best_score_, rs_mean.best_score_, rs_high_thresh.best_score_]
    best_estimators[cat] = [rs_low_thresh.best_estimator_, rs_mean.best_estimator_, rs_high_thresh.best_estimator_]

Age: 85978
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Age: Fitting for upper wobble threshold completed
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Age: Fitting for lower wobble threshold completed
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Age: Fitting for mean pop completed
Total time elapsed for Age: 547.70
Gender: 78211
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Gender: Fitting for upper wobble threshold completed
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Gender: Fitting for lower wobble threshold completed
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Gender: Fitting for mean pop completed
Total time elapsed for Gender: 494.66
Race: 76222
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Race: Fitting for upper wobble threshold completed
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Race: Fitting for lower wobble threshold completed
Fitting 5 f

In [51]:
import pickle
pickle.dump(best_scores, open('best_scores_all.pkl', "wb"))
pickle.dump(best_estimators, open('best_estimators_all.pkl', "wb"))

In [55]:
test_read = pd.read_pickle('best_estimators_all.pkl')

In [56]:
test_read

{'Age': [GradientBoostingRegressor(alpha=0.05, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0),
  GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15),
  GradientBoostingRegressor(alpha=0.95, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0)],
 'Gender': [GradientBoostingRegressor(alpha=0.05, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0),
  GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=10,
  

### Prep scoring data & score


In [74]:
scoring_date = pd.to_datetime('2022-12-25')
test_idx = df_train['week_dt'] == scoring_date.date()

df_test_final = df_train.loc[test_idx][id_cols + feature_cols + target_col]
X_test = pd.get_dummies(df_test_final[feature_cols], columns=cat_cols)

missing_cols = list(set(X_train.columns) - set(X_test.columns))

y_test = df_test_final[target]

In [76]:
for i in missing_cols:
    X_test[i] = 0

In [81]:
X_test = X_test[X_train.columns]

In [80]:
X_train.columns

Index(['min_pop_prior', 'mean_pop_prior_unv', 'avg_song_univ_spins_prior',
       'max_pop_artist_prior_unv', 'avg_market_spins_prior',
       'artist_station_weeks_since_first_spins', 'diff_song_univ_spins_prior',
       'mean_pop_artist_prior_unv', 'mr_pop_prior_unv', 'format_spins',
       ...
       'segment_AA', 'segment_Core', 'segment_Female_(18-24)',
       'segment_Female_Other', 'segment_Hispanic', 'segment_Non-Core',
       'segment_Old', 'segment_Total', 'segment_White', 'segment_Young'],
      dtype='object', length=125)

In [106]:
df_out = pd.DataFrame(columns=['lower_wob_thresh', 'mean_pop_predicted', 'upper_wobble_thresh'])
for cat in demo_cats:
    tic = time.perf_counter()
    idx = (df_test_final['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_test_final.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_test.columns) - set(demo_cols_excl))

    # create features and target
    X = X_test.loc[idx][feature_cols_cat]
    y = y_test.loc[idx]
    #print(X)

    # create empty dataframe
    df_temp = pd.DataFrame()
    # predict using estimator

    df_temp['lower_wob_thresh'] = pd.DataFrame(best_estimators[cat][0].predict(X), index=X_test.loc[idx].index)
    df_temp['mean_pop_predicted'] = pd.DataFrame(best_estimators[cat][1].predict(X), index=X_test.loc[idx].index)
    df_temp['upper_wobble_thresh'] = pd.DataFrame(best_estimators[cat][2].predict(X), index=X_test.loc[idx].index)

    df_out = pd.concat([df_out,df_temp], axis=0)

Age: 542
Gender: 482
Race: 569
Core-Cume: 542


In [127]:
df_out_final = df_test_final[id_cols].join(df_out, how='left')

KeyError: "['call_letters'] not in index"

In [109]:
df_out_final['segment'] = df_out_final['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))

In [110]:
df_out_final.join()

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment
342,1086587,3322000,2022-12-25,317542,*Core*,Core-Cume,92.0,,R,64.675067,83.263664,97.321842,Core
378,1086587,3322000,2022-12-25,401515,*Old*,Age,80.0,,R,66.001720,82.686322,98.315638,Old
369,1086587,3322000,2022-12-25,401514,*Young*,Age,85.0,,R,66.282747,83.917700,102.093248,Young
405,1086587,3322000,2022-12-25,412759,F (18-24),Gender,85.0,,R,65.612498,84.783759,103.995919,Female_(18-24)
315,1086587,3322000,2022-12-25,-2,F (Other),Gender,80.0,,R,67.976539,82.272345,97.169073,Female_Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...
612216,2833906,3323602,2022-12-25,403078,AA,Race,78.0,,C,50.440120,64.960117,89.347330,AA
612204,2833906,3323602,2022-12-25,400756,Hispanic,Race,62.0,,C,58.090805,76.880403,96.752183,Hispanic
612195,2833906,3323602,2022-12-25,-1,Non-Core,Core-Cume,65.0,,C,52.295566,66.779507,85.366178,Non-Core
612198,2833906,3323602,2022-12-25,1,Total,Total,60.0,,C,,,,Total


In [116]:
# song-artist lookup
song_query = '''
Select mediabase_id, song_name, artist_name
from data.songs_v as sv
'''
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_song_lookup = pd.read_sql(song_query, con=conn)

In [129]:
station_query = '''
Select distinct station_id, call_letters
from data.stations_v as sv
'''

engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_station_lookup = pd.read_sql(station_query, con=conn)

In [117]:
df_song_lookup.set_index(['mediabase_id'], inplace=True)

In [130]:
df_station_lookup.set_index(['station_id'], inplace=True)

In [119]:
df_song_lookup['song_artist']  = df_song_lookup['song_name'] + ' (' + df_song_lookup['artist_name'] + ')'

In [120]:
df_song_lookup

Unnamed: 0_level_0,song_name,artist_name,song_artist
mediabase_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1342132,Maui Hawaiian Sup'pa Man,ISRAEL KAMAKAWIWO'OLE,Maui Hawaiian Sup'pa Man (ISRAEL KAMAKAWIWO'OLE)
2318242,Under The Pines,TWIN PEAKS,Under The Pines (TWIN PEAKS)
2825802,Love Is Bigger...(Apollo '18),U2,Love Is Bigger...(Apollo '18) (U2)
2019373,I Be U,FUTURE,I Be U (FUTURE)
2159334,Burn Down The ... (Live '09),ELTON JOHN,Burn Down The ... (Live '09) (ELTON JOHN)
...,...,...,...
2620719,Get On My Wave,ANDREW MCMAHON IN WILDERNESS,Get On My Wave (ANDREW MCMAHON IN WILDERNESS)
2667991,Exaggeration,JUAN HAZE,Exaggeration (JUAN HAZE)
2438195,Every Week,DQ4E,Every Week (DQ4E)
2623051,Your Story Is Over!,AYREON,Your Story Is Over! (AYREON)


In [124]:
df_out_final['song_artist'] = df_out_final.join(df_song_lookup, on=['mediabase_id'], how='left')['song_artist']

In [131]:
df_out_final['call_letters'] = df_out_final.join(df_station_lookup, on=['station_id'], how='left')['call_letters']

In [132]:
df_out_final

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment,song_artist,call_letters
342,1086587,3322000,2022-12-25,317542,*Core*,Core-Cume,92.0,,R,64.675067,83.263664,97.321842,Core,Running Up That Hill (A Deal.. (KATE BUSH),KHTS-FM
378,1086587,3322000,2022-12-25,401515,*Old*,Age,80.0,,R,66.001720,82.686322,98.315638,Old,Running Up That Hill (A Deal.. (KATE BUSH),KHTS-FM
369,1086587,3322000,2022-12-25,401514,*Young*,Age,85.0,,R,66.282747,83.917700,102.093248,Young,Running Up That Hill (A Deal.. (KATE BUSH),KHTS-FM
405,1086587,3322000,2022-12-25,412759,F (18-24),Gender,85.0,,R,65.612498,84.783759,103.995919,Female_(18-24),Running Up That Hill (A Deal.. (KATE BUSH),KHTS-FM
315,1086587,3322000,2022-12-25,-2,F (Other),Gender,80.0,,R,67.976539,82.272345,97.169073,Female_Other,Running Up That Hill (A Deal.. (KATE BUSH),KHTS-FM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612216,2833906,3323602,2022-12-25,403078,AA,Race,78.0,,C,50.440120,64.960117,89.347330,AA,I'm Good (Blue) (DAVID GUETTA & BEBE REXHA),WWPW-FM
612204,2833906,3323602,2022-12-25,400756,Hispanic,Race,62.0,,C,58.090805,76.880403,96.752183,Hispanic,I'm Good (Blue) (DAVID GUETTA & BEBE REXHA),WWPW-FM
612195,2833906,3323602,2022-12-25,-1,Non-Core,Core-Cume,65.0,,C,52.295566,66.779507,85.366178,Non-Core,I'm Good (Blue) (DAVID GUETTA & BEBE REXHA),WWPW-FM
612198,2833906,3323602,2022-12-25,1,Total,Total,60.0,,C,,,,Total,I'm Good (Blue) (DAVID GUETTA & BEBE REXHA),WWPW-FM


In [136]:
df_out_final['wobble_flag'] = df_out_final.apply(lambda x: int((x['pop_co'] < x['lower_wob_thresh']) | (x['pop_co'] > x['upper_wobble_thresh'])), axis=1)

In [142]:
df_out_final['format_mean_pop'] = df_out_final.join(df_out_final.groupby(['mediabase_id', 'breakout_id'])['mean_pop_predicted'].mean(), on=['mediabase_id', 'breakout_id'], how='left', rsuffix='_r')['mean_pop_predicted_r']

In [137]:
df_out_final

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment,song_artist,call_letters,wobble_flag
342,1086587,3322000,2022-12-25,317542,*Core*,Core-Cume,92.0,,R,64.675067,83.263664,97.321842,Core,Running Up That Hill (A Deal.. (KATE BUSH),KHTS-FM,0
378,1086587,3322000,2022-12-25,401515,*Old*,Age,80.0,,R,66.001720,82.686322,98.315638,Old,Running Up That Hill (A Deal.. (KATE BUSH),KHTS-FM,0
369,1086587,3322000,2022-12-25,401514,*Young*,Age,85.0,,R,66.282747,83.917700,102.093248,Young,Running Up That Hill (A Deal.. (KATE BUSH),KHTS-FM,0
405,1086587,3322000,2022-12-25,412759,F (18-24),Gender,85.0,,R,65.612498,84.783759,103.995919,Female_(18-24),Running Up That Hill (A Deal.. (KATE BUSH),KHTS-FM,0
315,1086587,3322000,2022-12-25,-2,F (Other),Gender,80.0,,R,67.976539,82.272345,97.169073,Female_Other,Running Up That Hill (A Deal.. (KATE BUSH),KHTS-FM,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612216,2833906,3323602,2022-12-25,403078,AA,Race,78.0,,C,50.440120,64.960117,89.347330,AA,I'm Good (Blue) (DAVID GUETTA & BEBE REXHA),WWPW-FM,0
612204,2833906,3323602,2022-12-25,400756,Hispanic,Race,62.0,,C,58.090805,76.880403,96.752183,Hispanic,I'm Good (Blue) (DAVID GUETTA & BEBE REXHA),WWPW-FM,0
612195,2833906,3323602,2022-12-25,-1,Non-Core,Core-Cume,65.0,,C,52.295566,66.779507,85.366178,Non-Core,I'm Good (Blue) (DAVID GUETTA & BEBE REXHA),WWPW-FM,0
612198,2833906,3323602,2022-12-25,1,Total,Total,60.0,,C,,,,Total,I'm Good (Blue) (DAVID GUETTA & BEBE REXHA),WWPW-FM,0


In [144]:
df_out_pvt = df_out_final.pivot_table(index=['call_letters', 'song_artist'], columns=['segment'], values=['format_mean_pop', 'mean_pop_predicted', 'pop_co', 'wobble_flag'])

In [145]:
df_out_pvt.columns = [i[0] + '_' + i[1].replace(' ', '_') for i in df_out_pvt.columns]

In [146]:
df_out_pvt.columns

Index(['format_mean_pop_AA', 'format_mean_pop_Core',
       'format_mean_pop_Female_(18-24)', 'format_mean_pop_Female_Other',
       'format_mean_pop_Hispanic', 'format_mean_pop_Non-Core',
       'format_mean_pop_Old', 'format_mean_pop_White', 'format_mean_pop_Young',
       'mean_pop_predicted_AA', 'mean_pop_predicted_Core',
       'mean_pop_predicted_Female_(18-24)', 'mean_pop_predicted_Female_Other',
       'mean_pop_predicted_Hispanic', 'mean_pop_predicted_Non-Core',
       'mean_pop_predicted_Old', 'mean_pop_predicted_White',
       'mean_pop_predicted_Young', 'pop_co_AA', 'pop_co_Core',
       'pop_co_Female_(18-24)', 'pop_co_Female_Other', 'pop_co_Hispanic',
       'pop_co_Non-Core', 'pop_co_Old', 'pop_co_Total', 'pop_co_White',
       'pop_co_Young', 'wobble_flag_AA', 'wobble_flag_Core',
       'wobble_flag_Female_(18-24)', 'wobble_flag_Female_Other',
       'wobble_flag_Hispanic', 'wobble_flag_Non-Core', 'wobble_flag_Old',
       'wobble_flag_Total', 'wobble_flag_White', 'wobb

In [150]:
df_out_pvt['gender_perc_diff'] = (1 - df_out_pvt['mean_pop_predicted_Female_Other']/df_out_pvt['mean_pop_predicted_Female_(18-24)'])

In [152]:
df_out_pvt['core_perc_diff'] = (1 - df_out_pvt['mean_pop_predicted_Non-Core']/df_out_pvt['mean_pop_predicted_Core'])

In [153]:
df_out_pvt['Hispanic_perc_diff'] = (1- df_out_pvt['mean_pop_predicted_Hispanic']/(df_out_pvt['mean_pop_predicted_White']))

In [154]:
df_out_pvt['AA_perc_diff'] = (1- df_out_pvt['mean_pop_predicted_AA']/(df_out_pvt['mean_pop_predicted_White']))

In [155]:
df_out_pvt['age_perc_diff'] = (1- df_out_pvt['mean_pop_predicted_Young']/(df_out_pvt['mean_pop_predicted_Old']))

In [158]:
df_out_pvt['fmt_gender_perc_diff'] = (
            1 - df_out_pvt['format_mean_pop_Female_Other'] / df_out_pvt['format_mean_pop_Female_(18-24)'])
df_out_pvt['fmt_core_perc_diff'] = (1 - df_out_pvt['format_mean_pop_Non-Core'] / df_out_pvt['format_mean_pop_Core'])
df_out_pvt['fmt_Hispanic_perc_diff'] = (
            1 - df_out_pvt['format_mean_pop_Hispanic'] / (df_out_pvt['format_mean_pop_White']))
df_out_pvt['fmt_AA_perc_diff'] = (1 - df_out_pvt['format_mean_pop_AA'] / (df_out_pvt['format_mean_pop_White']))
df_out_pvt['fmt_age_perc_diff'] = (1 - df_out_pvt['format_mean_pop_Young'] / (df_out_pvt['format_mean_pop_Old']))

In [161]:
df_out_pvt.reset_index()

Unnamed: 0,call_letters,song_artist,format_mean_pop_AA,format_mean_pop_Core,format_mean_pop_Female_(18-24),format_mean_pop_Female_Other,format_mean_pop_Hispanic,format_mean_pop_Non-Core,format_mean_pop_Old,format_mean_pop_White,...,gender_perc_diff,core_perc_diff,Hispanic_perc_diff,AA_perc_diff,age_perc_diff,fmt_gender_perc_diff,fmt_core_perc_diff,fmt_Hispanic_perc_diff,fmt_AA_perc_diff,fmt_age_perc_diff
0,KHKS-FM,Anti-Hero (TAYLOR SWIFT),,82.127573,74.493818,82.430586,77.875460,75.613543,86.016667,80.256929,...,-0.067962,0.081432,0.182835,,0.193180,-0.106543,0.079316,0.029673,,0.140588
1,KHKS-FM,As It Was (HARRY STYLES),,87.830026,90.023646,84.171009,94.201023,82.341182,82.166940,79.737648,...,0.044965,0.090285,-0.148668,,0.035893,0.065012,0.062494,-0.181387,,-0.063984
2,KHKS-FM,Bad Habit (STEVE LACY),,82.013527,83.743150,74.907096,85.412874,76.392218,76.438781,73.508405,...,0.094261,0.065986,-0.118238,,-0.088021,0.105514,0.068541,-0.161947,,-0.097671
3,KHKS-FM,Boom Boom Pow (BLACK EYED PEAS),,87.812984,90.387504,77.715568,86.123436,82.396947,80.492144,85.384060,...,0.140196,0.061677,-0.008659,,-0.084572,0.140196,0.061677,-0.008659,,-0.084572
4,KHKS-FM,Buttons (PUSSYCAT DOLLS),,86.516694,82.136233,91.069995,99.091280,87.231928,97.709518,80.352152,...,-0.108768,-0.008267,-0.233213,,0.145283,-0.108768,-0.008267,-0.233213,,0.145283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,WXKS-FM,Vegas (DOJA CAT),,84.072396,81.839631,82.003951,88.265450,78.923937,84.142427,78.469233,...,0.029666,0.030311,-0.127354,,-0.029104,-0.002008,0.061238,-0.124842,,0.037497
267,WXKS-FM,Victoria's Secret (JAX),,73.222142,64.877938,76.107715,69.238783,69.536516,80.209057,72.500145,...,-0.213936,0.119329,-0.152029,,0.237523,-0.173091,0.050335,0.044984,,0.195822
268,WXKS-FM,Woman (DOJA CAT),,83.718492,83.042887,81.353243,92.664696,79.354223,80.407841,72.435204,...,-0.013612,0.079455,-0.226460,,-0.005061,0.020347,0.052130,-0.279277,,-0.018829
269,WXKS-FM,You Right f/The Weeknd (DOJA CAT),,83.671756,84.169472,80.022909,90.194286,79.885553,79.546257,76.597232,...,0.029268,0.008617,-0.200608,,-0.031093,0.049264,0.045251,-0.177514,,-0.054413


In [162]:
out_stations = pd.unique(df_out_pvt.index.get_level_values(0))

In [163]:
with pd.ExcelWriter('Score_Gaps_by_Demographic_012023.xlsx') as writer:
    for stat in out_stations:
        idx = df_out_pvt[df_out_pvt.index.get_level_values(0) == stat].index
        df_out_pvt.loc[idx].to_excel(writer, sheet_name=stat)

In [164]:
with pd.ExcelWriter('Score_Gaps_by_Demographic_012023.xlsx') as writer:
    df_out_pvt.to_excel(writer, sheet_name='Master')