In [1]:
# import packages
import pandas as pd
import pickle as pkl
import os
import numpy as np

In [2]:
from sqlalchemy import create_engine

def postgresql_engine(user, pwd, host, port, dbname):
    # Need pyycopg2-binary package
    sql_engine = create_engine('postgres://' + user + ':' + pwd + '@' + host + ':' + port + '/' + dbname, echo=False)
    return sql_engine

In [3]:
# DB username and password
import getpass

user = getpass.getpass()
pwd = getpass.getpass()

In [4]:
# misc db parameters
host= 'adds-postgres-dev.cfgztrijqgvp.us-east-1.rds.amazonaws.com'
dbname= 'musiclab'
port= '5432'

In [5]:
# get callout research for songs released in the past 2 years
data_query_train = '''
Select *
from adds_temp.demo_rr_features_h1 as rdfh
where pop_all is not null
'''

In [5]:
filter_rules = '''and song_weeks_since_last_spins <=13
and ((song_last_test_co_weeks <=26)
or (song_last_test_omt_weeks <=104)
or format_code in ('u4','l2'))
and (((station_test_1_plus=0 and station_test_1_id=1) or station_test_1_plus>0) or format_code in ('u4','l2'))
and (format_code<>'h1' or station_id<>3323403)
and (format_code<>'c1' or station_id<>3322825)
and (format_code<>'a2' or station_id<>3322799 or gcr<>'G')
and taa_quintile is not null
and gcr_adj is not null
'''

In [7]:
print(data_query_train + filter_rules)


Select *
from adds_temp.demo_rr_features_h1 as rdfh
where pop_all is not null
and song_weeks_since_last_spins <=13
and ((song_last_test_co_weeks <=26)
or (song_last_test_omt_weeks <=104)
or format_code in ('u4','l2'))
and (((station_test_1_plus=0 and station_test_1_id=1) or station_test_1_plus>0) or format_code in ('u4','l2'))
and (format_code<>'h1' or station_id<>3323403)
and (format_code<>'c1' or station_id<>3322825)
and (format_code<>'a2' or station_id<>3322799 or gcr<>'G')
and taa_quintile is not null
and gcr_adj is not null



In [8]:
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as con:
    with con.connect():
        df_train = pd.read_sql(data_query_train + filter_rules, con=con)

In [9]:
#write to pkl file
df_train.to_pickle('df_train_2022_11_05_2020_11_28.pkl')

In [6]:
# read from pickle file
df_train = pd.read_pickle('df_train_2022_11_05_2020_11_28.pkl')

In [7]:
df_train.sort_values(by=['station_id', 'mediabase_id', 'breakout_name', 'week_dt', ], inplace=True)

In [8]:
# define demo segments and categories
breakout_category = {'*Core*': 'Core-Cume', '*Old*': 'Age', '*Young*': 'Age', 'Total': 'Total', 'White': 'Race',
                     'Non-Core': 'Core-Cume',
                     'Hispanic': 'Race', 'AA': 'Race', 'F': 'Gender', 'M': 'Gender', 'WAO': 'Race',
                     'F (25-29)': 'Gender', 'F (20-24)': 'Gender', 'F (18-29)': 'Gender', 'F (17-29)': 'Gender',
                     'F (20-23)': 'Gender', 'F (18-39)' : 'Gender',
                     'F (16-24)': 'Gender', 'F (30-34)': 'Gender', 'F (18-34)': 'Gender', 'F (24-29)': 'Gender',
                     'F (17-19)': 'Gender', 'F (15-26)': 'Gender', 'F (15-19)': 'Gender', 'F (15-24)': 'Gender',
                     'F (18-24)': 'Gender', 'F (20-29)': 'Gender', 'F (25-34)': 'Gender', 'F (Other)': 'Gender'}

breakout_map = {'*Core*': 'Core', '*Old*': 'Old', '*Young*': 'Young', 'Total': 'Total', 'White': 'White',
                'Non-Core': 'Non-Core',
                'Hispanic': 'Hispanic', 'AA': 'AA', 'F': 'Female', 'M': 'Male', 'WAO': 'White', 'F (25-29)': 'Female',
                'F (20-24)': 'Female', 'F (18-29)': 'Female', 'F (17-29)': 'Female', 'F (20-23)': 'Female',
                'F (16-24)': 'Female', 'F (30-34)': 'Female', 'F (18-34)': 'Female', 'F (24-29)': 'Female',
                'F (17-19)': 'Female', 'F (18-39)' : 'Female',
                'F (15-26)': 'Female', 'F (15-19)': 'Female', 'F (15-24)': 'Female',
                'F (18-24)': 'Female_(18-24)', 'F (20-29)': 'Female', 'F (25-34)': 'Female',
                'F (Other)': 'Female_Other'}

In [9]:
# create segment and category fields
df_train['segment'] = df_train['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))
df_train['demo_category'] = df_train['breakout_name'].apply(func=(lambda x: breakout_category[x] if x in breakout_category.keys() else None))

In [10]:
# drop misc female breakouts
drop_idx = df_train[(df_train['segment'] == 'Female') | (pd.isna(df_train['segment']))].index
df_train.drop(drop_idx, inplace=True)

In [11]:
# Drop songs with just a single score in the past 2 years
df_train_week_ct = pd.DataFrame(df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])['week_dt'].count())
drop_idx = df_train.join(df_train_week_ct[df_train_week_ct['week_dt'] == 1], on=['station_id', 'mediabase_id', 'breakout_id'], how='right', rsuffix='_r').index
df_train.drop(index=drop_idx, inplace=True)

In [10]:
df_train.groupby(['demo_category', 'segment', 'taa_quintile'])['mediabase_id'].count()

demo_category  segment         taa_quintile
Age            Old             1                2252
                               2                6295
                               3                6168
                               4                8090
                               5               21146
               Young           1                2252
                               2                6295
                               3                6173
                               4                8088
                               5               21143
Core-Cume      Core            1                2251
                               2                6293
                               3                6169
                               4                8089
                               5               21150
               Non-Core        1                2242
                               2                6266
                               3                6147
  

### Investigate columns with missing data

In [11]:
len(df_train[pd.isna(df_train['mediabase_id'])])

0

In [12]:
len(df_train[df_train['gcr'] == df_train['gcr_adj']])/len(df_train)

1.0

In [17]:
len(df_train.columns)

108

In [7]:
df_train.shape

(614649, 108)

In [8]:
df_train.groupby(['station_test_1_plus', 'station_test_1_id'])['mediabase_id'].count()

station_test_1_plus  station_test_1_id
0                    1                     58773
1                    0                    497747
                     1                     58129
Name: mediabase_id, dtype: int64

##### isolate numeric and categorical columns

In [12]:
# constants
num_cols_like = ['artist_count', 'feat_artist', 'feat_artist_song', 'mscore', 'spins','pop_prior',
                 'pop_artist_prior', 'song_age_weeks', 'song_last_test']
cat_cols_like = ['Market_Name', 'taa_quintile', 'segment', 'gcr', 'gcr_adj', 'omt_co_flag']
target = ['pop_all']
id_cols = ['mediabase_id', 'station_id', 'week_dt', 'breakout_id', 'breakout_name', 'demo_category', 'pop_co', 'pop_omt', 'gcr']

exclude_cols_like = ['date','song_last_test_any_weeks','song_last_test_co_weeks', 'song_last_test_omt_weeks',
                     'std_pop_prior', 'std_pop_artist_prior']#,'_unv']#,'univ_spins', 'market_spins']

In [13]:
id_cols = id_cols
target_col = target
exclude_cols = df_train.columns[df_train.columns.str.contains('|'.join(exclude_cols_like), regex=True)]

cat_cols = list(set(df_train.columns[df_train.columns.str.contains('|'.join(cat_cols_like), regex=True)]) - set(
    id_cols) - set(exclude_cols))

num_cols = list(set(df_train.select_dtypes(exclude=['object', 'datetime64']).columns) & set(
df_train.columns[(df_train.columns.str.contains('|'.join(num_cols_like), regex=True))]) - set(id_cols) - set(cat_cols) - set(exclude_cols))

feature_cols = list(set(list(num_cols) + list(cat_cols)))

In [14]:
exclude_cols

Index(['song_release_date', 'song_last_test_any_weeks',
       'song_last_test_co_weeks', 'song_last_test_omt_weeks', 'std_pop_prior',
       'mr_pop_prior_date', 'mr_pop_prior_unv_date', 'std_pop_artist_prior'],
      dtype='object')

##### check missingness by different feature types

In [34]:
# check missingness by instance (station-song combination) for categorical variables
cols_avl_cat = df_train.groupby(['station_id', 'mediabase_id'])[cat_cols].agg(lambda x: 1 - sum(pd.isnull(x))/(1.0*len(x))).reset_index()

In [35]:
cols_avl_cat.describe()

Unnamed: 0,station_id,mediabase_id,taa_quintile,gcr_adj,omt_co_flag,Market_Name,segment
count,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0
mean,3344216.0,2450651.0,1.0,1.0,1.0,1.0,1.0
std,174230.3,394312.8,0.0,0.0,0.0,0.0,0.0
min,3321797.0,1085550.0,1.0,1.0,1.0,1.0,1.0
25%,3322025.0,2355676.0,1.0,1.0,1.0,1.0,1.0
50%,3322828.0,2583267.0,1.0,1.0,1.0,1.0,1.0
75%,3323410.0,2720314.0,1.0,1.0,1.0,1.0,1.0
max,4762077.0,2848773.0,1.0,1.0,1.0,1.0,1.0


In [24]:
df_train[(df_train['station_id'] == 3321797) & (df_train['mediabase_id'] == 1086587) & (pd.isna(df_train['segment']))]

Unnamed: 0,mediabase_id,station_id,week_dt,artist_id,format_code,FirstLast,SongTitle,Market_Name,song_release_date,breakout_id,...,mean_pop_artist_prior,std_pop_artist_prior,count_pop_artist_prior,max_pop_artist_prior_unv,min_pop_artist_prior_unv,mean_pop_artist_prior_unv,count_pop_artist_prior_unv,mr_pop_artist_prior_unv,segment,demo_category
2125,1086587,3321797,2022-07-31,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,,,,,,,,,,
2128,1086587,3321797,2022-08-21,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,64.0,,1.0,64.0,64.0,64.0,1.0,64.0,,
2132,1086587,3321797,2022-09-04,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.0,9.899495,2.0,78.0,64.0,68.666667,3.0,78.0,,
2133,1086587,3321797,2022-09-18,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.333333,7.023769,3.0,78.0,64.0,70.0,6.0,72.0,,
2136,1086587,3321797,2022-10-02,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,72.0,5.887841,4.0,78.0,64.0,70.8,10.0,74.0,,
2138,1086587,3321797,2022-10-16,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,70.6,5.98331,5.0,78.0,64.0,70.733333,15.0,65.0,,
2139,1086587,3321797,2022-10-30,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.5,5.787918,6.0,78.0,64.0,70.952381,21.0,76.0,,
2141,1086587,3321797,2022-11-27,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,72.714286,6.183696,7.0,80.0,64.0,71.392857,28.0,80.0,,
2087,1086587,3321797,2022-07-31,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428762,...,,,,,,,,,,
2090,1086587,3321797,2022-08-21,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428762,...,59.0,,1.0,59.0,59.0,59.0,1.0,59.0,,


In [72]:
# Investigate taa_quintile
# df_temp = df_train[pd.isna(df_train['taa_quintile'])][['station_id', 'mediabase_id', 'week_dt']]
# len(df_temp)
# df_temp.groupby(['week_dt']).agg({'mediabase_id':len}).sort_values(by=['mediabase_id'], ascending=False)
# df_temp.groupby(['mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False)
# df_temp.groupby(['station_id', 'mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False)
# idx = (df_train['station_id'] == 3323400) & (df_train['mediabase_id'] == 2591543)
# df_train.loc[idx].groupby(['week_dt'])['mediabase_id'].count()
# idx = (df_train['station_id'] == 3323400) & (df_train['mediabase_id'] == 2591543) & (pd.isna(df_train['taa_quintile'])) & (df_train['breakout_name'] == 'Total')
# df_train.loc[idx]

In [149]:
# Investigate gcr and gcr_adj
# df_temp = df_train[(pd.isna(df_train['gcr_adj'])) & (~pd.isna(df_train['gcr']))][['station_id', 'mediabase_id', 'week_dt']]
# len(df_temp)
# print(df_temp.groupby(['week_dt']).agg({'mediabase_id':len}).sort_values(by=['mediabase_id'], ascending=False))
# print(df_temp.groupby(['mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False))
# print(df_temp.groupby(['station_id', 'mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False))
# idx = (df_train['station_id'] == 3322002) & (df_train['mediabase_id'] == 2294907)
# df_train.loc[idx].groupby(['week_dt'])['mediabase_id'].count()
# idx = (df_train['station_id'] == 3323404) & (df_train['mediabase_id'] == 2629560) & (pd.isna(df_train['gcr_adj'])) & (df_train['breakout_name'] == 'Total')
# df_train.loc[idx]
# [np.min(df_temp['week_dt']), np.max(df_temp['week_dt'])]

[datetime.date(2020, 11, 22), datetime.date(2021, 1, 24)]

###### Investigate Numeric Columns

In [14]:
# Investigate Numeric Columns
num_cols_spins = [col for col in num_cols if 'spins' in col]
num_cols_pop = [col for col in num_cols if 'pop' in col]
num_cols_other = list(set(num_cols) - set(num_cols_spins) - set(num_cols_pop))
num_cols_spins_perc = [i for i in num_cols_spins if (('perc_diff_' in i) or ('per_diff' in i))]
num_cols_spins_nonperc = list(set(num_cols_spins) - set(num_cols_spins_perc))

In [17]:
[len(num_cols), len(num_cols_spins), len(num_cols_pop), len(num_cols_other)]

[75, 52, 19, 4]

In [18]:
num_cols_other

['artist_count', 'feat_artist', 'song_age_weeks', 'feat_artist_song']

In [19]:
num_cols_pop

['min_pop_prior',
 'mean_pop_prior_unv',
 'min_pop_artist_prior_unv',
 'count_pop_artist_prior_unv',
 'max_pop_artist_prior_unv',
 'mean_pop_artist_prior_unv',
 'mr_pop_prior_unv',
 'max_pop_prior',
 'mr_pop_prior',
 'count_pop_artist_prior',
 'max_pop_artist_prior',
 'min_pop_artist_prior',
 'mr_pop_artist_prior_unv',
 'mean_pop_artist_prior',
 'med_pop_prior',
 'min_pop_prior_unv',
 'count_pop_prior_unv',
 'mean_pop_prior',
 'max_pop_prior_unv']

In [20]:
df_train.shape

(377628, 110)

In [68]:
print(581054 + 33595)

614649


In [15]:
# Backfill pop based data
df_train[num_cols_pop] = df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_pop].bfill()

In [16]:
# Fill missing perc spin diffs with 1.0
df_train[num_cols_spins_perc] = df_train[num_cols_spins_perc].transform(lambda x: x.fillna(1.0))

In [17]:
# Backfill non perc diff spins diff
df_train[num_cols_spins_nonperc] = df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_spins_nonperc].bfill()

In [19]:
# Write processed train set to pickle file
df_train.to_pickle('df_train_2022_11_05_2020_11_28_processed.pkl')

In [43]:
num_cols_spins_nonperc

['avg_market_spins_prior',
 'avg_station_artist_spins_prior',
 'mr_market_spins_spins',
 'mr_song_univ_spins_prior',
 'market_artist_spins',
 'song_univ_spins',
 'avg_artist_univ_spins_prior',
 'total_market_artist_spins_prior',
 'total_station_artist_spins_prior',
 'song_market_weeks_since_first_spins',
 'diff_spins_song_market_prior',
 'song_weeks_since_last_spins',
 'station_spins',
 'mr_artist_univ_spins',
 'diff_market_spins_spins_prior',
 'avg_song_univ_spins_prior',
 'artist_weeks_since_first_spins',
 'total_market_spins_prior',
 'diff_artist_univ_spins_prior',
 'diff_spins_song_station_prior',
 'mr_spins_artist_station_prior',
 'format_spins',
 'artist_station_weeks_since_first_spins',
 'artist_univ_spins',
 'song_weeks_since_first_spins',
 'diff_market_artist_spins_prior',
 'diff_spins_artist_station_prior',
 'spins_non_on',
 'total_spins_song_station_prior',
 'diff_song_univ_spins_prior',
 'total_song_univ_spins_prior',
 'avg_spins_song_station_prior',
 'spins_total',
 'avg_m

In [20]:
# check missingness by instance (station-song combination) for categorical variables
cols_avl_num = df_train.groupby(['station_id', 'mediabase_id', 'breakout_name'])[num_cols].agg(lambda x: 1 - sum(pd.isnull(x))/(1.0*len(x))).reset_index()

In [21]:
cols_avl_num = cols_avl_num.join(pd.DataFrame(df_train.groupby(['station_id', 'mediabase_id', 'breakout_name']).count()['week_dt']), on=['station_id', 'mediabase_id', 'breakout_name'], rsuffix='_r')

In [22]:
cols_avl_num[num_cols].mean().reset_index().sort_values(by=[0])

Unnamed: 0,index,0
0,spins_non_on,1.0
53,total_market_spins_prior,1.0
52,mean_pop_artist_prior_unv,1.0
51,mr_artist_univ_spins,1.0
50,diff_market_spins_spins_prior,1.0
...,...,...
22,mean_pop_prior,1.0
21,station_spins,1.0
20,format_spins,1.0
18,total_song_univ_spins_prior,1.0


In [48]:
idx = df_train[cat_cols + num_cols].dropna(axis=1).index
df_train.loc[idx].shape

(377628, 110)

In [49]:
df_train.shape

(377628, 110)

In [93]:
cols_avl_num[(cols_avl_num['song_last_test_co_weeks'] < 1) & (cols_avl_num['breakout_name'] == 'Total')][['station_id', 'mediabase_id', 'breakout_name', 'week_dt']]

Unnamed: 0,station_id,mediabase_id,breakout_name,week_dt
2788,3321799,1388281,Total,2
2812,3321799,1640575,Total,2
7090,3322002,1243640,Total,2
7100,3322002,1249237,Total,2
7134,3322002,1261285,Total,3
...,...,...,...,...
56105,3323602,2397182,Total,2
56124,3323602,2422489,Total,3
56130,3323602,2422949,Total,2
56179,3323602,2445876,Total,3


In [95]:
df_train[(df_train['station_id'] == 3323602) & (df_train['mediabase_id'] == 2348056) & (df_train['breakout_name'] == 'Total')][['station_id', 'mediabase_id', 'breakout_name', 'week_dt', 'song_last_test_any_weeks','song_last_test_co_weeks', 'song_last_test_omt_weeks']]

Unnamed: 0,station_id,mediabase_id,breakout_name,week_dt,song_last_test_any_weeks,song_last_test_co_weeks,song_last_test_omt_weeks
64814,3323602,2348056,Total,2020-11-22,0.0,,0.0
64815,3323602,2348056,Total,2021-02-07,0.0,0.0,1.0
64816,3323602,2348056,Total,2021-09-26,0.0,8.0,0.0
64817,3323602,2348056,Total,2022-08-21,0.0,0.0,1.0


### Prep Data and create train/test splits

In [60]:
id_cols

['mediabase_id',
 'station_id',
 'week_dt',
 'breakout_id',
 'breakout_name',
 'demo_category',
 'pop_co',
 'pop_omt',
 'gcr']

In [18]:
# Extract train data
[np.min(df_train['week_dt']),pd.to_datetime(np.min(df_train['week_dt'])) + np.timedelta64(2,'Y'), np.max(df_train['week_dt'])]
scoring_date = pd.to_datetime('2022-11-22')
train_idx = df_train['week_dt'] < scoring_date.date()

df_train_final = df_train.loc[train_idx][id_cols + feature_cols + target_col]
X_train = pd.get_dummies(df_train_final[feature_cols], columns=cat_cols)
y_train = df_train_final[target]

In [56]:
demo_cats = list(set(breakout_category.values()) - set(['Total']))

In [30]:
X_train.shape

(369153, 125)

In [32]:
# imports for model training
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.metrics import make_scorer, mean_pinball_loss

In [34]:
low_alpha = 0.05
high_alpha = 0.95

param_grid = dict(
    learning_rate=[.2, .1, .05],
    n_estimators=[5, 10, 15],
    max_depth=[2, 4, 6],
    min_samples_leaf=[5, 10, 20],
    min_samples_split=[5, 10, 20]
)

n_iter = 50
n_splits = 5

In [19]:
import time
demo_cols_all = [col for col in X_train.columns if 'segment_' in col]

In [34]:
best_scores = {}
best_estimators = {}

for cat in demo_cats:
    tic = time.perf_counter()
    idx = (df_train_final['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_train_final.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_train.columns) - set(demo_cols_excl))

    # create features and target
    X = X_train.loc[idx][feature_cols_cat]
    y = y_train.loc[idx]

    # quantile regressor

    # gradient boosted quantile regressor
    group_kfold = GroupKFold(n_splits=n_splits)

    # train model for upper threshold given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    neg_mean_pinball_loss_high = make_scorer(
        mean_pinball_loss,
        alpha=high_alpha,
        greater_is_better=False,  # maximize the negative loss
    )

    model_high_thresh = GradientBoostingRegressor(loss="quantile", alpha=high_alpha,
                                                  random_state=0)

    rs_high_thresh = RandomizedSearchCV(
        model_high_thresh,
        param_grid,
        n_iter=n_iter,
        scoring=neg_mean_pinball_loss_high,
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_high_thresh.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for upper wobble threshold completed")

    # train model for lower threshold given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    neg_mean_pinball_loss_low = make_scorer(
        mean_pinball_loss,
        alpha=low_alpha,
        greater_is_better=False,  # maximize the negative loss
    )

    model_low_thresh = GradientBoostingRegressor(loss="quantile", alpha=low_alpha,
                                                 random_state=0)

    rs_low_thresh = RandomizedSearchCV(
        model_low_thresh,
        param_grid,
        n_iter=n_iter,
        scoring=neg_mean_pinball_loss_low,
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_low_thresh.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for lower wobble threshold completed")

    # train model for mean pop score given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    model_mean = GradientBoostingRegressor(loss="squared_error")

    rs_mean = RandomizedSearchCV(
        model_mean,
        param_grid,
        n_iter=n_iter,
        scoring='neg_mean_absolute_error',
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_mean.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for mean pop completed")

    toc = time.perf_counter()
    time_elapsed = toc-tic
    print('Total time elapsed for ' + cat + ': ' + '%.2f'%time_elapsed)

    best_scores[cat] = [rs_low_thresh.best_score_, rs_mean.best_score_, rs_high_thresh.best_score_]
    best_estimators[cat] = [rs_low_thresh.best_estimator_, rs_mean.best_estimator_, rs_high_thresh.best_estimator_]

Core-Cume: 85772


NameError: name 'GroupKFold' is not defined

In [51]:
import pickle
pickle.dump(best_scores, open('best_scores_all.pkl', "wb"))
pickle.dump(best_estimators, open('best_estimators_all.pkl', "wb"))

In [20]:
best_estimators = pd.read_pickle('best_estimators_all.pkl')

In [22]:
best_estimators

{'Age': [GradientBoostingRegressor(alpha=0.05, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0),
  GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15),
  GradientBoostingRegressor(alpha=0.95, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0)],
 'Gender': [GradientBoostingRegressor(alpha=0.05, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0),
  GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=10,
  

### Prep scoring data & score


In [37]:
test_data_query = '''
Select *
from adds_temp.demo_rr_features_h1 as rdfh
where week_dt >= '2022-12-26'
'''

In [38]:
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.connect():
        df_test = pd.read_sql(test_data_query + filter_rules, conn)

In [39]:
df_test.shape

(587119, 108)

In [None]:
# df_test.to_pickle('df_score_2021_01_09_2020_11_28.pkl')

In [24]:
# df_test = pd.read_pickle('df_score_2021_01_09_2020_11_28.pkl')

In [40]:
# create segment and category fields
df_test['segment'] = df_test['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))
df_test['demo_category'] = df_test['breakout_name'].apply(func=(lambda x: breakout_category[x] if x in breakout_category.keys() else None))

In [41]:
# drop misc female breakouts
drop_idx = df_test[(df_test['segment'] == 'Female') | (pd.isna(df_test['segment']))].index
df_test.drop(drop_idx, inplace=True)

In [42]:
# Backfill pop based data
df_test[num_cols_pop] = df_test.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_pop].bfill()

In [43]:
# Fill missing perc spin diffs with 1.0
df_test[num_cols_spins_perc] = df_test[num_cols_spins_perc].transform(lambda x: x.fillna(1.0))

In [44]:
# Backfill non perc diff spins diff
df_test[num_cols_spins_nonperc] = df_test.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_spins_nonperc].bfill()

In [33]:
pd.unique(df_test['week_dt'])

array([datetime.date(2023, 1, 1), datetime.date(2023, 1, 8),
       datetime.date(2023, 1, 15), datetime.date(2023, 1, 22),
       datetime.date(2023, 1, 29), datetime.date(2023, 2, 5),
       datetime.date(2023, 2, 12), datetime.date(2023, 2, 19)],
      dtype=object)

In [32]:
drop_idx = df_test[pd.to_datetime(df_test['week_dt']) == pd.to_datetime('2022-12-18')].index
df_test.drop(drop_idx, inplace=True)

In [41]:
df_test[(df_test['breakout_name'] == 'Total')].groupby(['station_id', 'week_dt']).apply(lambda x: len(pd.unique(x['mediabase_id'])))

station_id  week_dt   
3321797     2022-12-18    240
            2022-12-25    240
            2023-01-01    234
            2023-01-08    232
            2023-01-15    226
                         ... 
4762077     2023-01-08    100
            2023-01-15     99
            2023-01-22     99
            2023-01-29     99
            2023-02-05     97
Length: 232, dtype: int64

In [33]:
df_test[(df_test['station_id'] == 3322022) & (df_test['breakout_name'] == 'Total')].groupby(['week_dt'])['mediabase_id'].count()

week_dt
2022-12-11    289
2022-12-18    290
2022-12-25    288
2023-01-01    290
2023-01-08    295
2023-01-15    292
2023-01-22    289
2023-01-29    282
Name: mediabase_id, dtype: int64

In [33]:
df_test[num_cols].apply(lambda x: 1 - pd.isnull(x).sum()/len(x)).sort_values()

mr_pop_prior                       0.980531
max_pop_prior                      0.980531
count_pop_artist_prior             0.980531
mean_pop_artist_prior              0.980531
mean_pop_prior                     0.980531
                                     ...   
diff_spins_song_market_prior       1.000000
diff_market_artist_spins_prior     1.000000
total_market_artist_spins_prior    1.000000
mr_spins_song_market_prior         1.000000
avg_song_univ_spins_prior          1.000000
Length: 75, dtype: float64

In [34]:
df_test[cat_cols].apply(lambda x: 1 - pd.isnull(x).sum()/len(x)).sort_values()

omt_co_flag     0.085941
Market_Name     1.000000
taa_quintile    1.000000
segment         1.000000
gcr_adj         1.000000
dtype: float64

In [45]:
df_test['omt_co_flag'] = df_test['omt_co_flag'].fillna(0)

In [37]:
df_test[pd.isna(df_test['med_pop_prior'])]['mediabase_id'].drop_duplicates()

6362      1165871
14845     1186003
37952     1269144
62822     1436100
69145     1491192
72656     1502744
108046    1686075
144812    1801771
157912    1842682
182811    1932585
188820    1969359
208110    2062455
235840    2154641
240743    2175458
264691    2267512
275131    2294874
282374    2306701
364513    2522450
373446    2544637
387103    2552613
524681    2738262
569767    2781499
627530    2823991
631386    2827240
648129    2848768
653618    2863849
654272    2865765
654441    2838198
Name: mediabase_id, dtype: int64

In [101]:
df_test[(df_test['mediabase_id'] == 1354780) & (df_test['breakout_name'] == 'Total') & (df_test['station_id'] == 3322916)][id_cols + ['med_pop_prior']]

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,med_pop_prior
36963,1354780,3322916,2022-11-27,1,Total,Total,,,G,
36964,1354780,3322916,2022-12-04,1,Total,Total,,,G,
36965,1354780,3322916,2022-12-11,1,Total,Total,,,G,
36966,1354780,3322916,2022-12-18,1,Total,Total,,,G,
36967,1354780,3322916,2022-12-25,1,Total,Total,,,G,
36968,1354780,3322916,2023-01-01,1,Total,Total,,,G,
36969,1354780,3322916,2023-01-08,1,Total,Total,109.0,,G,


In [48]:
df_test.shape

(387072, 110)

In [47]:
drop_idx = list(set(df_test.index) - set(df_test[feature_cols].dropna().index))
df_test.drop(drop_idx, inplace=True)

#### score all stations

In [49]:
scoring_date = pd.to_datetime('2023-01-01')
test_idx = (df_test['week_dt'] >= scoring_date.date()) # & (df_test['station_id'] == 3322022)

df_test_final = df_test.loc[test_idx][id_cols + feature_cols + target_col]
X_test = pd.get_dummies(df_test_final[feature_cols], columns=cat_cols)

missing_cols = list(set(X_train.columns) - set(X_test.columns))

y_test = df_test_final[target]

In [50]:
df_test_final.shape

(387072, 90)

In [53]:
df_test_final[feature_cols].dropna()

Unnamed: 0,count_pop_artist_prior_unv,total_market_spins_prior,song_univ_spins,mr_spins_song_station_prior,omt_co_flag,total_spins_song_station_prior,per_diff_artist_univ_spins_prior,song_weeks_since_last_spins,mr_pop_prior,max_pop_artist_prior,...,total_song_univ_spins_prior,spins_non_on,mr_spins_song_market_prior,med_pop_prior,artist_univ_spins,total_artist_univ_spins_prior,max_pop_artist_prior_unv,mean_pop_prior,perc_diff_spins_song_station_prior,mean_pop_artist_prior
0,7.0,6766.0,65.0,4,0,3324.0,0.000000,0.0,90.0,90.0,...,12960.0,3,4,90.0,65.0,12961.0,99.0,90.000000,-0.250000,90.000000
1,7.0,6812.0,72.0,3,0,3328.0,0.107692,1.0,90.0,90.0,...,13025.0,4,3,90.0,72.0,13026.0,99.0,90.000000,0.333333,90.000000
2,7.0,6865.0,64.0,4,0,3333.0,0.000000,0.0,90.0,90.0,...,13097.0,4,4,90.0,64.0,13098.0,99.0,90.000000,0.000000,90.000000
3,7.0,6910.0,66.0,4,0,3338.0,0.000000,0.0,90.0,90.0,...,13161.0,3,4,90.0,66.0,13162.0,99.0,90.000000,-0.250000,90.000000
4,7.0,6957.0,67.0,3,0,3341.0,0.000000,0.0,90.0,90.0,...,13227.0,4,3,90.0,67.0,13228.0,99.0,90.000000,0.333333,90.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
587114,21581.0,7476.0,941.0,28,0,3392.0,0.000000,1.0,84.0,107.0,...,218580.0,27,28,82.0,1975.0,379979.0,120.0,80.052632,-0.035714,81.369565
587115,22063.0,7536.0,667.0,27,0,3432.0,-0.213165,0.0,84.0,107.0,...,219521.0,29,27,82.0,1554.0,381954.0,120.0,80.052632,0.074074,81.369565
587116,22438.0,7595.0,814.0,29,CO_only,3473.0,0.000000,0.0,84.0,107.0,...,220188.0,26,29,82.0,1405.0,383508.0,120.0,80.052632,-0.103448,81.369565
587117,23142.0,7661.0,769.0,26,0,3514.0,0.000000,0.0,92.0,107.0,...,221002.0,34,26,82.5,1138.0,384913.0,120.0,80.650000,0.307692,80.807692


In [43]:
set(X_train.columns) - set(X_test.columns)

{'Market_Name_Atlanta',
 'Market_Name_Austin',
 'Market_Name_Baltimore',
 'Market_Name_Boston',
 'Market_Name_Charlotte',
 'Market_Name_Chicago',
 'Market_Name_Cincinnati',
 'Market_Name_Columbus, OH',
 'Market_Name_Dallas',
 'Market_Name_Denver',
 'Market_Name_Detroit',
 'Market_Name_Miami',
 'Market_Name_Minneapolis',
 'Market_Name_Nashville',
 'Market_Name_New York',
 'Market_Name_Orlando',
 'Market_Name_Philadelphia',
 'Market_Name_Phoenix',
 'Market_Name_Pittsburgh',
 'Market_Name_Portland, OR',
 'Market_Name_Raleigh',
 'Market_Name_Salt Lake City',
 'Market_Name_San Diego',
 'Market_Name_San Francisco',
 'Market_Name_Seattle',
 'Market_Name_St. Louis',
 'Market_Name_Tampa',
 'Market_Name_Washington, DC',
 'omt_co_flag_OMT_CO',
 'omt_co_flag_OMT_only',
 'segment_AA'}

In [52]:
for i in missing_cols:
    X_test[i] = 0

In [54]:
X_test = X_test[X_train.columns]

In [53]:
pd.unique(X_test['omt_co_flag_CO_only'])

array([0, 1], dtype=uint8)

In [57]:
df_out = pd.DataFrame(columns=['lower_wob_thresh', 'mean_pop_predicted', 'upper_wobble_thresh'])
for cat in demo_cats:
    #tic = time.perf_counter()
    idx = (df_test_final['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_test_final.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_test.columns) - set(demo_cols_excl))

    # create features and target
    X = X_test.loc[idx][feature_cols_cat]
    # if cat == 'Race':
    #     X['segment_AA'] = 0
    y = y_test.loc[idx]
    #print(X)

    # create empty dataframe
    df_temp = pd.DataFrame()
    # predict using estimator

    #re-arrange features

    df_temp['lower_wob_thresh'] = pd.DataFrame(best_estimators[cat][0].predict(X[best_estimators[cat][0].feature_names_in_]), index=X_test.loc[idx].index)
    df_temp['mean_pop_predicted'] = pd.DataFrame(best_estimators[cat][1].predict(X[best_estimators[cat][1].feature_names_in_]), index=X_test.loc[idx].index)
    df_temp['upper_wobble_thresh'] = pd.DataFrame(best_estimators[cat][2].predict(X[best_estimators[cat][2].feature_names_in_]), index=X_test.loc[idx].index)

    df_out = pd.concat([df_out,df_temp], axis=0)

Race: 67827
Core-Cume: 96120
Gender: 78885
Age: 96160


In [55]:
df_out

Unnamed: 0,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh
5,73.415541,90.507888,104.557081
6,74.544976,90.507888,104.829390
7,74.544976,90.507888,104.557081
8,74.544976,90.507888,104.557081
9,74.558265,90.507888,104.557081
...,...,...,...
650801,55.753489,70.714955,84.586848
650802,55.284164,71.350782,86.625285
650803,54.676082,71.904296,89.177677
650804,55.585284,71.469858,89.174522


#### score format alone without KIIS-FM

In [43]:
scoring_date = pd.to_datetime('2023-01-01')
test_idx_fmt = (df_test['week_dt'] >= scoring_date.date())

df_test_final_fmt = df_test.loc[test_idx_fmt][id_cols + ['total_respondents'] + feature_cols + target_col]
X_test_fmt = pd.get_dummies(df_test_final_fmt[feature_cols], columns=cat_cols)

missing_cols_fmt = list(set(X_train.columns) - set(X_test_fmt.columns))

y_test_fmt = df_test_final_fmt[target]

In [49]:
missing_cols_fmt

['omt_co_flag_OMT_CO', 'omt_co_flag_OMT_only']

In [44]:
for i in missing_cols_fmt:
    X_test_fmt[i] = 0

In [45]:
X_test_fmt = X_test_fmt[X_train.columns]

In [46]:
df_out_fmt = pd.DataFrame(columns=['lower_wob_thresh', 'mean_pop_predicted', 'upper_wobble_thresh'])
for cat in demo_cats:
    #tic = time.perf_counter()
    idx = (df_test_final_fmt['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_test_final_fmt.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_test.columns) - set(demo_cols_excl))

    # create features and target

    X = X_test_fmt.loc[idx][feature_cols_cat]
    X.dropna(inplace=True)
    idx_new = X.index
    print(cat + ': ' + str(len(idx_new)))
    y = y_test_fmt.loc[idx_new]
    #print(X)

    # create empty dataframe
    df_temp_fmt = pd.DataFrame()
    # predict using estimator

    #re-arrange features

    df_temp_fmt['lower_wob_thresh'] = pd.DataFrame(best_estimators[cat][0].predict(X[best_estimators[cat][0].feature_names_in_]), index=X_test_fmt.loc[idx_new].index)
    df_temp_fmt['mean_pop_predicted'] = pd.DataFrame(best_estimators[cat][1].predict(X[best_estimators[cat][1].feature_names_in_]), index=X_test_fmt.loc[idx_new].index)
    df_temp_fmt['upper_wobble_thresh'] = pd.DataFrame(best_estimators[cat][2].predict(X[best_estimators[cat][2].feature_names_in_]), index=X_test_fmt.loc[idx_new].index)

    df_out_fmt = pd.concat([df_out_fmt,df_temp_fmt], axis=0)

Race: 68156
Race: 68156
Core-Cume: 94746
Core-Cume: 94746
Age: 94788
Age: 94788
Gender: 77586
Gender: 77586


In [53]:
df_out_fmt

Unnamed: 0,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh
20,71.647261,89.997511,98.699385
21,69.020207,90.223598,98.529371
22,71.647261,89.997511,98.699385
23,71.647261,89.997511,98.699385
24,71.647261,89.997511,98.699385
...,...,...,...
656721,59.299788,76.027729,91.449372
656722,59.299788,75.842813,91.449372
656723,60.783936,76.315299,91.449372
656724,60.962091,75.842813,91.449372


In [58]:
# added this line of code since entire format being scored - 02/14/2023
df_out_fmt = df_out

In [63]:
df_test_final_fmt = df_test_final

In [59]:
df_out.to_pickle('df_out_stage_02192023_02262021.pkl')
df_out_fmt.to_pickle('df_out_fmt_stage_02192023_02262021.pkl')

In [3]:
df_out = pd.read_pickle('df_out_stage_02122023_02192021.pkl')
df_out_fmt = pd.read_pickle('df_out_fmt_stage_02122023_02192021.pkl')

In [4]:
df_out

Unnamed: 0,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh
282,69.707349,87.357206,97.580157
283,69.707349,87.357206,97.580157
284,69.707349,86.938477,97.246663
285,68.326534,86.938477,97.246663
286,68.326534,86.938477,97.580157
...,...,...,...
663488,54.498126,67.784587,87.376244
663489,54.498126,68.866443,88.720244
663490,54.498126,69.158211,88.720244
663491,54.498126,68.866443,88.720244


### Process output & write to Excel

In [60]:
df_out_final = df_test_final[id_cols + ['taa_quintile']].join(df_out, how='left')

In [61]:
df_out_final['segment'] = df_out_final['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))

In [64]:
df_out_final_fmt = df_test_final_fmt[id_cols + ['taa_quintile']].join(df_out_fmt, how='left')

In [65]:
df_out_final_fmt['segment'] = df_out_final_fmt['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))

In [67]:
df_out_final

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,taa_quintile,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment
1,1085550,3322916,2023-01-22,1,Total,Total,,,G,5,,,,Total
2,1085550,3322916,2023-01-29,1,Total,Total,,,G,5,,,,Total
3,1085550,3322916,2023-02-05,1,Total,Total,,,G,5,,,,Total
5,1085550,3322916,2022-12-18,317542,*Core*,Core-Cume,,,G,5,73.415541,90.507888,104.557081,Core
6,1085550,3322916,2022-12-25,317542,*Core*,Core-Cume,,,G,5,74.544976,90.507888,104.829390,Core
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650843,2848768,3323195,2023-01-08,-2,F (Other),Gender,70.0,,C,2,54.918988,71.920493,84.613301,Female_Other
650844,2848768,3323195,2023-01-15,-2,F (Other),Gender,,,C,3,54.852231,71.758539,86.083429,Female_Other
650845,2848768,3323195,2023-01-22,-2,F (Other),Gender,,,C,3,54.407057,71.758539,85.782629,Female_Other
650846,2848768,3323195,2023-01-29,-2,F (Other),Gender,,,C,4,55.399057,72.143288,85.339670,Female_Other


In [66]:
# song-artist lookup
song_query = '''
Select mediabase_id, song_name, artist_name
from data.songs_v as sv
'''
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_song_lookup = pd.read_sql(song_query, con=conn)

In [67]:
station_query = '''
Select distinct station_id, call_letters
from data.stations_v as sv
'''

engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_station_lookup = pd.read_sql(station_query, con=conn)

In [68]:
df_song_lookup.set_index(['mediabase_id'], inplace=True)

In [69]:
df_station_lookup.set_index(['station_id'], inplace=True)

In [70]:
df_song_lookup['song_artist']  = df_song_lookup['song_name'] + ' (' + df_song_lookup['artist_name'] + ')'

In [65]:
df_song_lookup

Unnamed: 0_level_0,song_name,artist_name,song_artist
mediabase_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1147620,Mr. Crowley (Live),OZZY OSBOURNE,Mr. Crowley (Live) (OZZY OSBOURNE)
1960216,Street Fighting Man (Live '13),ROLLING STONES,Street Fighting Man (Live '13) (ROLLING STONES)
2691117,Storybook Ending,BLUMES,Storybook Ending (BLUMES)
1804878,You Take My... (Live '76),QUEEN,You Take My... (Live '76) (QUEEN)
2131197,Hero Of The Day (Live),METALLICA,Hero Of The Day (Live) (METALLICA)
...,...,...,...
2437613,Valentine's Day,QUEEN OF THE MEADOW,Valentine's Day (QUEEN OF THE MEADOW)
2667991,Exaggeration,JUAN HAZE,Exaggeration (JUAN HAZE)
2438195,Every Week,DQ4E,Every Week (DQ4E)
2623051,Your Story Is Over!,AYREON,Your Story Is Over! (AYREON)


In [72]:
df_out_final['song_artist'] = df_out_final.join(df_song_lookup, on=['mediabase_id'], how='left')['song_artist']

ValueError: columns overlap but no suffix specified: Index(['song_artist'], dtype='object')

In [73]:
df_out_final['call_letters'] = df_out_final.join(df_station_lookup, on=['station_id'], how='left')['call_letters']

In [74]:
df_out_final['wobble_flag'] = df_out_final.apply(lambda x: int((x['pop_co'] < np.floor(x['lower_wob_thresh'])) | (x['pop_co'] > np.ceil(x['upper_wobble_thresh']))), axis=1)

In [75]:
df_out_final.to_pickle('df_out_final_CHR_2023_01_01_2023_02_19.pkl')

In [76]:
df_out_final_fmt['song_artist'] = df_out_final_fmt.join(df_song_lookup, on=['mediabase_id'], how='left')['song_artist']

In [77]:
df_out_final_fmt['call_letters'] = df_out_final_fmt.join(df_station_lookup, on=['station_id'], how='left')['call_letters']

In [78]:
df_out_final_fmt['wobble_flag'] = df_out_final_fmt.apply(lambda x: int((x['pop_co'] < np.floor(x['lower_wob_thresh'])) | (x['pop_co'] > np.ceil(x['upper_wobble_thresh']))), axis=1)

In [80]:
df_out_final_fmt.to_pickle('df_out_final_fmt_CHR_2023_01_01_2023_02_19.pkl')

In [5]:
df_out_final = pd.read_pickle('df_out_final_CHR_2022_12_25_2023_02_12.pkl')

In [9]:
df_out_final[(df_out_final['call_letters'] == 'KBKS-FM') & (df_out_final['song_artist'] == 'abcdefu (GAYLE)')]

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,taa_quintile,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment,song_artist,call_letters,wobble_flag
551695,2760708,3321797,2022-12-25,-2,F (Other),Gender,,,R,2,60.344388,76.647103,93.399436,Female_Other,abcdefu (GAYLE),KBKS-FM,0
551696,2760708,3321797,2023-01-01,-2,F (Other),Gender,,,R,2,62.207474,77.797221,93.130636,Female_Other,abcdefu (GAYLE),KBKS-FM,0
551697,2760708,3321797,2023-01-08,-2,F (Other),Gender,,,R,2,61.867408,77.797221,93.130636,Female_Other,abcdefu (GAYLE),KBKS-FM,0
551698,2760708,3321797,2023-01-15,-2,F (Other),Gender,,,R,2,62.125284,78.695010,93.130636,Female_Other,abcdefu (GAYLE),KBKS-FM,0
551699,2760708,3321797,2023-01-22,-2,F (Other),Gender,,,R,2,60.344388,76.788446,93.399436,Female_Other,abcdefu (GAYLE),KBKS-FM,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551788,2760708,3321797,2023-01-15,412759,F (18-24),Gender,,,R,2,48.776339,66.943894,86.120871,Female_(18-24),abcdefu (GAYLE),KBKS-FM,0
551789,2760708,3321797,2023-01-22,412759,F (18-24),Gender,,,R,2,47.283107,66.943894,86.641831,Female_(18-24),abcdefu (GAYLE),KBKS-FM,0
551790,2760708,3321797,2023-01-29,412759,F (18-24),Gender,,,R,2,48.619369,66.943894,86.120871,Female_(18-24),abcdefu (GAYLE),KBKS-FM,0
551791,2760708,3321797,2023-02-05,412759,F (18-24),Gender,,,R,2,48.776339,66.489081,85.720871,Female_(18-24),abcdefu (GAYLE),KBKS-FM,0


### Read pre-written files

In [126]:
df_out_final = pd.read_pickle('df_out_final_KIIS-FM_2022_12_04_2023_01_22.pkl')

In [125]:
df_out_final_fmt = pd.read_pickle('df_out_final_H1_2022_12_04_2023_01_22.pkl')

In [113]:
df_out_final_fmt

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,taa_quintile,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment,song_artist,call_letters,wobble_flag
2,1085550,3322808,2022-12-04,412759,F (18-24),Gender,,,G,5,69.020207,89.997511,98.529371,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
3,1085550,3322808,2022-12-11,412759,F (18-24),Gender,,,G,5,71.647261,89.997511,98.699385,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
4,1085550,3322808,2022-12-18,412759,F (18-24),Gender,,,G,5,69.020207,90.223598,98.529371,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
5,1085550,3322808,2022-12-25,412759,F (18-24),Gender,,,G,5,71.647261,89.997511,98.699385,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
6,1085550,3322808,2023-01-01,412759,F (18-24),Gender,,,G,5,71.647261,89.997511,98.699385,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659830,2754544,3322204,2022-12-25,401515,*Old*,Age,,,R,2,57.644843,79.531999,94.185463,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0
659831,2754544,3322204,2023-01-01,401515,*Old*,Age,,,R,2,57.428034,78.403516,93.442238,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0
659832,2754544,3322204,2023-01-08,401515,*Old*,Age,,,R,2,57.002017,78.105013,93.442238,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0
659833,2754544,3322204,2023-01-15,401515,*Old*,Age,,,R,2,57.002017,78.907983,93.787264,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0


In [92]:
pd.unique(df_out_final['week_dt'])

array([datetime.date(2023, 1, 22), datetime.date(2023, 1, 29),
       datetime.date(2023, 2, 5), datetime.date(2022, 12, 18),
       datetime.date(2022, 12, 25), datetime.date(2023, 1, 1),
       datetime.date(2023, 1, 8), datetime.date(2023, 1, 15)],
      dtype=object)

#### wobbles report

In [81]:
df_instances = df_out_final[(df_out_final['breakout_id'] > 1) & (df_out_final['wobble_flag'] == 1)][['station_id', 'mediabase_id']].drop_duplicates()

In [82]:
df_instances

Unnamed: 0,station_id,mediabase_id
10030,3322204,1180221
16637,3322002,1203364
20934,3322022,1234685
39573,3322821,1340502
40067,3322821,1340651
...,...,...
586419,3322025,2794968
586518,3322204,2794968
586646,3322227,2794968
586803,3322009,2742545


In [83]:
df_wobble = df_instances.join(df_out_final[(df_out_final['breakout_id'] > 0) & (~pd.isna(df_out_final['pop_co']))].set_index(['mediabase_id', 'station_id']), on = ['mediabase_id', 'station_id'], how='left')

In [84]:
df_wobble.shape

(16335, 17)

In [85]:
df_wobble = df_wobble.join(df_test.set_index(['mediabase_id', 'station_id', 'week_dt', 'breakout_id'])[['spins_non_on', 'market_spins']], on=['mediabase_id', 'station_id', 'week_dt', 'breakout_id'], how='left')

In [87]:
taa_query = '''
Select mediabase_id, station_id, week_dt, taa
from dbo.rr_scores_adds_from_prod as rsafp
where week_dt >= '2022-12-18'
and format='H1'
--and station_id = 3322022
'''

engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_taa_lookup = pd.read_sql(taa_query, con=conn)

In [89]:
df_wobble = df_wobble.join(df_taa_lookup.set_index(['mediabase_id', 'station_id', 'week_dt'])['taa'], on=['mediabase_id', 'station_id', 'week_dt'], how='left')

ValueError: columns overlap but no suffix specified: Index(['taa'], dtype='object')

In [90]:
df_wobble['score_date'] = df_wobble['week_dt'] + np.timedelta64(8, 'D')

In [91]:
df_wobble[['score_date', 'week_dt']].drop_duplicates()

Unnamed: 0,score_date,week_dt
10030,2023-02-27,2023-02-19
16637,2023-01-23,2023-01-15
16637,2023-02-20,2023-02-12
42824,2023-01-16,2023-01-08
48972,2023-02-13,2023-02-05
114577,2023-01-09,2023-01-01
114577,2023-01-30,2023-01-22
141338,2023-02-06,2023-01-29


In [92]:
df_wobble_out = df_wobble.pivot(index=['call_letters', 'song_artist', 'score_date', 'gcr', 'market_spins', 'spins_non_on', 'taa'], columns=['breakout_name'], values=['pop_co', 'wobble_flag'])

In [93]:
df_wobble_out

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,pop_co,pop_co,pop_co,pop_co,pop_co,pop_co,pop_co,pop_co,pop_co,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,breakout_name,*Core*,*Old*,*Young*,AA,F (18-24),Hispanic,Total,WAO,White,*Core*,*Old*,*Young*,AA,F (18-24),Hispanic,Total,WAO,White
call_letters,song_artist,score_date,gcr,market_spins,spins_non_on,taa,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
KBKS-FM,Anti-Hero (TAYLOR SWIFT),2023-01-09,C,206,101,7.9550,57.0,53.0,87.0,,87.0,,62.0,,62.0,0.0,0.0,0.0,,0.0,,0.0,,0.0
KBKS-FM,Anti-Hero (TAYLOR SWIFT),2023-01-23,C,194,73,7.4258,68.0,67.0,50.0,,50.0,,61.0,,63.0,0.0,0.0,1.0,,1.0,,0.0,,0.0
KBKS-FM,Anti-Hero (TAYLOR SWIFT),2023-02-06,C,103,40,7.4031,64.0,56.0,56.0,,56.0,,52.0,,57.0,0.0,0.0,0.0,,0.0,,0.0,,0.0
KBKS-FM,Anti-Hero (TAYLOR SWIFT),2023-02-20,C,62,31,7.1092,53.0,76.0,21.0,,21.0,,53.0,,64.0,1.0,0.0,1.0,,1.0,,0.0,,0.0
KBKS-FM,As It Was (HARRY STYLES),2023-01-09,C,144,32,8.2811,90.0,77.0,85.0,,85.0,,80.0,,81.0,0.0,0.0,0.0,,0.0,,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WZFT-FM,golden hour (JVKE),2023-02-27,R,65,43,6.8937,56.0,77.0,39.0,72.0,39.0,,54.0,51.0,46.0,1.0,0.0,1.0,0.0,1.0,,0.0,1.0,1.0
WZFT-FM,good 4 u (OLIVIA RODRIGO),2023-01-09,R,45,5,7.8567,97.0,119.0,70.0,,70.0,,89.0,85.0,87.0,1.0,1.0,0.0,,0.0,,0.0,0.0,0.0
WZFT-FM,good 4 u (OLIVIA RODRIGO),2023-01-30,R,19,10,8.0981,71.0,81.0,62.0,94.0,62.0,,69.0,69.0,70.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
WZFT-FM,good 4 u (OLIVIA RODRIGO),2023-02-13,R,13,3,7.8576,96.0,92.0,81.0,78.0,76.0,,86.0,86.0,90.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0


In [141]:
df_wobble_out.columns

MultiIndex([(     'pop_co',    '*Core*'),
            (     'pop_co',     '*Old*'),
            (     'pop_co',   '*Young*'),
            (     'pop_co', 'F (18-24)'),
            (     'pop_co',  'Hispanic'),
            (     'pop_co',     'Total'),
            (     'pop_co',       'WAO'),
            ('wobble_flag',    '*Core*'),
            ('wobble_flag',     '*Old*'),
            ('wobble_flag',   '*Young*'),
            ('wobble_flag', 'F (18-24)'),
            ('wobble_flag',  'Hispanic'),
            ('wobble_flag',     'Total'),
            ('wobble_flag',       'WAO')],
           names=[None, 'breakout_name'])

#### percentage gaps report

In [94]:
out_cols = ['call_letters', 'song_artist', 'demo_category', 'segment', 'taa_quintile', 'pop_co', 'pop_omt', 'wobble_flag', 'mean_pop_predicted']
id_cols = ['station_id', 'mediabase_id', 'week_dt']

In [95]:
df_to_report = df_out_final[id_cols + out_cols]

In [97]:
df_taa_lookup

Unnamed: 0,mediabase_id,station_id,week_dt,taa
0,1909327,3321797,2022-12-18,8.3118
1,1611338,3322816,2022-12-18,8.6139
2,1609179,4955311,2022-12-18,8.9641
3,1611338,3322002,2022-12-18,9.0092
4,1611338,3322006,2022-12-18,8.6473
...,...,...,...,...
177395,2854579,3322916,2023-02-05,5.6767
177396,2855293,3322798,2023-02-05,5.6415
177397,2052083,3323400,2023-02-05,8.6008
177398,2110204,3322808,2023-02-05,6.9744


In [98]:
df_taa_lookup.set_index(id_cols)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,taa
station_id,mediabase_id,week_dt,Unnamed: 3_level_1
3321797,1909327,2022-12-18,8.3118
3322816,1611338,2022-12-18,8.6139
4955311,1609179,2022-12-18,8.9641
3322002,1611338,2022-12-18,9.0092
3322006,1611338,2022-12-18,8.6473
...,...,...,...
3322916,2854579,2023-02-05,5.6767
3322798,2855293,2023-02-05,5.6415
3323400,2052083,2023-02-05,8.6008
3322808,2110204,2023-02-05,6.9744


In [96]:
df_to_report = df_to_report.join(df_taa_lookup.set_index(id_cols), on=id_cols)

In [97]:
df_to_report_fmt = df_out_final_fmt[out_cols + id_cols]

In [98]:
df_to_report.sort_values(by=['call_letters', 'song_artist', 'week_dt', 'demo_category', 'segment'], inplace=True)

In [99]:
# define percentage gap lambdas
fn_diff_Age = lambda x: (x['Young'] - x['Old'])/x['Old']
fn_diff_Gender  = lambda x: (x['Female_Other'] - x['Female_(18-24)'])/x['Female_(18-24)']
fn_diff_Race_AA = lambda x: (x['AA'] - x['White'])/(x['White'])
fn_diff_Race_Hispanic = lambda x: (x['Hispanic'] - x['White'])/(x['White'])
fn_diff_Non_Core = lambda x: (x['Non-Core'] - x['Core'])/(x['Core'])

In [100]:
ref_demo = {'Age': 'Old', 'Race': 'White', 'Core-Cume': 'Core', 'Gender': 'Female_(18-24)'}
df_to_report_pvt = df_to_report.pivot_table(index=['call_letters', 'song_artist', 'week_dt', 'taa'], columns=['segment'], values=[ 'mean_pop_predicted'])
df_to_report_pvt.columns = [i[1].replace(' ', '_') for i in df_to_report_pvt.columns]

In [101]:
df_to_report_pvt_fmt =  (df_to_report_fmt.groupby(['song_artist', 'segment', 'week_dt'])['mean_pop_predicted'].mean().reset_index()).pivot_table(index=['song_artist', 'week_dt'], columns=['segment'], values=[ 'mean_pop_predicted'])
df_to_report_pvt_fmt.columns = [i[1].replace(' ', '_') for i in df_to_report_pvt_fmt.columns]

In [102]:
df_perc_gaps = df_to_report_pvt.apply([fn_diff_Age, fn_diff_Gender, fn_diff_Non_Core, fn_diff_Race_Hispanic, fn_diff_Race_AA], axis=1).reset_index()
df_perc_gaps.columns = ['call_letters', 'song_artist', 'week_dt', 'taa', 'age_gap', 'gender_gap', 'core_gap', 'race_gap_Hispanic', 'race_gap_AA']

In [103]:
df_perc_gaps_fmt = df_to_report_pvt_fmt.apply([fn_diff_Age, fn_diff_Gender, fn_diff_Non_Core, fn_diff_Race_Hispanic, fn_diff_Race_AA], axis=1).reset_index()
df_perc_gaps_fmt.columns = ['song_artist', 'week_dt', 'fmt_age_gap', 'fmt_gender_gap', 'fmt_core_gap', 'fmt_race_gap_Hispanic', 'fmt_race_gap_AA']

In [104]:
df_perc_gaps = df_perc_gaps.join(df_perc_gaps_fmt.set_index(['song_artist', 'week_dt']), on=['song_artist', 'week_dt'], how='left')

In [105]:
df_perc_gaps['score_date'] = df_perc_gaps['week_dt'] + np.timedelta64(8, 'D')

In [89]:
df_perc_gaps.columns

Index(['call_letters', 'song_artist', 'week_dt', 'taa', 'age_gap',
       'gender_gap', 'core_gap', 'race_gap_Hispanic', 'race_gap_AA',
       'fmt_age_gap', 'fmt_gender_gap', 'fmt_core_gap',
       'fmt_race_gap_Hispanic', 'fmt_race_gap_AA', 'score_date'],
      dtype='object')

In [106]:
out_cols_gaps = ['call_letters', 'song_artist', 'score_date', 'age_gap','fmt_age_gap', 'gender_gap','fmt_gender_gap',
       'core_gap', 'fmt_core_gap','race_gap_Hispanic',
       'fmt_race_gap_Hispanic', 'race_gap_AA',
       'fmt_race_gap_AA', 'taa' ]

#### WOW data

In [111]:
df_perc_gaps.pivot_table(index=['song_artist', 'call_letters'], columns=['week_dt'], values=['age_gap'])

Unnamed: 0_level_0,Unnamed: 1_level_0,age_gap,age_gap,age_gap,age_gap,age_gap,age_gap,age_gap,age_gap
Unnamed: 0_level_1,week_dt,2022-12-18,2022-12-25,2023-01-01,2023-01-08,2023-01-15,2023-01-22,2023-01-29,2023-02-05
song_artist,call_letters,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
(You Drive Me) Crazy (BRITNEY SPEARS),KBKS-FM,-0.350009,-0.350009,-0.329761,-0.350009,-0.323371,-0.329761,-0.350426,-0.323894
(You Drive Me) Crazy (BRITNEY SPEARS),KHKS-FM,-0.297558,-0.298055,-0.300857,,,,,
(You Drive Me) Crazy (BRITNEY SPEARS),KKRZ-FM,-0.283002,-0.301122,-0.257236,,,,,
(You Drive Me) Crazy (BRITNEY SPEARS),KZHT-FM,-0.364955,-0.374464,-0.371970,-0.374464,-0.361526,-0.358929,,
(You Drive Me) Crazy (BRITNEY SPEARS),KZZP-FM,-0.315539,-0.325651,-0.328269,-0.325651,-0.311892,-0.310135,-0.317294,-0.317294
...,...,...,...,...,...,...,...,...,...
traitor (OLIVIA RODRIGO),WXXL-FM,0.152552,0.152552,0.147659,0.152552,0.152552,0.160212,0.160212,0.174941
uh oh (TATE MCRAE),WHYI-FM,,-0.201036,-0.200880,-0.201411,-0.189185,-0.193634,-0.193634,-0.190705
uh oh (TATE MCRAE),WKSC-FM,,,,-0.283404,-0.277952,-0.283598,-0.283598,-0.282821
uh oh (TATE MCRAE),WNCI-FM,,,,-0.068704,-0.049529,-0.109401,-0.109401,-0.105983


### Write Output to Excel

In [107]:
df_perc_gaps.to_pickle('df_perc_gaps_02262023.pkl')
df_wobble_out.to_pickle('df_wobble_out_02262023.pkl')

In [108]:
with pd.ExcelWriter('RR_by_Demographic_CHR_02272023.xlsx') as writer:
    df_perc_gaps[out_cols_gaps].to_excel(writer, sheet_name='Percentage Gaps')
    df_wobble_out.to_excel(writer, sheet_name='Wobbles')