In [2]:
# import packages
import pandas as pd
import pickle as pkl
import os
import numpy as np

In [3]:
from sqlalchemy import create_engine

def postgresql_engine(user, pwd, host, port, dbname):
    # Need pyycopg2-binary package
    sql_engine = create_engine('postgres://' + user + ':' + pwd + '@' + host + ':' + port + '/' + dbname, echo=False)
    return sql_engine

In [4]:
# DB username and password
import getpass

user = getpass.getpass()
pwd = getpass.getpass()

In [6]:
# misc db parameters
host= 'adds-postgres-dev.cfgztrijqgvp.us-east-1.rds.amazonaws.com'
dbname= 'musiclab'
port= '5432'

In [19]:
# get callout research for songs released in the past 2 years
format_code = 'U1'
data_query_train = f'''
Select *
from adds_temp.demo_rr_features_{format_code.lower()} as rdfh
where pop_all is not null
'''

In [17]:
filter_rules = '''and song_weeks_since_last_spins <=13
and ((song_last_test_co_weeks <=26)
or (song_last_test_omt_weeks <=104)
or format_code in ('u4','l2'))
and (((station_test_1_plus=0 and station_test_1_id=1) or station_test_1_plus>0) or format_code in ('u4','l2'))
and (format_code<>'h1' or station_id<>3323403)
and (format_code<>'c1' or station_id<>3322825)
and (format_code<>'a2' or station_id<>3322799 or gcr<>'G')
and taa_quintile is not null
and gcr_adj is not null
'''

In [20]:
print(data_query_train + filter_rules)


Select *
from adds_temp.demo_rr_features_u1 as rdfh
where pop_all is not null
and song_weeks_since_last_spins <=13
and ((song_last_test_co_weeks <=26)
or (song_last_test_omt_weeks <=104)
or format_code in ('u4','l2'))
and (((station_test_1_plus=0 and station_test_1_id=1) or station_test_1_plus>0) or format_code in ('u4','l2'))
and (format_code<>'h1' or station_id<>3323403)
and (format_code<>'c1' or station_id<>3322825)
and (format_code<>'a2' or station_id<>3322799 or gcr<>'G')
and taa_quintile is not null
and gcr_adj is not null



In [22]:
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as con:
    with con.connect():
        df_train = pd.read_sql(data_query_train + filter_rules, con=con)

In [23]:
#write to pkl file
df_train.to_pickle('df_train_U1_2023_02_19_2021_02_26.pkl')

In [13]:
# read from pickle file
df_train = pd.read_pickle('df_train_U1_2023_02_19_2021_02_26.pkl')

In [24]:
df_train.sort_values(by=['station_id', 'mediabase_id', 'breakout_name', 'week_dt', ], inplace=True)

In [25]:
df_train.groupby(['breakout_name'])['mediabase_id'].count()

breakout_name
*Core*       19697
*Old*        20913
*Young*      20820
AA           11291
F (15-22)      965
F (15-24)     1424
F (16-19)      110
F (16-21)     3563
F (16-22)     3923
F (16-24)     6168
F (16-28)     1035
F (17-28)     1145
F (18-23)     2091
F (18-24)    11057
F (18-27)      992
F (18-29)      545
F (20-24)     1024
F (22-27)     3563
F (23-29)     3026
F (24-27)     2091
F (25-27)      179
F (25-29)     2185
F (25-32)     1287
F (25-34)    11991
F (27-37)       73
F (28-34)      992
F (30-34)       33
F (30-39)      545
F (30-40)       73
F (35-44)       73
F (Other)    11059
Hispanic     10570
Non-Core     19696
TOTAL (F)    20912
TOTAL (M)    20915
Total        20930
White          439
Name: mediabase_id, dtype: int64

In [26]:
# define demo segments and categories
breakout_category = {'*Core*': 'Core-Cume', '*Old*': 'Age', '*Young*': 'Age', 'Total': 'Total', 'White': 'Race',
                     'Non-Core': 'Core-Cume',
                     'Hispanic': 'Race', 'AA': 'Race', 'TOTAL (F)': 'Gender', 'TOTAL (M)': 'Gender', 'WAO': 'Race',
                     'F (25-29)': 'Gender', 'F (20-24)': 'Gender', 'F (18-29)': 'Gender', 'F (17-29)': 'Gender',
                     'F (20-23)': 'Gender', 'F (18-39)' : 'Gender',
                     'F (16-24)': 'Gender', 'F (30-34)': 'Gender', 'F (18-34)': 'Gender', 'F (24-29)': 'Gender',
                     'F (17-19)': 'Gender', 'F (15-26)': 'Gender', 'F (15-19)': 'Gender', 'F (15-24)': 'Gender',
                     'F (18-24)': 'Gender', 'F (20-29)': 'Gender', 'F (25-34)': 'Gender', 'F (Other)': 'Gender'}

breakout_map = {'*Core*': 'Core', '*Old*': 'Old', '*Young*': 'Young', 'Total': 'Total', 'White': 'White',
                'Non-Core': 'Non-Core',
                'Hispanic': 'Hispanic', 'AA': 'AA', 'TOTAL (F)': 'Total_Female', 'TOTAL (M)': 'Total_Male', 'WAO': 'White', 'F (25-29)': 'Female',
                'F (20-24)': 'Female', 'F (18-29)': 'Female', 'F (17-29)': 'Female', 'F (20-23)': 'Female',
                'F (16-24)': 'Female', 'F (30-34)': 'Female', 'F (18-34)': 'Female', 'F (24-29)': 'Female',
                'F (17-19)': 'Female', 'F (18-39)' : 'Female',
                'F (15-26)': 'Female', 'F (15-19)': 'Female', 'F (15-24)': 'Female',
                'F (18-24)': 'Female', 'F (20-29)': 'Female', 'F (25-34)': 'Female',
                'F (Other)': 'Female'}

In [27]:
# create segment and category fields
df_train['segment'] = df_train['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))
df_train['demo_category'] = df_train['breakout_name'].apply(func=(lambda x: breakout_category[x] if x in breakout_category.keys() else None))

In [32]:
df_train.groupby(['segment', 'demo_category'])['mediabase_id'].count()

segment       demo_category
AA            Race              9154
Core          Core-Cume        17302
Hispanic      Race              9181
Non-Core      Core-Cume        17302
Old           Age              18330
Total         Total            18347
Total_Female  Gender           18330
Total_Male    Gender           18331
White         Race               202
Young         Age              18230
Name: mediabase_id, dtype: int64

In [29]:
# drop misc female breakouts
drop_idx = df_train[(df_train['segment'] == 'Female') | (pd.isna(df_train['segment']))].index
df_train.drop(drop_idx, inplace=True)

In [31]:
# Drop songs with just a single score in the past 2 years
df_train_week_ct = pd.DataFrame(df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])['week_dt'].count())
drop_idx = df_train.join(df_train_week_ct[df_train_week_ct['week_dt'] == 1], on=['station_id', 'mediabase_id', 'breakout_id'], how='right', rsuffix='_r').index
df_train.drop(index=drop_idx, inplace=True)

In [33]:
df_train.groupby(['demo_category', 'segment', 'taa_quintile'])['mediabase_id'].count()

demo_category  segment       taa_quintile
Age            Old           1               2156
                             2               4238
                             3               4045
                             4               2994
                             5               4897
               Young         1               2142
                             2               4213
                             3               4017
                             4               2983
                             5               4875
Core-Cume      Core          1               2118
                             2               4107
                             3               3821
                             4               2819
                             5               4437
               Non-Core      1               2121
                             2               4106
                             3               3818
                             4               2819
        

### Investigate columns with missing data

In [11]:
len(df_train[pd.isna(df_train['mediabase_id'])])

0

In [12]:
len(df_train[df_train['gcr'] == df_train['gcr_adj']])/len(df_train)

1.0

In [17]:
len(df_train.columns)

108

In [7]:
df_train.shape

(614649, 108)

In [8]:
df_train.groupby(['station_test_1_plus', 'station_test_1_id'])['mediabase_id'].count()

station_test_1_plus  station_test_1_id
0                    1                     58773
1                    0                    497747
                     1                     58129
Name: mediabase_id, dtype: int64

##### isolate numeric and categorical columns

In [34]:
# constants
num_cols_like = ['artist_count', 'feat_artist', 'feat_artist_song', 'mscore', 'spins','pop_prior',
                 'pop_artist_prior', 'song_age_weeks', 'song_last_test']
cat_cols_like = ['Market_Name', 'taa_quintile', 'segment', 'gcr', 'gcr_adj', 'omt_co_flag']
target = ['pop_all']
id_cols = ['mediabase_id', 'station_id', 'week_dt', 'breakout_id', 'breakout_name', 'demo_category', 'pop_co', 'pop_omt', 'gcr']

exclude_cols_like = ['date','song_last_test_any_weeks','song_last_test_co_weeks', 'song_last_test_omt_weeks',
                     'std_pop_prior', 'std_pop_artist_prior']#,'_unv']#,'univ_spins', 'market_spins']

In [35]:
id_cols = id_cols
target_col = target
exclude_cols = df_train.columns[df_train.columns.str.contains('|'.join(exclude_cols_like), regex=True)]

cat_cols = list(set(df_train.columns[df_train.columns.str.contains('|'.join(cat_cols_like), regex=True)]) - set(
    id_cols) - set(exclude_cols))

num_cols = list(set(df_train.select_dtypes(exclude=['object', 'datetime64']).columns) & set(
df_train.columns[(df_train.columns.str.contains('|'.join(num_cols_like), regex=True))]) - set(id_cols) - set(cat_cols) - set(exclude_cols))

feature_cols = list(set(list(num_cols) + list(cat_cols)))

In [14]:
exclude_cols

Index(['song_release_date', 'song_last_test_any_weeks',
       'song_last_test_co_weeks', 'song_last_test_omt_weeks', 'std_pop_prior',
       'mr_pop_prior_date', 'mr_pop_prior_unv_date', 'std_pop_artist_prior'],
      dtype='object')

##### check missingness by different feature types

In [34]:
# check missingness by instance (station-song combination) for categorical variables
cols_avl_cat = df_train.groupby(['station_id', 'mediabase_id'])[cat_cols].agg(lambda x: 1 - sum(pd.isnull(x))/(1.0*len(x))).reset_index()

In [35]:
cols_avl_cat.describe()

Unnamed: 0,station_id,mediabase_id,taa_quintile,gcr_adj,omt_co_flag,Market_Name,segment
count,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0
mean,3344216.0,2450651.0,1.0,1.0,1.0,1.0,1.0
std,174230.3,394312.8,0.0,0.0,0.0,0.0,0.0
min,3321797.0,1085550.0,1.0,1.0,1.0,1.0,1.0
25%,3322025.0,2355676.0,1.0,1.0,1.0,1.0,1.0
50%,3322828.0,2583267.0,1.0,1.0,1.0,1.0,1.0
75%,3323410.0,2720314.0,1.0,1.0,1.0,1.0,1.0
max,4762077.0,2848773.0,1.0,1.0,1.0,1.0,1.0


In [24]:
df_train[(df_train['station_id'] == 3321797) & (df_train['mediabase_id'] == 1086587) & (pd.isna(df_train['segment']))]

Unnamed: 0,mediabase_id,station_id,week_dt,artist_id,format_code,FirstLast,SongTitle,Market_Name,song_release_date,breakout_id,...,mean_pop_artist_prior,std_pop_artist_prior,count_pop_artist_prior,max_pop_artist_prior_unv,min_pop_artist_prior_unv,mean_pop_artist_prior_unv,count_pop_artist_prior_unv,mr_pop_artist_prior_unv,segment,demo_category
2125,1086587,3321797,2022-07-31,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,,,,,,,,,,
2128,1086587,3321797,2022-08-21,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,64.0,,1.0,64.0,64.0,64.0,1.0,64.0,,
2132,1086587,3321797,2022-09-04,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.0,9.899495,2.0,78.0,64.0,68.666667,3.0,78.0,,
2133,1086587,3321797,2022-09-18,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.333333,7.023769,3.0,78.0,64.0,70.0,6.0,72.0,,
2136,1086587,3321797,2022-10-02,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,72.0,5.887841,4.0,78.0,64.0,70.8,10.0,74.0,,
2138,1086587,3321797,2022-10-16,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,70.6,5.98331,5.0,78.0,64.0,70.733333,15.0,65.0,,
2139,1086587,3321797,2022-10-30,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.5,5.787918,6.0,78.0,64.0,70.952381,21.0,76.0,,
2141,1086587,3321797,2022-11-27,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,72.714286,6.183696,7.0,80.0,64.0,71.392857,28.0,80.0,,
2087,1086587,3321797,2022-07-31,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428762,...,,,,,,,,,,
2090,1086587,3321797,2022-08-21,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428762,...,59.0,,1.0,59.0,59.0,59.0,1.0,59.0,,


In [72]:
# Investigate taa_quintile
# df_temp = df_train[pd.isna(df_train['taa_quintile'])][['station_id', 'mediabase_id', 'week_dt']]
# len(df_temp)
# df_temp.groupby(['week_dt']).agg({'mediabase_id':len}).sort_values(by=['mediabase_id'], ascending=False)
# df_temp.groupby(['mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False)
# df_temp.groupby(['station_id', 'mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False)
# idx = (df_train['station_id'] == 3323400) & (df_train['mediabase_id'] == 2591543)
# df_train.loc[idx].groupby(['week_dt'])['mediabase_id'].count()
# idx = (df_train['station_id'] == 3323400) & (df_train['mediabase_id'] == 2591543) & (pd.isna(df_train['taa_quintile'])) & (df_train['breakout_name'] == 'Total')
# df_train.loc[idx]

In [149]:
# Investigate gcr and gcr_adj
# df_temp = df_train[(pd.isna(df_train['gcr_adj'])) & (~pd.isna(df_train['gcr']))][['station_id', 'mediabase_id', 'week_dt']]
# len(df_temp)
# print(df_temp.groupby(['week_dt']).agg({'mediabase_id':len}).sort_values(by=['mediabase_id'], ascending=False))
# print(df_temp.groupby(['mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False))
# print(df_temp.groupby(['station_id', 'mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False))
# idx = (df_train['station_id'] == 3322002) & (df_train['mediabase_id'] == 2294907)
# df_train.loc[idx].groupby(['week_dt'])['mediabase_id'].count()
# idx = (df_train['station_id'] == 3323404) & (df_train['mediabase_id'] == 2629560) & (pd.isna(df_train['gcr_adj'])) & (df_train['breakout_name'] == 'Total')
# df_train.loc[idx]
# [np.min(df_temp['week_dt']), np.max(df_temp['week_dt'])]

[datetime.date(2020, 11, 22), datetime.date(2021, 1, 24)]

###### Investigate Numeric Columns

In [36]:
# Investigate Numeric Columns
num_cols_spins = [col for col in num_cols if 'spins' in col]
num_cols_pop = [col for col in num_cols if 'pop' in col]
num_cols_other = list(set(num_cols) - set(num_cols_spins) - set(num_cols_pop))
num_cols_spins_perc = [i for i in num_cols_spins if (('perc_diff_' in i) or ('per_diff' in i))]
num_cols_spins_nonperc = list(set(num_cols_spins) - set(num_cols_spins_perc))

In [17]:
[len(num_cols), len(num_cols_spins), len(num_cols_pop), len(num_cols_other)]

[75, 52, 19, 4]

In [18]:
num_cols_other

['artist_count', 'feat_artist', 'song_age_weeks', 'feat_artist_song']

In [19]:
num_cols_pop

['min_pop_prior',
 'mean_pop_prior_unv',
 'min_pop_artist_prior_unv',
 'count_pop_artist_prior_unv',
 'max_pop_artist_prior_unv',
 'mean_pop_artist_prior_unv',
 'mr_pop_prior_unv',
 'max_pop_prior',
 'mr_pop_prior',
 'count_pop_artist_prior',
 'max_pop_artist_prior',
 'min_pop_artist_prior',
 'mr_pop_artist_prior_unv',
 'mean_pop_artist_prior',
 'med_pop_prior',
 'min_pop_prior_unv',
 'count_pop_prior_unv',
 'mean_pop_prior',
 'max_pop_prior_unv']

In [20]:
df_train.shape

(377628, 110)

In [68]:
print(581054 + 33595)

614649


In [37]:
# Backfill pop based data
df_train[num_cols_pop] = df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_pop].bfill()

In [38]:
# Fill missing perc spin diffs with 1.0
df_train[num_cols_spins_perc] = df_train[num_cols_spins_perc].transform(lambda x: x.fillna(1.0))

In [39]:
# Backfill non perc diff spins diff
df_train[num_cols_spins_nonperc] = df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_spins_nonperc].bfill()

In [40]:
# Write processed train set to pickle file
df_train.to_pickle('df_train_U1_2023_02_19_2021_02_26_processed.pkl')

In [43]:
num_cols_spins_nonperc

['avg_market_spins_prior',
 'avg_station_artist_spins_prior',
 'mr_market_spins_spins',
 'mr_song_univ_spins_prior',
 'market_artist_spins',
 'song_univ_spins',
 'avg_artist_univ_spins_prior',
 'total_market_artist_spins_prior',
 'total_station_artist_spins_prior',
 'song_market_weeks_since_first_spins',
 'diff_spins_song_market_prior',
 'song_weeks_since_last_spins',
 'station_spins',
 'mr_artist_univ_spins',
 'diff_market_spins_spins_prior',
 'avg_song_univ_spins_prior',
 'artist_weeks_since_first_spins',
 'total_market_spins_prior',
 'diff_artist_univ_spins_prior',
 'diff_spins_song_station_prior',
 'mr_spins_artist_station_prior',
 'format_spins',
 'artist_station_weeks_since_first_spins',
 'artist_univ_spins',
 'song_weeks_since_first_spins',
 'diff_market_artist_spins_prior',
 'diff_spins_artist_station_prior',
 'spins_non_on',
 'total_spins_song_station_prior',
 'diff_song_univ_spins_prior',
 'total_song_univ_spins_prior',
 'avg_spins_song_station_prior',
 'spins_total',
 'avg_m

In [41]:
# check missingness by instance (station-song combination) for categorical variables
cols_avl_num = df_train.groupby(['station_id', 'mediabase_id', 'breakout_name'])[num_cols].agg(lambda x: 1 - sum(pd.isnull(x))/(1.0*len(x))).reset_index()

In [42]:
cols_avl_num = cols_avl_num.join(pd.DataFrame(df_train.groupby(['station_id', 'mediabase_id', 'breakout_name']).count()['week_dt']), on=['station_id', 'mediabase_id', 'breakout_name'], rsuffix='_r')

In [43]:
cols_avl_num[num_cols].mean().reset_index().sort_values(by=[0])

Unnamed: 0,index,0
0,mr_pop_prior_unv,1.0
53,artist_station_weeks_since_first_spins,1.0
52,avg_spins_song_station_prior,1.0
51,diff_market_artist_spins_prior,1.0
50,mr_market_artist_spins_prior,1.0
...,...,...
22,min_pop_prior_unv,1.0
21,song_univ_spins,1.0
20,mean_pop_prior,1.0
18,diff_market_spins_spins_prior,1.0


In [44]:
idx = df_train[cat_cols + num_cols].dropna(axis=1).index
df_train.loc[idx].shape

(144709, 110)

In [45]:
df_train.shape

(144709, 110)

In [93]:
cols_avl_num[(cols_avl_num['song_last_test_co_weeks'] < 1) & (cols_avl_num['breakout_name'] == 'Total')][['station_id', 'mediabase_id', 'breakout_name', 'week_dt']]

Unnamed: 0,station_id,mediabase_id,breakout_name,week_dt
2788,3321799,1388281,Total,2
2812,3321799,1640575,Total,2
7090,3322002,1243640,Total,2
7100,3322002,1249237,Total,2
7134,3322002,1261285,Total,3
...,...,...,...,...
56105,3323602,2397182,Total,2
56124,3323602,2422489,Total,3
56130,3323602,2422949,Total,2
56179,3323602,2445876,Total,3


In [95]:
df_train[(df_train['station_id'] == 3323602) & (df_train['mediabase_id'] == 2348056) & (df_train['breakout_name'] == 'Total')][['station_id', 'mediabase_id', 'breakout_name', 'week_dt', 'song_last_test_any_weeks','song_last_test_co_weeks', 'song_last_test_omt_weeks']]

Unnamed: 0,station_id,mediabase_id,breakout_name,week_dt,song_last_test_any_weeks,song_last_test_co_weeks,song_last_test_omt_weeks
64814,3323602,2348056,Total,2020-11-22,0.0,,0.0
64815,3323602,2348056,Total,2021-02-07,0.0,0.0,1.0
64816,3323602,2348056,Total,2021-09-26,0.0,8.0,0.0
64817,3323602,2348056,Total,2022-08-21,0.0,0.0,1.0


### Prep Data and create train/test splits

In [60]:
id_cols

['mediabase_id',
 'station_id',
 'week_dt',
 'breakout_id',
 'breakout_name',
 'demo_category',
 'pop_co',
 'pop_omt',
 'gcr']

In [46]:
[np.min(df_train['week_dt']),pd.to_datetime(np.min(df_train['week_dt'])) + np.timedelta64(2,'Y'), np.max(df_train['week_dt'])]

[datetime.date(2021, 2, 21),
 Timestamp('2023-02-21 11:38:24'),
 datetime.date(2023, 3, 5)]

In [56]:
# Extract train data
[np.min(df_train['week_dt']),pd.to_datetime(np.min(df_train['week_dt'])) + np.timedelta64(2,'Y'), np.max(df_train['week_dt'])]
scoring_date = pd.to_datetime('2023-01-09')
train_idx = df_train['week_dt'] < scoring_date.date()

df_train_final = df_train.loc[train_idx][id_cols + feature_cols + target_col]
X_train = pd.get_dummies(df_train_final[feature_cols], columns=cat_cols)
y_train = df_train_final[target]

In [65]:
demo_cats = list(set(breakout_category.values()) - set(['Total']))

In [66]:
demo_cats

['Core-Cume', 'Gender', 'Age', 'Race']

In [67]:
X_train.shape

(130985, 110)

In [63]:
[np.min(df_train_final['week_dt']), np.max(df_train_final['week_dt'])]

[datetime.date(2021, 2, 21), datetime.date(2023, 1, 1)]

In [68]:
# imports for model training
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.metrics import make_scorer, mean_pinball_loss

In [69]:
low_alpha = 0.05
high_alpha = 0.95

param_grid = dict(
    learning_rate=[.2, .1, .05],
    n_estimators=[5, 10, 15],
    max_depth=[2, 4, 6],
    min_samples_leaf=[5, 10, 20],
    min_samples_split=[5, 10, 20]
)

n_iter = 50
n_splits = 5

In [53]:
import time
demo_cols_all = [col for col in X_train.columns if 'segment_' in col]

In [70]:
demo_cols_all

['segment_AA',
 'segment_Core',
 'segment_Hispanic',
 'segment_Non-Core',
 'segment_Old',
 'segment_Total',
 'segment_Total_Female',
 'segment_Total_Male',
 'segment_White',
 'segment_Young']

In [71]:
best_scores = {}
best_estimators = {}

for cat in demo_cats:
    tic = time.perf_counter()
    idx = (df_train_final['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_train_final.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_train.columns) - set(demo_cols_excl))

    # create features and target
    X = X_train.loc[idx][feature_cols_cat]
    y = y_train.loc[idx]

    # quantile regressor

    # gradient boosted quantile regressor
    group_kfold = GroupKFold(n_splits=n_splits)

    # train model for upper threshold given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    neg_mean_pinball_loss_high = make_scorer(
        mean_pinball_loss,
        alpha=high_alpha,
        greater_is_better=False,  # maximize the negative loss
    )

    model_high_thresh = GradientBoostingRegressor(loss="quantile", alpha=high_alpha,
                                                  random_state=0)

    rs_high_thresh = RandomizedSearchCV(
        model_high_thresh,
        param_grid,
        n_iter=n_iter,
        scoring=neg_mean_pinball_loss_high,
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_high_thresh.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for upper wobble threshold completed")

    # train model for lower threshold given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    neg_mean_pinball_loss_low = make_scorer(
        mean_pinball_loss,
        alpha=low_alpha,
        greater_is_better=False,  # maximize the negative loss
    )

    model_low_thresh = GradientBoostingRegressor(loss="quantile", alpha=low_alpha,
                                                 random_state=0)

    rs_low_thresh = RandomizedSearchCV(
        model_low_thresh,
        param_grid,
        n_iter=n_iter,
        scoring=neg_mean_pinball_loss_low,
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_low_thresh.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for lower wobble threshold completed")

    # train model for mean pop score given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    model_mean = GradientBoostingRegressor(loss="squared_error")

    rs_mean = RandomizedSearchCV(
        model_mean,
        param_grid,
        n_iter=n_iter,
        scoring='neg_mean_absolute_error',
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_mean.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for mean pop completed")

    toc = time.perf_counter()
    time_elapsed = toc-tic
    print('Total time elapsed for ' + cat + ': ' + '%.2f'%time_elapsed)

    best_scores[cat] = [rs_low_thresh.best_score_, rs_mean.best_score_, rs_high_thresh.best_score_]
    best_estimators[cat] = [rs_low_thresh.best_estimator_, rs_mean.best_estimator_, rs_high_thresh.best_estimator_]

Core-Cume: 31378
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Core-Cume: Fitting for upper wobble threshold completed
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Core-Cume: Fitting for lower wobble threshold completed
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Core-Cume: Fitting for mean pop completed
Total time elapsed for Core-Cume: 182.81
Gender: 33253
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Gender: Fitting for upper wobble threshold completed
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Gender: Fitting for lower wobble threshold completed
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Gender: Fitting for mean pop completed
Total time elapsed for Gender: 193.66
Age: 33152
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Age: Fitting for upper wobble threshold completed
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Age: Fitting for lower wobble thre

In [72]:
import pickle
pickle.dump(best_scores, open('best_scores_all_u1.pkl', "wb"))
pickle.dump(best_estimators, open('best_estimators_all_u1.pkl', "wb"))

In [23]:
best_estimators = pd.read_pickle('best_estimators_all_u1.pkl')

In [73]:
best_estimators

{'Core-Cume': [GradientBoostingRegressor(alpha=0.05, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0),
  GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15),
  GradientBoostingRegressor(alpha=0.95, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0)],
 'Gender': [GradientBoostingRegressor(alpha=0.05, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0),
  GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=

### Prep scoring data & score


In [75]:
test_data_query = '''
Select *
from adds_temp.demo_rr_features_u1 as rdfh
where week_dt >= '2023-01-09'
'''

In [76]:
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.connect():
        df_test = pd.read_sql(test_data_query + filter_rules, conn)

In [82]:
df_test.shape

(272475, 110)

In [None]:
# df_test.to_pickle('df_score_2021_01_09_2020_11_28.pkl')

In [24]:
# df_test = pd.read_pickle('df_score_2021_01_09_2020_11_28.pkl')

In [80]:
# create segment and category fields
df_test['segment'] = df_test['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))
df_test['demo_category'] = df_test['breakout_name'].apply(func=(lambda x: breakout_category[x] if x in breakout_category.keys() else None))

In [81]:
# drop misc female breakouts
drop_idx = df_test[(df_test['segment'] == 'Female') | (pd.isna(df_test['segment']))].index
df_test.drop(drop_idx, inplace=True)

In [83]:
# Backfill pop based data
df_test[num_cols_pop] = df_test.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_pop].bfill()

In [84]:
# Fill missing perc spin diffs with 1.0
df_test[num_cols_spins_perc] = df_test[num_cols_spins_perc].transform(lambda x: x.fillna(1.0))

In [85]:
# Backfill non perc diff spins diff
df_test[num_cols_spins_nonperc] = df_test.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_spins_nonperc].bfill()

In [86]:
pd.unique(df_test['week_dt'])

array([datetime.date(2023, 1, 15), datetime.date(2023, 1, 22),
       datetime.date(2023, 1, 29), datetime.date(2023, 2, 5),
       datetime.date(2023, 2, 12), datetime.date(2023, 2, 19),
       datetime.date(2023, 2, 26), datetime.date(2023, 3, 5)],
      dtype=object)

In [32]:
drop_idx = df_test[pd.to_datetime(df_test['week_dt']) == pd.to_datetime('2022-12-18')].index
df_test.drop(drop_idx, inplace=True)

In [41]:
df_test[(df_test['breakout_name'] == 'Total')].groupby(['station_id', 'week_dt']).apply(lambda x: len(pd.unique(x['mediabase_id'])))

station_id  week_dt   
3321797     2022-12-18    240
            2022-12-25    240
            2023-01-01    234
            2023-01-08    232
            2023-01-15    226
                         ... 
4762077     2023-01-08    100
            2023-01-15     99
            2023-01-22     99
            2023-01-29     99
            2023-02-05     97
Length: 232, dtype: int64

In [33]:
df_test[(df_test['station_id'] == 3322022) & (df_test['breakout_name'] == 'Total')].groupby(['week_dt'])['mediabase_id'].count()

week_dt
2022-12-11    289
2022-12-18    290
2022-12-25    288
2023-01-01    290
2023-01-08    295
2023-01-15    292
2023-01-22    289
2023-01-29    282
Name: mediabase_id, dtype: int64

In [87]:
df_test[num_cols].apply(lambda x: 1 - pd.isnull(x).sum()/len(x)).sort_values()

mean_pop_artist_prior                 0.99967
mr_pop_prior                          0.99967
med_pop_prior                         0.99967
count_pop_artist_prior                0.99967
min_pop_prior                         0.99967
                                       ...   
per_diff_song_univ_spins_prior        1.00000
song_weeks_since_first_spins          1.00000
song_weeks_since_last_spins           1.00000
perc_diff_spins_song_station_prior    1.00000
song_age_weeks                        1.00000
Length: 75, dtype: float64

In [89]:
df_test[cat_cols].apply(lambda x: 1 - pd.isnull(x).sum()/len(x)).sort_values()

taa_quintile    1.0
gcr_adj         1.0
omt_co_flag     1.0
segment         1.0
Market_Name     1.0
dtype: float64

In [88]:
df_test['omt_co_flag'] = df_test['omt_co_flag'].fillna(0)

In [37]:
df_test[pd.isna(df_test['med_pop_prior'])]['mediabase_id'].drop_duplicates()

6362      1165871
14845     1186003
37952     1269144
62822     1436100
69145     1491192
72656     1502744
108046    1686075
144812    1801771
157912    1842682
182811    1932585
188820    1969359
208110    2062455
235840    2154641
240743    2175458
264691    2267512
275131    2294874
282374    2306701
364513    2522450
373446    2544637
387103    2552613
524681    2738262
569767    2781499
627530    2823991
631386    2827240
648129    2848768
653618    2863849
654272    2865765
654441    2838198
Name: mediabase_id, dtype: int64

In [101]:
df_test[(df_test['mediabase_id'] == 1354780) & (df_test['breakout_name'] == 'Total') & (df_test['station_id'] == 3322916)][id_cols + ['med_pop_prior']]

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,med_pop_prior
36963,1354780,3322916,2022-11-27,1,Total,Total,,,G,
36964,1354780,3322916,2022-12-04,1,Total,Total,,,G,
36965,1354780,3322916,2022-12-11,1,Total,Total,,,G,
36966,1354780,3322916,2022-12-18,1,Total,Total,,,G,
36967,1354780,3322916,2022-12-25,1,Total,Total,,,G,
36968,1354780,3322916,2023-01-01,1,Total,Total,,,G,
36969,1354780,3322916,2023-01-08,1,Total,Total,109.0,,G,


In [92]:
df_test.shape

(272385, 110)

In [91]:
drop_idx = list(set(df_test.index) - set(df_test[feature_cols].dropna().index))
df_test.drop(drop_idx, inplace=True)

#### score all stations

In [93]:
scoring_date = pd.to_datetime('2023-01-15')
test_idx = (df_test['week_dt'] >= scoring_date.date()) # & (df_test['station_id'] == 3322022)

df_test_final = df_test.loc[test_idx][id_cols + feature_cols + target_col]
X_test = pd.get_dummies(df_test_final[feature_cols], columns=cat_cols)

missing_cols = list(set(X_train.columns) - set(X_test.columns))

y_test = df_test_final[target]

In [94]:
df_test_final.shape

(272385, 90)

In [44]:
df_test_final[feature_cols].dropna()

Unnamed: 0,max_pop_prior_unv,avg_market_spins_prior,song_market_weeks_since_first_spins,total_spins_non_on_song_station_prior,song_weeks_since_last_spins,count_pop_artist_prior_unv,market_spins,per_diff_song_univ_spins_prior,mr_pop_prior_unv,max_pop_artist_prior_unv,...,diff_spins_artist_station_prior,format_artist_spins,min_pop_artist_prior_unv,spins_total,mean_pop_artist_prior_unv,diff_spins_song_station_prior,diff_spins_song_market_prior,song_age_weeks,station_artist_spins,per_diff_market_spins_spins_prior
0,112.0,70.226804,97.0,2334.0,1.0,17.0,53,0.107692,86.0,112.0,...,1,10,70.0,5,93.333333,1,1,1148.857143,5,0.152174
1,99.0,70.226804,97.0,2334.0,0.0,7.0,53,0.107692,90.0,99.0,...,1,10,89.0,5,94.714286,1,1,1148.857143,5,0.152174
2,99.0,70.051020,98.0,2338.0,1.0,7.0,45,-0.111111,90.0,99.0,...,0,11,89.0,5,94.714286,0,0,1149.857143,5,-0.150943
3,99.0,69.797980,99.0,2342.0,0.0,7.0,47,0.031250,90.0,99.0,...,-1,10,89.0,3,94.714286,-1,-1,1150.857143,3,0.044444
4,99.0,69.570000,100.0,2345.0,0.0,7.0,49,0.015152,90.0,99.0,...,1,8,89.0,5,94.714286,1,1,1151.857143,5,0.042553
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586873,110.0,85.382979,47.0,2320.0,0.0,1985.0,7,-0.205742,69.0,110.0,...,0,54,50.0,4,82.935013,0,0,47.714286,4,0.750000
586874,110.0,83.750000,48.0,2321.0,0.0,2016.0,9,0.246988,78.0,110.0,...,1,59,50.0,6,82.898810,1,1,48.714286,6,0.285714
586875,110.0,82.224490,49.0,2323.0,1.0,2119.0,8,-0.173913,72.0,110.0,...,0,43,50.0,6,83.019349,0,0,49.714286,6,-0.111111
586876,110.0,80.740000,50.0,2325.0,0.0,2162.0,7,0.111111,98.0,110.0,...,0,40,50.0,5,83.075856,0,0,50.714286,5,-0.125000


In [43]:
set(X_train.columns) - set(X_test.columns)

{'Market_Name_Atlanta',
 'Market_Name_Austin',
 'Market_Name_Baltimore',
 'Market_Name_Boston',
 'Market_Name_Charlotte',
 'Market_Name_Chicago',
 'Market_Name_Cincinnati',
 'Market_Name_Columbus, OH',
 'Market_Name_Dallas',
 'Market_Name_Denver',
 'Market_Name_Detroit',
 'Market_Name_Miami',
 'Market_Name_Minneapolis',
 'Market_Name_Nashville',
 'Market_Name_New York',
 'Market_Name_Orlando',
 'Market_Name_Philadelphia',
 'Market_Name_Phoenix',
 'Market_Name_Pittsburgh',
 'Market_Name_Portland, OR',
 'Market_Name_Raleigh',
 'Market_Name_Salt Lake City',
 'Market_Name_San Diego',
 'Market_Name_San Francisco',
 'Market_Name_Seattle',
 'Market_Name_St. Louis',
 'Market_Name_Tampa',
 'Market_Name_Washington, DC',
 'omt_co_flag_OMT_CO',
 'omt_co_flag_OMT_only',
 'segment_AA'}

In [95]:
for i in missing_cols:
    X_test[i] = 0

In [96]:
X_test = X_test[X_train.columns]

In [97]:
pd.unique(X_test['omt_co_flag_CO_only'])

array([0, 1], dtype=uint8)

In [98]:
df_out = pd.DataFrame(columns=['lower_wob_thresh', 'mean_pop_predicted', 'upper_wobble_thresh'])
for cat in demo_cats:
    #tic = time.perf_counter()
    idx = (df_test_final['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_test_final.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_test.columns) - set(demo_cols_excl))

    # create features and target
    X = X_test.loc[idx][feature_cols_cat]
    # if cat == 'Race':
    #     X['segment_AA'] = 0
    y = y_test.loc[idx]
    #print(X)

    # create empty dataframe
    df_temp = pd.DataFrame()
    # predict using estimator

    #re-arrange features

    df_temp['lower_wob_thresh'] = pd.DataFrame(best_estimators[cat][0].predict(X[best_estimators[cat][0].feature_names_in_]), index=X_test.loc[idx].index)
    df_temp['mean_pop_predicted'] = pd.DataFrame(best_estimators[cat][1].predict(X[best_estimators[cat][1].feature_names_in_]), index=X_test.loc[idx].index)
    df_temp['upper_wobble_thresh'] = pd.DataFrame(best_estimators[cat][2].predict(X[best_estimators[cat][2].feature_names_in_]), index=X_test.loc[idx].index)

    df_out = pd.concat([df_out,df_temp], axis=0)

Core-Cume: 62102
Gender: 66755
Age: 66757
Race: 43391


In [48]:
df_out

Unnamed: 0,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh
57,74.431257,99.431951,112.523041
58,76.144184,98.721912,111.670240
59,76.703320,98.721912,111.670240
92,71.819160,88.912817,104.660213
93,72.025835,88.801459,104.602583
...,...,...,...
586825,66.050168,80.291174,95.755306
586826,66.348059,81.029875,95.755306
586827,66.050168,80.489275,95.755306
586828,66.348059,80.894726,95.755306


#### score format alone without KIIS-FM

In [43]:
# scoring_date = pd.to_datetime('2023-01-01')
# test_idx_fmt = (df_test['week_dt'] >= scoring_date.date())
#
# df_test_final_fmt = df_test.loc[test_idx_fmt][id_cols + ['total_respondents'] + feature_cols + target_col]
# X_test_fmt = pd.get_dummies(df_test_final_fmt[feature_cols], columns=cat_cols)
#
# missing_cols_fmt = list(set(X_train.columns) - set(X_test_fmt.columns))
#
# y_test_fmt = df_test_final_fmt[target]

In [49]:
# missing_cols_fmt

['omt_co_flag_OMT_CO', 'omt_co_flag_OMT_only']

In [44]:
# for i in missing_cols_fmt:
#     X_test_fmt[i] = 0

In [45]:
# X_test_fmt = X_test_fmt[X_train.columns]

In [46]:
# df_out_fmt = pd.DataFrame(columns=['lower_wob_thresh', 'mean_pop_predicted', 'upper_wobble_thresh'])
# for cat in demo_cats:
#     #tic = time.perf_counter()
#     idx = (df_test_final_fmt['demo_category'] == cat)
#     print(cat + ': ' + str(sum(idx)))
#
#     # extract relevant segment indicator columns
#     demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_test_final_fmt.loc[idx]['segment']))]
#     demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
#     feature_cols_cat = list(set(X_test.columns) - set(demo_cols_excl))
#
#     # create features and target
#
#     X = X_test_fmt.loc[idx][feature_cols_cat]
#     X.dropna(inplace=True)
#     idx_new = X.index
#     print(cat + ': ' + str(len(idx_new)))
#     y = y_test_fmt.loc[idx_new]
#     #print(X)
#
#     # create empty dataframe
#     df_temp_fmt = pd.DataFrame()
#     # predict using estimator
#
#     #re-arrange features
#
#     df_temp_fmt['lower_wob_thresh'] = pd.DataFrame(best_estimators[cat][0].predict(X[best_estimators[cat][0].feature_names_in_]), index=X_test_fmt.loc[idx_new].index)
#     df_temp_fmt['mean_pop_predicted'] = pd.DataFrame(best_estimators[cat][1].predict(X[best_estimators[cat][1].feature_names_in_]), index=X_test_fmt.loc[idx_new].index)
#     df_temp_fmt['upper_wobble_thresh'] = pd.DataFrame(best_estimators[cat][2].predict(X[best_estimators[cat][2].feature_names_in_]), index=X_test_fmt.loc[idx_new].index)
#
#     df_out_fmt = pd.concat([df_out_fmt,df_temp_fmt], axis=0)

Race: 68156
Race: 68156
Core-Cume: 94746
Core-Cume: 94746
Age: 94788
Age: 94788
Gender: 77586
Gender: 77586


In [53]:
# df_out_fmt

Unnamed: 0,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh
20,71.647261,89.997511,98.699385
21,69.020207,90.223598,98.529371
22,71.647261,89.997511,98.699385
23,71.647261,89.997511,98.699385
24,71.647261,89.997511,98.699385
...,...,...,...
656721,59.299788,76.027729,91.449372
656722,59.299788,75.842813,91.449372
656723,60.783936,76.315299,91.449372
656724,60.962091,75.842813,91.449372


In [99]:
# added this line of code since entire format being scored - 02/14/2023
df_out_fmt = df_out

In [100]:
df_test_final_fmt = df_test_final

In [101]:
df_out.to_pickle('df_out_stage_03052023_03132021_U1.pkl')
df_out_fmt.to_pickle('df_out_fmt_stage_03052023_03132021_U1.pkl')

In [3]:
# df_out = pd.read_pickle('df_out_stage_02122023_02192021.pkl')
# df_out_fmt = pd.read_pickle('df_out_fmt_stage_02122023_02192021.pkl')

In [4]:
df_out

Unnamed: 0,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh
282,69.707349,87.357206,97.580157
283,69.707349,87.357206,97.580157
284,69.707349,86.938477,97.246663
285,68.326534,86.938477,97.246663
286,68.326534,86.938477,97.580157
...,...,...,...
663488,54.498126,67.784587,87.376244
663489,54.498126,68.866443,88.720244
663490,54.498126,69.158211,88.720244
663491,54.498126,68.866443,88.720244


### Process output & write to Excel

In [102]:
df_out_final = df_test_final[id_cols + ['taa_quintile']].join(df_out, how='left')

In [103]:
df_out_final['segment'] = df_out_final['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))

In [104]:
df_out_final_fmt = df_test_final_fmt[id_cols + ['taa_quintile']].join(df_out_fmt, how='left')

In [105]:
df_out_final_fmt['segment'] = df_out_final_fmt['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))

In [106]:
df_out_final

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,taa_quintile,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment
0,1085968,3323601,2023-01-15,-1,Non-Core,Core-Cume,,,G,2,81.061455,99.048349,109.955714,Non-Core
1,1085968,3323601,2023-01-15,1,Total,Total,,,G,2,,,,Total
2,1085968,3323601,2023-01-22,1,Total,Total,,,G,2,,,,Total
3,1085968,3323601,2023-01-29,1,Total,Total,,,G,2,,,,Total
4,1085968,3323601,2023-02-05,1,Total,Total,,,G,2,,,,Total
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355611,2760578,3322623,2023-02-05,401515,*Old*,Age,91.0,,R,3,75.683627,91.586075,107.128990,Old
355612,2760578,3322623,2023-02-12,401515,*Old*,Age,,,R,3,78.659469,96.436943,107.308827,Old
355613,2760578,3322623,2023-02-19,401515,*Old*,Age,85.0,,R,3,76.133035,91.586075,107.128990,Old
355614,2760578,3322623,2023-02-26,401515,*Old*,Age,97.0,,R,3,74.172907,90.194305,107.308827,Old


In [107]:
# song-artist lookup
song_query = '''
Select mediabase_id, song_name, artist_name
from data.songs_v as sv
'''
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_song_lookup = pd.read_sql(song_query, con=conn)

In [108]:
station_query = '''
Select distinct station_id, call_letters
from data.stations_v as sv
'''

engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_station_lookup = pd.read_sql(station_query, con=conn)

In [109]:
df_song_lookup.set_index(['mediabase_id'], inplace=True)

In [110]:
df_station_lookup.set_index(['station_id'], inplace=True)

In [111]:
df_song_lookup['song_artist']  = df_song_lookup['song_name'] + ' (' + df_song_lookup['artist_name'] + ')'

In [57]:
df_song_lookup

Unnamed: 0_level_0,song_name,artist_name,song_artist
mediabase_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1157400,Psychotron,MEGADETH,Psychotron (MEGADETH)
2885413,The Blob,ZANIES,The Blob (ZANIES)
2420830,Poison (Live At Olympia Paris),ALICE COOPER,Poison (Live At Olympia Paris) (ALICE COOPER)
2733608,Perspective f/Hollywood Jake,DANTE TWEAKS,Perspective f/Hollywood Jake (DANTE TWEAKS)
2844167,Sunflower,TAMINO,Sunflower (TAMINO)
...,...,...,...
2620719,Get On My Wave,ANDREW MCMAHON IN WILDERNESS,Get On My Wave (ANDREW MCMAHON IN WILDERNESS)
2667991,Exaggeration,JUAN HAZE,Exaggeration (JUAN HAZE)
2438195,Every Week,DQ4E,Every Week (DQ4E)
2623051,Your Story Is Over!,AYREON,Your Story Is Over! (AYREON)


In [112]:
df_out_final['song_artist'] = df_out_final.join(df_song_lookup, on=['mediabase_id'], how='left')['song_artist']

In [113]:
df_out_final['call_letters'] = df_out_final.join(df_station_lookup, on=['station_id'], how='left')['call_letters']

In [114]:
df_out_final['wobble_flag'] = df_out_final.apply(lambda x: int((x['pop_co'] < np.floor(x['lower_wob_thresh'])) | (x['pop_co'] > np.ceil(x['upper_wobble_thresh']))), axis=1)

In [115]:
df_out_final.to_pickle('df_out_final_Urban_2023_01_15_2023_03_05.pkl')

In [116]:
df_out_final_fmt['song_artist'] = df_out_final_fmt.join(df_song_lookup, on=['mediabase_id'], how='left')['song_artist']

In [117]:
df_out_final_fmt['call_letters'] = df_out_final_fmt.join(df_station_lookup, on=['station_id'], how='left')['call_letters']

In [118]:
df_out_final_fmt['wobble_flag'] = df_out_final_fmt.apply(lambda x: int((x['pop_co'] < np.floor(x['lower_wob_thresh'])) | (x['pop_co'] > np.ceil(x['upper_wobble_thresh']))), axis=1)

In [119]:
df_out_final_fmt.to_pickle('df_out_final_fmt_Urban_2023_01_15_2023_03_05.pkl')

In [5]:
# df_out_final = pd.read_pickle('df_out_final_CHR_2022_12_25_2023_02_12.pkl')

In [9]:
df_out_final[(df_out_final['call_letters'] == 'KBKS-FM') & (df_out_final['song_artist'] == 'abcdefu (GAYLE)')]

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,taa_quintile,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment,song_artist,call_letters,wobble_flag
551695,2760708,3321797,2022-12-25,-2,F (Other),Gender,,,R,2,60.344388,76.647103,93.399436,Female_Other,abcdefu (GAYLE),KBKS-FM,0
551696,2760708,3321797,2023-01-01,-2,F (Other),Gender,,,R,2,62.207474,77.797221,93.130636,Female_Other,abcdefu (GAYLE),KBKS-FM,0
551697,2760708,3321797,2023-01-08,-2,F (Other),Gender,,,R,2,61.867408,77.797221,93.130636,Female_Other,abcdefu (GAYLE),KBKS-FM,0
551698,2760708,3321797,2023-01-15,-2,F (Other),Gender,,,R,2,62.125284,78.695010,93.130636,Female_Other,abcdefu (GAYLE),KBKS-FM,0
551699,2760708,3321797,2023-01-22,-2,F (Other),Gender,,,R,2,60.344388,76.788446,93.399436,Female_Other,abcdefu (GAYLE),KBKS-FM,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551788,2760708,3321797,2023-01-15,412759,F (18-24),Gender,,,R,2,48.776339,66.943894,86.120871,Female_(18-24),abcdefu (GAYLE),KBKS-FM,0
551789,2760708,3321797,2023-01-22,412759,F (18-24),Gender,,,R,2,47.283107,66.943894,86.641831,Female_(18-24),abcdefu (GAYLE),KBKS-FM,0
551790,2760708,3321797,2023-01-29,412759,F (18-24),Gender,,,R,2,48.619369,66.943894,86.120871,Female_(18-24),abcdefu (GAYLE),KBKS-FM,0
551791,2760708,3321797,2023-02-05,412759,F (18-24),Gender,,,R,2,48.776339,66.489081,85.720871,Female_(18-24),abcdefu (GAYLE),KBKS-FM,0


### Read pre-written files

In [126]:
# df_out_final = pd.read_pickle('df_out_final_KIIS-FM_2022_12_04_2023_01_22.pkl')

In [125]:
# df_out_final_fmt = pd.read_pickle('df_out_final_H1_2022_12_04_2023_01_22.pkl')

In [113]:
df_out_final_fmt

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,taa_quintile,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment,song_artist,call_letters,wobble_flag
2,1085550,3322808,2022-12-04,412759,F (18-24),Gender,,,G,5,69.020207,89.997511,98.529371,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
3,1085550,3322808,2022-12-11,412759,F (18-24),Gender,,,G,5,71.647261,89.997511,98.699385,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
4,1085550,3322808,2022-12-18,412759,F (18-24),Gender,,,G,5,69.020207,90.223598,98.529371,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
5,1085550,3322808,2022-12-25,412759,F (18-24),Gender,,,G,5,71.647261,89.997511,98.699385,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
6,1085550,3322808,2023-01-01,412759,F (18-24),Gender,,,G,5,71.647261,89.997511,98.699385,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659830,2754544,3322204,2022-12-25,401515,*Old*,Age,,,R,2,57.644843,79.531999,94.185463,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0
659831,2754544,3322204,2023-01-01,401515,*Old*,Age,,,R,2,57.428034,78.403516,93.442238,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0
659832,2754544,3322204,2023-01-08,401515,*Old*,Age,,,R,2,57.002017,78.105013,93.442238,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0
659833,2754544,3322204,2023-01-15,401515,*Old*,Age,,,R,2,57.002017,78.907983,93.787264,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0


In [92]:
pd.unique(df_out_final['week_dt'])

array([datetime.date(2023, 1, 22), datetime.date(2023, 1, 29),
       datetime.date(2023, 2, 5), datetime.date(2022, 12, 18),
       datetime.date(2022, 12, 25), datetime.date(2023, 1, 1),
       datetime.date(2023, 1, 8), datetime.date(2023, 1, 15)],
      dtype=object)

#### wobbles report

In [120]:
df_instances = df_out_final[(df_out_final['breakout_id'] > 1) & (df_out_final['wobble_flag'] == 1)][['station_id', 'mediabase_id']].drop_duplicates()

In [121]:
df_instances

Unnamed: 0,station_id,mediabase_id
37915,3322220,1355512
123490,3322220,1913063
173527,3322220,2141346
181088,3322623,2177853
194183,3322623,2256903
...,...,...
354951,3323601,2855533
355177,3328064,2855533
355381,3322439,2760578
355546,3322611,2760578


In [122]:
df_wobble = df_instances.join(df_out_final[(df_out_final['breakout_id'] > 0) & (~pd.isna(df_out_final['pop_co']))].set_index(['mediabase_id', 'station_id']), on = ['mediabase_id', 'station_id'], how='left')

In [123]:
df_wobble.shape

(7489, 17)

In [124]:
df_wobble = df_wobble.join(df_test.set_index(['mediabase_id', 'station_id', 'week_dt', 'breakout_id'])[['spins_non_on', 'market_spins']], on=['mediabase_id', 'station_id', 'week_dt', 'breakout_id'], how='left')

In [125]:
taa_query = '''
Select mediabase_id, station_id, week_dt, taa
from dbo.rr_scores_adds_from_prod as rsafp
where week_dt >= '2023-01-01'
and format='U1'
--and station_id = 3322022
'''

engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_taa_lookup = pd.read_sql(taa_query, con=conn)

In [126]:
df_wobble = df_wobble.join(df_taa_lookup.set_index(['mediabase_id', 'station_id', 'week_dt'])['taa'], on=['mediabase_id', 'station_id', 'week_dt'], how='left')

In [127]:
df_wobble['score_date'] = df_wobble['week_dt'] + np.timedelta64(8, 'D')

In [91]:
df_wobble[['score_date', 'week_dt']].drop_duplicates()

Unnamed: 0,score_date,week_dt
10030,2023-02-27,2023-02-19
16637,2023-01-23,2023-01-15
16637,2023-02-20,2023-02-12
42824,2023-01-16,2023-01-08
48972,2023-02-13,2023-02-05
114577,2023-01-09,2023-01-01
114577,2023-01-30,2023-01-22
141338,2023-02-06,2023-01-29


In [128]:
df_wobble_out = df_wobble.pivot(index=['call_letters', 'song_artist', 'score_date', 'gcr', 'market_spins', 'spins_non_on', 'taa'], columns=['breakout_name'], values=['pop_co', 'wobble_flag'])

In [129]:
df_wobble_out

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,pop_co,pop_co,pop_co,pop_co,pop_co,pop_co,pop_co,pop_co,pop_co,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,breakout_name,*Core*,*Old*,*Young*,AA,Hispanic,TOTAL (F),TOTAL (M),Total,White,*Core*,*Old*,*Young*,AA,Hispanic,TOTAL (F),TOTAL (M),Total,White
call_letters,song_artist,score_date,gcr,market_spins,spins_non_on,taa,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
KQBT-FM,Back End (FINESSE2TYMES),2023-01-23,C,183,96,9.5813,104.0,89.0,90.0,92.0,82.0,89.0,90.0,90.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
KQBT-FM,Back End (FINESSE2TYMES),2023-02-06,C,96,32,9.5339,96.0,99.0,82.0,87.0,104.0,91.0,92.0,92.0,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,
KQBT-FM,Back End (FINESSE2TYMES),2023-02-20,C,102,35,9.6387,102.0,96.0,99.0,95.0,104.0,97.0,97.0,97.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
KQBT-FM,Back End (FINESSE2TYMES),2023-03-06,C,90,35,9.7161,103.0,103.0,93.0,99.0,98.0,96.0,101.0,98.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
KQBT-FM,Beautiful Lies f/Kehlani (BLEU),2023-01-23,R,18,12,8.5471,96.0,82.0,105.0,93.0,92.0,98.0,88.0,93.0,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WWPR-FM,What's My Name f/Queen Naija.. (FIVIO FOREIGN),2023-02-06,R,14,5,8.1972,94.0,85.0,85.0,91.0,77.0,88.0,82.0,85.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
WWPR-FM,What's My Name f/Queen Naija.. (FIVIO FOREIGN),2023-02-13,R,8,0,8.3570,80.0,84.0,81.0,80.0,84.0,83.0,80.0,82.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
WWPR-FM,What's My Name f/Queen Naija.. (FIVIO FOREIGN),2023-02-27,R,8,0,8.2721,85.0,93.0,82.0,87.0,81.0,91.0,75.0,85.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
WWPR-FM,What's My Name f/Queen Naija.. (FIVIO FOREIGN),2023-03-06,R,0,0,8.3293,94.0,87.0,89.0,95.0,80.0,83.0,98.0,89.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [141]:
df_wobble_out.columns

MultiIndex([(     'pop_co',    '*Core*'),
            (     'pop_co',     '*Old*'),
            (     'pop_co',   '*Young*'),
            (     'pop_co', 'F (18-24)'),
            (     'pop_co',  'Hispanic'),
            (     'pop_co',     'Total'),
            (     'pop_co',       'WAO'),
            ('wobble_flag',    '*Core*'),
            ('wobble_flag',     '*Old*'),
            ('wobble_flag',   '*Young*'),
            ('wobble_flag', 'F (18-24)'),
            ('wobble_flag',  'Hispanic'),
            ('wobble_flag',     'Total'),
            ('wobble_flag',       'WAO')],
           names=[None, 'breakout_name'])

#### percentage gaps report

In [130]:
out_cols = ['call_letters', 'song_artist', 'demo_category', 'segment', 'taa_quintile', 'pop_co', 'pop_omt', 'wobble_flag', 'mean_pop_predicted']
id_cols = ['station_id', 'mediabase_id', 'week_dt']

In [131]:
df_to_report = df_out_final[id_cols + out_cols]

In [97]:
df_taa_lookup

Unnamed: 0,mediabase_id,station_id,week_dt,taa
0,1909327,3321797,2022-12-18,8.3118
1,1611338,3322816,2022-12-18,8.6139
2,1609179,4955311,2022-12-18,8.9641
3,1611338,3322002,2022-12-18,9.0092
4,1611338,3322006,2022-12-18,8.6473
...,...,...,...,...
177395,2854579,3322916,2023-02-05,5.6767
177396,2855293,3322798,2023-02-05,5.6415
177397,2052083,3323400,2023-02-05,8.6008
177398,2110204,3322808,2023-02-05,6.9744


In [98]:
df_taa_lookup.set_index(id_cols)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,taa
station_id,mediabase_id,week_dt,Unnamed: 3_level_1
3321797,1909327,2022-12-18,8.3118
3322816,1611338,2022-12-18,8.6139
4955311,1609179,2022-12-18,8.9641
3322002,1611338,2022-12-18,9.0092
3322006,1611338,2022-12-18,8.6473
...,...,...,...
3322916,2854579,2023-02-05,5.6767
3322798,2855293,2023-02-05,5.6415
3323400,2052083,2023-02-05,8.6008
3322808,2110204,2023-02-05,6.9744


In [132]:
df_to_report = df_to_report.join(df_taa_lookup.set_index(id_cols), on=id_cols)

In [133]:
df_to_report_fmt = df_out_final_fmt[out_cols + id_cols]

In [135]:
df_to_report.sort_values(by=['call_letters', 'song_artist', 'week_dt', 'demo_category', 'segment'], inplace=True)

In [136]:
# define percentage gap lambdas
fn_diff_Age = lambda x: (x['Young'] - x['Old'])/x['Old']
fn_diff_Gender  = lambda x: (x['Total_Male'] - x['Total_Female'])/x['Total_Female']
fn_diff_Race_White = lambda x: (x['White'] - x['AA'])/(x['AA'])
fn_diff_Race_Hispanic = lambda x: (x['Hispanic'] - x['AA'])/(x['AA'])
fn_diff_Non_Core = lambda x: (x['Non-Core'] - x['Core'])/(x['Core'])

In [137]:
# ref_demo = {'Age': 'Old', 'Race': 'White', 'Core-Cume': 'Core', 'Gender': 'Female_(18-24)'}
df_to_report_pvt = df_to_report.pivot_table(index=['call_letters', 'song_artist', 'week_dt', 'taa'], columns=['segment'], values=[ 'mean_pop_predicted'])
df_to_report_pvt.columns = [i[1].replace(' ', '_') for i in df_to_report_pvt.columns]

In [138]:
df_to_report_pvt_fmt =  (df_to_report_fmt.groupby(['song_artist', 'segment', 'week_dt'])['mean_pop_predicted'].mean().reset_index()).pivot_table(index=['song_artist', 'week_dt'], columns=['segment'], values=[ 'mean_pop_predicted'])
df_to_report_pvt_fmt.columns = [i[1].replace(' ', '_') for i in df_to_report_pvt_fmt.columns]

In [139]:
df_perc_gaps = df_to_report_pvt.apply([fn_diff_Age, fn_diff_Gender, fn_diff_Non_Core, fn_diff_Race_Hispanic, fn_diff_Race_White], axis=1).reset_index()
df_perc_gaps.columns = ['call_letters', 'song_artist', 'week_dt', 'taa', 'age_gap', 'gender_gap', 'core_gap', 'race_gap_Hispanic', 'race_gap_White']

In [140]:
df_perc_gaps_fmt = df_to_report_pvt_fmt.apply([fn_diff_Age, fn_diff_Gender, fn_diff_Non_Core, fn_diff_Race_Hispanic, fn_diff_Race_White], axis=1).reset_index()
df_perc_gaps_fmt.columns = ['song_artist', 'week_dt', 'fmt_age_gap', 'fmt_gender_gap', 'fmt_core_gap', 'fmt_race_gap_Hispanic', 'fmt_race_gap_White']

In [141]:
df_perc_gaps = df_perc_gaps.join(df_perc_gaps_fmt.set_index(['song_artist', 'week_dt']), on=['song_artist', 'week_dt'], how='left')

In [142]:
df_perc_gaps['score_date'] = df_perc_gaps['week_dt'] + np.timedelta64(8, 'D')

In [144]:
df_perc_gaps.columns

Index(['call_letters', 'song_artist', 'week_dt', 'taa', 'age_gap',
       'gender_gap', 'core_gap', 'race_gap_Hispanic', 'race_gap_White',
       'fmt_age_gap', 'fmt_gender_gap', 'fmt_core_gap',
       'fmt_race_gap_Hispanic', 'fmt_race_gap_White', 'score_date'],
      dtype='object')

In [143]:
out_cols_gaps = ['call_letters', 'song_artist', 'score_date', 'age_gap','fmt_age_gap', 'gender_gap','fmt_gender_gap',
       'core_gap', 'fmt_core_gap','race_gap_Hispanic',
       'fmt_race_gap_Hispanic', 'race_gap_White',
       'fmt_race_gap_White', 'taa' ]

#### WOW data

In [111]:
df_perc_gaps.pivot_table(index=['song_artist', 'call_letters'], columns=['week_dt'], values=['age_gap'])

Unnamed: 0_level_0,Unnamed: 1_level_0,age_gap,age_gap,age_gap,age_gap,age_gap,age_gap,age_gap,age_gap
Unnamed: 0_level_1,week_dt,2022-12-18,2022-12-25,2023-01-01,2023-01-08,2023-01-15,2023-01-22,2023-01-29,2023-02-05
song_artist,call_letters,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
(You Drive Me) Crazy (BRITNEY SPEARS),KBKS-FM,-0.350009,-0.350009,-0.329761,-0.350009,-0.323371,-0.329761,-0.350426,-0.323894
(You Drive Me) Crazy (BRITNEY SPEARS),KHKS-FM,-0.297558,-0.298055,-0.300857,,,,,
(You Drive Me) Crazy (BRITNEY SPEARS),KKRZ-FM,-0.283002,-0.301122,-0.257236,,,,,
(You Drive Me) Crazy (BRITNEY SPEARS),KZHT-FM,-0.364955,-0.374464,-0.371970,-0.374464,-0.361526,-0.358929,,
(You Drive Me) Crazy (BRITNEY SPEARS),KZZP-FM,-0.315539,-0.325651,-0.328269,-0.325651,-0.311892,-0.310135,-0.317294,-0.317294
...,...,...,...,...,...,...,...,...,...
traitor (OLIVIA RODRIGO),WXXL-FM,0.152552,0.152552,0.147659,0.152552,0.152552,0.160212,0.160212,0.174941
uh oh (TATE MCRAE),WHYI-FM,,-0.201036,-0.200880,-0.201411,-0.189185,-0.193634,-0.193634,-0.190705
uh oh (TATE MCRAE),WKSC-FM,,,,-0.283404,-0.277952,-0.283598,-0.283598,-0.282821
uh oh (TATE MCRAE),WNCI-FM,,,,-0.068704,-0.049529,-0.109401,-0.109401,-0.105983


### Write Output to Excel

In [146]:
df_perc_gaps.to_pickle('df_perc_gaps_03122023_U1.pkl')
df_wobble_out.to_pickle('df_wobble_out_03122023_U1.pkl')

In [147]:
with pd.ExcelWriter('RR_by_Demographic_Urban_03132023.xlsx') as writer:
    df_perc_gaps[out_cols_gaps].to_excel(writer, sheet_name='Percentage Gaps')
    df_wobble_out.to_excel(writer, sheet_name='Wobbles')