In [4]:
# import packages
import pandas as pd
import pickle as pkl
import os
import numpy as np

In [5]:
from sqlalchemy import create_engine

def postgresql_engine(user, pwd, host, port, dbname):
    # Need pyycopg2-binary package
    sql_engine = create_engine('postgres://' + user + ':' + pwd + '@' + host + ':' + port + '/' + dbname, echo=False)
    return sql_engine

In [6]:
# DB username and password
import getpass

user = getpass.getpass()
pwd = getpass.getpass()

In [7]:
# misc db parameters
host= 'adds-postgres-dev.cfgztrijqgvp.us-east-1.rds.amazonaws.com'
dbname= 'musiclab'
port= '5432'

In [5]:
# get callout research for songs released in the past 2 years
data_query_train = '''
Select *
from adds_temp.demo_rr_features_h1 as rdfh
where pop_all is not null
'''

In [8]:
filter_rules = '''and song_weeks_since_last_spins <=13
and ((song_last_test_co_weeks <=26)
or (song_last_test_omt_weeks <=104)
or format_code in ('u4','l2'))
and (((station_test_1_plus=0 and station_test_1_id=1) or station_test_1_plus>0) or format_code in ('u4','l2'))
and (format_code<>'h1' or station_id<>3323403)
and (format_code<>'c1' or station_id<>3322825)
and (format_code<>'a2' or station_id<>3322799 or gcr<>'G')
and taa_quintile is not null
and gcr_adj is not null
'''

In [7]:
print(data_query_train + filter_rules)


Select *
from adds_temp.demo_rr_features_h1 as rdfh
where pop_all is not null
and song_weeks_since_last_spins <=13
and ((song_last_test_co_weeks <=26)
or (song_last_test_omt_weeks <=104)
or format_code in ('u4','l2'))
and (((station_test_1_plus=0 and station_test_1_id=1) or station_test_1_plus>0) or format_code in ('u4','l2'))
and (format_code<>'h1' or station_id<>3323403)
and (format_code<>'c1' or station_id<>3322825)
and (format_code<>'a2' or station_id<>3322799 or gcr<>'G')
and taa_quintile is not null
and gcr_adj is not null



In [8]:
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as con:
    with con.connect():
        df_train = pd.read_sql(data_query_train + filter_rules, con=con)

In [9]:
#write to pkl file
df_train.to_pickle('df_train_2022_11_05_2020_11_28.pkl')

In [12]:
# read from pickle file
df_train = pd.read_pickle('df_train_2022_11_05_2020_11_28.pkl')

In [13]:
df_train.sort_values(by=['station_id', 'mediabase_id', 'breakout_name', 'week_dt', ], inplace=True)

In [9]:
# define demo segments and categories
breakout_category = {'*Core*': 'Core-Cume', '*Old*': 'Age', '*Young*': 'Age', 'Total': 'Total', 'White': 'Race',
                     'Non-Core': 'Core-Cume',
                     'Hispanic': 'Race', 'AA': 'Race', 'F': 'Gender', 'M': 'Gender', 'WAO': 'Race',
                     'F (25-29)': 'Gender', 'F (20-24)': 'Gender', 'F (18-29)': 'Gender', 'F (17-29)': 'Gender',
                     'F (20-23)': 'Gender', 'F (18-39)' : 'Gender',
                     'F (16-24)': 'Gender', 'F (30-34)': 'Gender', 'F (18-34)': 'Gender', 'F (24-29)': 'Gender',
                     'F (17-19)': 'Gender', 'F (15-26)': 'Gender', 'F (15-19)': 'Gender', 'F (15-24)': 'Gender',
                     'F (18-24)': 'Gender', 'F (20-29)': 'Gender', 'F (25-34)': 'Gender', 'F (Other)': 'Gender'}

breakout_map = {'*Core*': 'Core', '*Old*': 'Old', '*Young*': 'Young', 'Total': 'Total', 'White': 'White',
                'Non-Core': 'Non-Core',
                'Hispanic': 'Hispanic', 'AA': 'AA', 'F': 'Female', 'M': 'Male', 'WAO': 'White', 'F (25-29)': 'Female',
                'F (20-24)': 'Female', 'F (18-29)': 'Female', 'F (17-29)': 'Female', 'F (20-23)': 'Female',
                'F (16-24)': 'Female', 'F (30-34)': 'Female', 'F (18-34)': 'Female', 'F (24-29)': 'Female',
                'F (17-19)': 'Female', 'F (18-39)' : 'Female',
                'F (15-26)': 'Female', 'F (15-19)': 'Female', 'F (15-24)': 'Female',
                'F (18-24)': 'Female_(18-24)', 'F (20-29)': 'Female', 'F (25-34)': 'Female',
                'F (Other)': 'Female_Other'}

In [14]:
# create segment and category fields
df_train['segment'] = df_train['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))
df_train['demo_category'] = df_train['breakout_name'].apply(func=(lambda x: breakout_category[x] if x in breakout_category.keys() else None))

In [15]:
# drop misc female breakouts
drop_idx = df_train[(df_train['segment'] == 'Female') | (pd.isna(df_train['segment']))].index
df_train.drop(drop_idx, inplace=True)

In [16]:
# Drop songs with just a single score in the past 2 years
df_train_week_ct = pd.DataFrame(df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])['week_dt'].count())
drop_idx = df_train.join(df_train_week_ct[df_train_week_ct['week_dt'] == 1], on=['station_id', 'mediabase_id', 'breakout_id'], how='right', rsuffix='_r').index
df_train.drop(index=drop_idx, inplace=True)

In [10]:
df_train.groupby(['demo_category', 'segment', 'taa_quintile'])['mediabase_id'].count()

demo_category  segment         taa_quintile
Age            Old             1                2252
                               2                6295
                               3                6168
                               4                8090
                               5               21146
               Young           1                2252
                               2                6295
                               3                6173
                               4                8088
                               5               21143
Core-Cume      Core            1                2251
                               2                6293
                               3                6169
                               4                8089
                               5               21150
               Non-Core        1                2242
                               2                6266
                               3                6147
  

### Investigate columns with missing data

In [11]:
len(df_train[pd.isna(df_train['mediabase_id'])])

0

In [12]:
len(df_train[df_train['gcr'] == df_train['gcr_adj']])/len(df_train)

1.0

In [17]:
len(df_train.columns)

108

In [7]:
df_train.shape

(614649, 108)

In [8]:
df_train.groupby(['station_test_1_plus', 'station_test_1_id'])['mediabase_id'].count()

station_test_1_plus  station_test_1_id
0                    1                     58773
1                    0                    497747
                     1                     58129
Name: mediabase_id, dtype: int64

##### isolate numeric and categorical columns

In [17]:
# constants
num_cols_like = ['artist_count', 'feat_artist', 'feat_artist_song', 'mscore', 'spins','pop_prior',
                 'pop_artist_prior', 'song_age_weeks', 'song_last_test']
cat_cols_like = ['Market_Name', 'taa_quintile', 'segment', 'gcr', 'gcr_adj', 'omt_co_flag']
target = ['pop_all']
id_cols = ['mediabase_id', 'station_id', 'week_dt', 'breakout_id', 'breakout_name', 'demo_category', 'pop_co', 'pop_omt', 'gcr']

exclude_cols_like = ['date','song_last_test_any_weeks','song_last_test_co_weeks', 'song_last_test_omt_weeks',
                     'std_pop_prior', 'std_pop_artist_prior']#,'_unv']#,'univ_spins', 'market_spins']

In [18]:
id_cols = id_cols
target_col = target
exclude_cols = df_train.columns[df_train.columns.str.contains('|'.join(exclude_cols_like), regex=True)]

cat_cols = list(set(df_train.columns[df_train.columns.str.contains('|'.join(cat_cols_like), regex=True)]) - set(
    id_cols) - set(exclude_cols))

num_cols = list(set(df_train.select_dtypes(exclude=['object', 'datetime64']).columns) & set(
df_train.columns[(df_train.columns.str.contains('|'.join(num_cols_like), regex=True))]) - set(id_cols) - set(cat_cols) - set(exclude_cols))

feature_cols = list(set(list(num_cols) + list(cat_cols)))

In [19]:
exclude_cols

Index(['song_release_date', 'song_last_test_any_weeks',
       'song_last_test_co_weeks', 'song_last_test_omt_weeks', 'std_pop_prior',
       'mr_pop_prior_date', 'mr_pop_prior_unv_date', 'std_pop_artist_prior'],
      dtype='object')

##### check missingness by different feature types

In [34]:
# check missingness by instance (station-song combination) for categorical variables
cols_avl_cat = df_train.groupby(['station_id', 'mediabase_id'])[cat_cols].agg(lambda x: 1 - sum(pd.isnull(x))/(1.0*len(x))).reset_index()

In [35]:
cols_avl_cat.describe()

Unnamed: 0,station_id,mediabase_id,taa_quintile,gcr_adj,omt_co_flag,Market_Name,segment
count,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0
mean,3344216.0,2450651.0,1.0,1.0,1.0,1.0,1.0
std,174230.3,394312.8,0.0,0.0,0.0,0.0,0.0
min,3321797.0,1085550.0,1.0,1.0,1.0,1.0,1.0
25%,3322025.0,2355676.0,1.0,1.0,1.0,1.0,1.0
50%,3322828.0,2583267.0,1.0,1.0,1.0,1.0,1.0
75%,3323410.0,2720314.0,1.0,1.0,1.0,1.0,1.0
max,4762077.0,2848773.0,1.0,1.0,1.0,1.0,1.0


In [24]:
df_train[(df_train['station_id'] == 3321797) & (df_train['mediabase_id'] == 1086587) & (pd.isna(df_train['segment']))]

Unnamed: 0,mediabase_id,station_id,week_dt,artist_id,format_code,FirstLast,SongTitle,Market_Name,song_release_date,breakout_id,...,mean_pop_artist_prior,std_pop_artist_prior,count_pop_artist_prior,max_pop_artist_prior_unv,min_pop_artist_prior_unv,mean_pop_artist_prior_unv,count_pop_artist_prior_unv,mr_pop_artist_prior_unv,segment,demo_category
2125,1086587,3321797,2022-07-31,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,,,,,,,,,,
2128,1086587,3321797,2022-08-21,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,64.0,,1.0,64.0,64.0,64.0,1.0,64.0,,
2132,1086587,3321797,2022-09-04,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.0,9.899495,2.0,78.0,64.0,68.666667,3.0,78.0,,
2133,1086587,3321797,2022-09-18,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.333333,7.023769,3.0,78.0,64.0,70.0,6.0,72.0,,
2136,1086587,3321797,2022-10-02,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,72.0,5.887841,4.0,78.0,64.0,70.8,10.0,74.0,,
2138,1086587,3321797,2022-10-16,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,70.6,5.98331,5.0,78.0,64.0,70.733333,15.0,65.0,,
2139,1086587,3321797,2022-10-30,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,71.5,5.787918,6.0,78.0,64.0,70.952381,21.0,76.0,,
2141,1086587,3321797,2022-11-27,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428784,...,72.714286,6.183696,7.0,80.0,64.0,71.392857,28.0,80.0,,
2087,1086587,3321797,2022-07-31,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428762,...,,,,,,,,,,
2090,1086587,3321797,2022-08-21,26365333,{format_code},KATE BUSH,Running Up That Hill (A Deal..,Seattle,2022-06-01,428762,...,59.0,,1.0,59.0,59.0,59.0,1.0,59.0,,


In [72]:
# Investigate taa_quintile
# df_temp = df_train[pd.isna(df_train['taa_quintile'])][['station_id', 'mediabase_id', 'week_dt']]
# len(df_temp)
# df_temp.groupby(['week_dt']).agg({'mediabase_id':len}).sort_values(by=['mediabase_id'], ascending=False)
# df_temp.groupby(['mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False)
# df_temp.groupby(['station_id', 'mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False)
# idx = (df_train['station_id'] == 3323400) & (df_train['mediabase_id'] == 2591543)
# df_train.loc[idx].groupby(['week_dt'])['mediabase_id'].count()
# idx = (df_train['station_id'] == 3323400) & (df_train['mediabase_id'] == 2591543) & (pd.isna(df_train['taa_quintile'])) & (df_train['breakout_name'] == 'Total')
# df_train.loc[idx]

In [149]:
# Investigate gcr and gcr_adj
# df_temp = df_train[(pd.isna(df_train['gcr_adj'])) & (~pd.isna(df_train['gcr']))][['station_id', 'mediabase_id', 'week_dt']]
# len(df_temp)
# print(df_temp.groupby(['week_dt']).agg({'mediabase_id':len}).sort_values(by=['mediabase_id'], ascending=False))
# print(df_temp.groupby(['mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False))
# print(df_temp.groupby(['station_id', 'mediabase_id']).agg({'week_dt':len}).sort_values(by=['week_dt'], ascending=False))
# idx = (df_train['station_id'] == 3322002) & (df_train['mediabase_id'] == 2294907)
# df_train.loc[idx].groupby(['week_dt'])['mediabase_id'].count()
# idx = (df_train['station_id'] == 3323404) & (df_train['mediabase_id'] == 2629560) & (pd.isna(df_train['gcr_adj'])) & (df_train['breakout_name'] == 'Total')
# df_train.loc[idx]
# [np.min(df_temp['week_dt']), np.max(df_temp['week_dt'])]

[datetime.date(2020, 11, 22), datetime.date(2021, 1, 24)]

###### Investigate Numeric Columns

In [20]:
# Investigate Numeric Columns
num_cols_spins = [col for col in num_cols if 'spins' in col]
num_cols_pop = [col for col in num_cols if 'pop' in col]
num_cols_other = list(set(num_cols) - set(num_cols_spins) - set(num_cols_pop))
num_cols_spins_perc = [i for i in num_cols_spins if (('perc_diff_' in i) or ('per_diff' in i))]
num_cols_spins_nonperc = list(set(num_cols_spins) - set(num_cols_spins_perc))

In [17]:
[len(num_cols), len(num_cols_spins), len(num_cols_pop), len(num_cols_other)]

[75, 52, 19, 4]

In [18]:
num_cols_other

['artist_count', 'feat_artist', 'song_age_weeks', 'feat_artist_song']

In [19]:
num_cols_pop

['min_pop_prior',
 'mean_pop_prior_unv',
 'min_pop_artist_prior_unv',
 'count_pop_artist_prior_unv',
 'max_pop_artist_prior_unv',
 'mean_pop_artist_prior_unv',
 'mr_pop_prior_unv',
 'max_pop_prior',
 'mr_pop_prior',
 'count_pop_artist_prior',
 'max_pop_artist_prior',
 'min_pop_artist_prior',
 'mr_pop_artist_prior_unv',
 'mean_pop_artist_prior',
 'med_pop_prior',
 'min_pop_prior_unv',
 'count_pop_prior_unv',
 'mean_pop_prior',
 'max_pop_prior_unv']

In [20]:
df_train.shape

(377628, 110)

In [68]:
print(581054 + 33595)

614649


In [21]:
# Backfill pop based data
df_train[num_cols_pop] = df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_pop].bfill()

In [22]:
# Fill missing perc spin diffs with 1.0
df_train[num_cols_spins_perc] = df_train[num_cols_spins_perc].transform(lambda x: x.fillna(1.0))

In [23]:
# Backfill non perc diff spins diff
df_train[num_cols_spins_nonperc] = df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_spins_nonperc].bfill()

In [43]:
num_cols_spins_nonperc

['avg_market_spins_prior',
 'avg_station_artist_spins_prior',
 'mr_market_spins_spins',
 'mr_song_univ_spins_prior',
 'market_artist_spins',
 'song_univ_spins',
 'avg_artist_univ_spins_prior',
 'total_market_artist_spins_prior',
 'total_station_artist_spins_prior',
 'song_market_weeks_since_first_spins',
 'diff_spins_song_market_prior',
 'song_weeks_since_last_spins',
 'station_spins',
 'mr_artist_univ_spins',
 'diff_market_spins_spins_prior',
 'avg_song_univ_spins_prior',
 'artist_weeks_since_first_spins',
 'total_market_spins_prior',
 'diff_artist_univ_spins_prior',
 'diff_spins_song_station_prior',
 'mr_spins_artist_station_prior',
 'format_spins',
 'artist_station_weeks_since_first_spins',
 'artist_univ_spins',
 'song_weeks_since_first_spins',
 'diff_market_artist_spins_prior',
 'diff_spins_artist_station_prior',
 'spins_non_on',
 'total_spins_song_station_prior',
 'diff_song_univ_spins_prior',
 'total_song_univ_spins_prior',
 'avg_spins_song_station_prior',
 'spins_total',
 'avg_m

In [24]:
# check missingness by instance (station-song combination) for categorical variables
cols_avl_num = df_train.groupby(['station_id', 'mediabase_id', 'breakout_name'])[num_cols].agg(lambda x: 1 - sum(pd.isnull(x))/(1.0*len(x))).reset_index()

In [25]:
cols_avl_num = cols_avl_num.join(pd.DataFrame(df_train.groupby(['station_id', 'mediabase_id', 'breakout_name']).count()['week_dt']), on=['station_id', 'mediabase_id', 'breakout_name'], rsuffix='_r')

In [26]:
cols_avl_num[num_cols].mean().reset_index().sort_values(by=[0])

Unnamed: 0,index,0
0,min_pop_prior,1.0
53,med_pop_prior,1.0
52,song_weeks_since_first_spins,1.0
51,station_spins,1.0
50,per_diff_market_spins_spins_prior,1.0
...,...,...
22,total_station_artist_spins_prior,1.0
21,max_pop_prior,1.0
20,artist_univ_spins,1.0
18,format_spins,1.0


In [48]:
idx = df_train[cat_cols + num_cols].dropna(axis=1).index
df_train.loc[idx].shape

(377628, 110)

In [49]:
df_train.shape

(377628, 110)

In [93]:
cols_avl_num[(cols_avl_num['song_last_test_co_weeks'] < 1) & (cols_avl_num['breakout_name'] == 'Total')][['station_id', 'mediabase_id', 'breakout_name', 'week_dt']]

Unnamed: 0,station_id,mediabase_id,breakout_name,week_dt
2788,3321799,1388281,Total,2
2812,3321799,1640575,Total,2
7090,3322002,1243640,Total,2
7100,3322002,1249237,Total,2
7134,3322002,1261285,Total,3
...,...,...,...,...
56105,3323602,2397182,Total,2
56124,3323602,2422489,Total,3
56130,3323602,2422949,Total,2
56179,3323602,2445876,Total,3


In [95]:
df_train[(df_train['station_id'] == 3323602) & (df_train['mediabase_id'] == 2348056) & (df_train['breakout_name'] == 'Total')][['station_id', 'mediabase_id', 'breakout_name', 'week_dt', 'song_last_test_any_weeks','song_last_test_co_weeks', 'song_last_test_omt_weeks']]

Unnamed: 0,station_id,mediabase_id,breakout_name,week_dt,song_last_test_any_weeks,song_last_test_co_weeks,song_last_test_omt_weeks
64814,3323602,2348056,Total,2020-11-22,0.0,,0.0
64815,3323602,2348056,Total,2021-02-07,0.0,0.0,1.0
64816,3323602,2348056,Total,2021-09-26,0.0,8.0,0.0
64817,3323602,2348056,Total,2022-08-21,0.0,0.0,1.0


### Prep Data and create train/test splits

In [60]:
id_cols

['mediabase_id',
 'station_id',
 'week_dt',
 'breakout_id',
 'breakout_name',
 'demo_category',
 'pop_co',
 'pop_omt',
 'gcr']

In [46]:
# Extract train data
[np.min(df_train['week_dt']),pd.to_datetime(np.min(df_train['week_dt'])) + np.timedelta64(2,'Y'), np.max(df_train['week_dt'])]
scoring_date = pd.to_datetime('2022-11-22')
train_idx = df_train['week_dt'] < scoring_date.date()

df_train_final = df_train.loc[train_idx][id_cols + feature_cols + target_col]
X_train = pd.get_dummies(df_train_final[feature_cols], columns=cat_cols)
y_train = df_train_final[target]

In [64]:
demo_cats = list(set(breakout_category.values()) - set(['Total']))

In [30]:
X_train.shape

(369153, 125)

In [32]:
# imports for model training
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.metrics import make_scorer, mean_pinball_loss

In [34]:
low_alpha = 0.05
high_alpha = 0.95

param_grid = dict(
    learning_rate=[.2, .1, .05],
    n_estimators=[5, 10, 15],
    max_depth=[2, 4, 6],
    min_samples_leaf=[5, 10, 20],
    min_samples_split=[5, 10, 20]
)

n_iter = 50
n_splits = 5

In [66]:
import time
demo_cols_all = [col for col in X_train.columns if 'segment_' in col]

In [34]:
best_scores = {}
best_estimators = {}

for cat in demo_cats:
    tic = time.perf_counter()
    idx = (df_train_final['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_train_final.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_train.columns) - set(demo_cols_excl))

    # create features and target
    X = X_train.loc[idx][feature_cols_cat]
    y = y_train.loc[idx]

    # quantile regressor

    # gradient boosted quantile regressor
    group_kfold = GroupKFold(n_splits=n_splits)

    # train model for upper threshold given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    neg_mean_pinball_loss_high = make_scorer(
        mean_pinball_loss,
        alpha=high_alpha,
        greater_is_better=False,  # maximize the negative loss
    )

    model_high_thresh = GradientBoostingRegressor(loss="quantile", alpha=high_alpha,
                                                  random_state=0)

    rs_high_thresh = RandomizedSearchCV(
        model_high_thresh,
        param_grid,
        n_iter=n_iter,
        scoring=neg_mean_pinball_loss_high,
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_high_thresh.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for upper wobble threshold completed")

    # train model for lower threshold given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    neg_mean_pinball_loss_low = make_scorer(
        mean_pinball_loss,
        alpha=low_alpha,
        greater_is_better=False,  # maximize the negative loss
    )

    model_low_thresh = GradientBoostingRegressor(loss="quantile", alpha=low_alpha,
                                                 random_state=0)

    rs_low_thresh = RandomizedSearchCV(
        model_low_thresh,
        param_grid,
        n_iter=n_iter,
        scoring=neg_mean_pinball_loss_low,
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_low_thresh.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for lower wobble threshold completed")

    # train model for mean pop score given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    model_mean = GradientBoostingRegressor(loss="squared_error")

    rs_mean = RandomizedSearchCV(
        model_mean,
        param_grid,
        n_iter=n_iter,
        scoring='neg_mean_absolute_error',
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_mean.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for mean pop completed")

    toc = time.perf_counter()
    time_elapsed = toc-tic
    print('Total time elapsed for ' + cat + ': ' + '%.2f'%time_elapsed)

    best_scores[cat] = [rs_low_thresh.best_score_, rs_mean.best_score_, rs_high_thresh.best_score_]
    best_estimators[cat] = [rs_low_thresh.best_estimator_, rs_mean.best_estimator_, rs_high_thresh.best_estimator_]

Core-Cume: 85772


NameError: name 'GroupKFold' is not defined

In [51]:
import pickle
pickle.dump(best_scores, open('best_scores_all.pkl', "wb"))
pickle.dump(best_estimators, open('best_estimators_all.pkl', "wb"))

In [21]:
best_estimators = pd.read_pickle('best_estimators_all.pkl')

In [23]:
best_estimators

{'Age': [GradientBoostingRegressor(alpha=0.05, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0),
  GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15),
  GradientBoostingRegressor(alpha=0.95, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0)],
 'Gender': [GradientBoostingRegressor(alpha=0.05, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0),
  GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=10,
  

### Prep scoring data & score


In [24]:
test_data_query = '''
Select *
from adds_temp.demo_rr_features_h1 as rdfh
where week_dt >= '2022-11-22'
'''

In [25]:
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.connect():
        df_test = pd.read_sql(test_data_query + filter_rules, conn)

In [27]:
df_test.shape

(659835, 108)

In [None]:
df_test.to_pickle('df_score_2021_01_09_2020_11_28.pkl')

In [24]:
df_test = pd.read_pickle('df_score_2021_01_09_2020_11_28.pkl')

In [28]:
# create segment and category fields
df_test['segment'] = df_test['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))
df_test['demo_category'] = df_test['breakout_name'].apply(func=(lambda x: breakout_category[x] if x in breakout_category.keys() else None))

In [29]:
# drop misc female breakouts
drop_idx = df_test[(df_test['segment'] == 'Female') | (pd.isna(df_test['segment']))].index
df_test.drop(drop_idx, inplace=True)

In [47]:
# Backfill pop based data
df_test[num_cols_pop] = df_test.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_pop].bfill()

In [48]:
# Fill missing perc spin diffs with 1.0
df_test[num_cols_spins_perc] = df_test[num_cols_spins_perc].transform(lambda x: x.fillna(1.0))

In [49]:
# Backfill non perc diff spins diff
df_test[num_cols_spins_nonperc] = df_test.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_spins_nonperc].bfill()

In [50]:
pd.unique(df_test['week_dt'])

array([datetime.date(2022, 12, 4), datetime.date(2022, 12, 11),
       datetime.date(2022, 12, 18), datetime.date(2022, 12, 25),
       datetime.date(2023, 1, 1), datetime.date(2023, 1, 8),
       datetime.date(2023, 1, 15), datetime.date(2023, 1, 22)],
      dtype=object)

In [39]:
drop_idx = df_test[pd.to_datetime(df_test['week_dt']) == pd.to_datetime('2022-11-27')].index
df_test.drop(drop_idx, inplace=True)

In [51]:
df_test[(df_test['station_id'] == 3322022) & (df_test['breakout_name'] == 'Total')].groupby(['week_dt']).apply(lambda x: len(pd.unique(x['mediabase_id'])))

week_dt
2022-12-04    291
2022-12-11    290
2022-12-18    290
2022-12-25    288
2023-01-01    290
2023-01-08    294
2023-01-15    292
2023-01-22    289
dtype: int64

In [41]:
df_test[(df_test['station_id'] == 3322022) & (df_test['breakout_name'] == 'Total')].groupby(['week_dt'])['mediabase_id'].count()

week_dt
2022-12-04    291
2022-12-11    290
2022-12-18    290
2022-12-25    288
2023-01-01    290
2023-01-08    294
2023-01-15    292
2023-01-22    289
Name: mediabase_id, dtype: int64

In [52]:
df_test[df_test['station_id'] == 3322022][num_cols].apply(lambda x: 1 - pd.isnull(x).sum()/len(x)).sort_values()

max_pop_artist_prior                 0.997058
mean_pop_artist_prior                0.997058
count_pop_artist_prior               0.997058
max_pop_prior                        0.997058
mr_pop_prior                         0.997058
                                       ...   
max_pop_artist_prior_unv             1.000000
perc_diff_spins_song_market_prior    1.000000
mr_spins_song_station_prior          1.000000
avg_artist_univ_spins_prior          1.000000
mr_pop_artist_prior_unv              1.000000
Length: 75, dtype: float64

In [73]:
df_test[df_test['station_id'] == 3322022][cat_cols].apply(lambda x: 1 - pd.isnull(x).sum()/len(x)).sort_values()

gcr_adj         1.0
Market_Name     1.0
omt_co_flag     1.0
segment         1.0
taa_quintile    1.0
dtype: float64

In [72]:
df_test['omt_co_flag'] = df_test['omt_co_flag'].fillna(0)

In [44]:
df_test[pd.isna(df_test['med_pop_prior'])]['mediabase_id'].drop_duplicates()

12614     1181047
14449     1186003
51243     1375592
138309    1767066
163169    1855587
174158    1904766
189505    1961471
208760    2050851
211069    2062455
236509    2154641
240792    2173957
261955    2256903
284770    2306701
291654    2339915
308668    2387396
344461    2457068
397581    2554446
409235    2582475
416020    2584541
428816    2605093
476648    2691407
487368    2697903
492701    2708592
501968    2716380
508605    2720314
511664    2720326
522782    2723707
530376    2738262
539381    2742545
542715    2744465
560968    2768404
581184    2789100
586920    2794968
593556    2798183
601067    2804653
611107    2808913
614711    2797282
628348    2813036
632362    2819942
633886    2823991
635132    2827240
638538    2831030
639893    2820569
642133    2831617
646485    2833906
648200    2838198
651249    2841905
651887    2848768
656077    2848773
656628    2850510
657773    2863849
Name: mediabase_id, dtype: int64

In [101]:
df_test[(df_test['mediabase_id'] == 1354780) & (df_test['breakout_name'] == 'Total') & (df_test['station_id'] == 3322916)][id_cols + ['med_pop_prior']]

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,med_pop_prior
36963,1354780,3322916,2022-11-27,1,Total,Total,,,G,
36964,1354780,3322916,2022-12-04,1,Total,Total,,,G,
36965,1354780,3322916,2022-12-11,1,Total,Total,,,G,
36966,1354780,3322916,2022-12-18,1,Total,Total,,,G,
36967,1354780,3322916,2022-12-25,1,Total,Total,,,G,
36968,1354780,3322916,2023-01-01,1,Total,Total,,,G,
36969,1354780,3322916,2023-01-08,1,Total,Total,109.0,,G,


In [54]:
df_test.shape

(385772, 110)

In [81]:
drop_idx = list(set(df_test.index) - set(df_test[feature_cols].dropna().index))
df_test.drop(drop_idx, inplace=True)

#### score KIIS-FM

In [82]:
scoring_date = pd.to_datetime('2022-12-04')
test_idx = (df_test['week_dt'] >= scoring_date.date()) & (df_test['station_id'] == 3322022)

df_test_final = df_test.loc[test_idx][id_cols + feature_cols + target_col]
X_test = pd.get_dummies(df_test_final[feature_cols], columns=cat_cols)

missing_cols = list(set(X_train.columns) - set(X_test.columns))

y_test = df_test_final[target]

In [75]:
df_test_final

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,mean_pop_prior,...,mr_market_spins_spins,market_spins,diff_spins_song_market_prior,max_pop_artist_prior_unv,mr_spins_song_station_prior,gcr_adj,diff_song_univ_spins_prior,min_pop_artist_prior_unv,format_spins,pop_all
2267,1088410,3322022,2022-12-18,400756,Hispanic,Race,,,G,99.000000,...,172,175,6,99.0,6,G,3.0,99.0,17,
2268,1088410,3322022,2022-12-25,400756,Hispanic,Race,,,G,99.000000,...,175,50,-11,99.0,12,G,-125.0,99.0,1,
2269,1088410,3322022,2023-01-01,400756,Hispanic,Race,,,G,99.000000,...,50,0,-1,99.0,1,G,-50.0,99.0,0,
2270,1088410,3322022,2023-01-08,400756,Hispanic,Race,,,G,99.000000,...,0,0,0,99.0,0,G,0.0,99.0,0,
2271,1088410,3322022,2023-01-15,400756,Hispanic,Race,,,G,99.000000,...,0,0,0,99.0,0,G,0.0,99.0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659614,2754544,3322022,2022-12-25,414582,WAO,Race,,,R,75.652174,...,71,57,2,96.0,2,R,433.0,54.0,210,
659615,2754544,3322022,2023-01-01,414582,WAO,Race,,,R,75.652174,...,57,67,-2,96.0,4,R,30.0,54.0,181,
659616,2754544,3322022,2023-01-08,414582,WAO,Race,,,R,75.652174,...,67,0,-2,96.0,2,R,-761.0,54.0,103,
659617,2754544,3322022,2023-01-15,414582,WAO,Race,,,R,75.652174,...,0,0,0,96.0,0,R,157.0,54.0,118,


In [76]:
df_test_final[feature_cols].dropna()

Unnamed: 0,mean_pop_prior,song_weeks_since_last_spins,total_spins_song_station_prior,omt_co_flag,per_diff_market_artist_spins_prior,mr_pop_prior_unv,mr_market_artist_spins_prior,min_pop_prior_unv,feat_artist_song,market_artist_spins,...,taa_quintile,mr_market_spins_spins,market_spins,diff_spins_song_market_prior,max_pop_artist_prior_unv,mr_spins_song_station_prior,gcr_adj,diff_song_univ_spins_prior,min_pop_artist_prior_unv,format_spins
2267,99.000000,1.0,806.0,0,0.017442,99.0,172,99.0,1,175,...,5,172,175,6,99.0,6,G,3.0,99.0,17
2268,99.000000,1.0,823.0,0,-0.714286,99.0,175,99.0,1,50,...,5,175,50,-11,99.0,12,G,-125.0,99.0,1
2269,99.000000,1.0,824.0,0,-1.000000,99.0,50,99.0,1,0,...,5,50,0,-1,99.0,1,G,-50.0,99.0,0
2270,99.000000,2.0,824.0,0,1.000000,99.0,0,99.0,1,0,...,5,0,0,0,99.0,0,G,0.0,99.0,0
2271,99.000000,3.0,824.0,0,1.000000,99.0,0,99.0,1,0,...,5,0,0,0,99.0,0,G,0.0,99.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659614,75.652174,1.0,1544.0,0,-0.197183,88.0,71,54.0,1,57,...,2,71,57,2,96.0,2,R,433.0,54.0,210
659615,75.652174,0.0,1548.0,0,0.175439,88.0,57,54.0,1,67,...,2,57,67,-2,96.0,4,R,30.0,54.0,181
659616,75.652174,1.0,1550.0,0,-1.000000,84.0,67,54.0,1,0,...,2,67,0,-2,96.0,2,R,-761.0,54.0,103
659617,75.652174,0.0,1550.0,0,1.000000,84.0,0,54.0,1,0,...,2,0,0,0,96.0,0,R,157.0,54.0,118


In [83]:
set(X_train.columns) - set(X_test.columns)

{'Market_Name_Atlanta',
 'Market_Name_Austin',
 'Market_Name_Baltimore',
 'Market_Name_Boston',
 'Market_Name_Charlotte',
 'Market_Name_Chicago',
 'Market_Name_Cincinnati',
 'Market_Name_Columbus, OH',
 'Market_Name_Dallas',
 'Market_Name_Denver',
 'Market_Name_Detroit',
 'Market_Name_Miami',
 'Market_Name_Minneapolis',
 'Market_Name_Nashville',
 'Market_Name_New York',
 'Market_Name_Orlando',
 'Market_Name_Philadelphia',
 'Market_Name_Phoenix',
 'Market_Name_Pittsburgh',
 'Market_Name_Portland, OR',
 'Market_Name_Raleigh',
 'Market_Name_Salt Lake City',
 'Market_Name_San Diego',
 'Market_Name_San Francisco',
 'Market_Name_Seattle',
 'Market_Name_St. Louis',
 'Market_Name_Tampa',
 'Market_Name_Washington, DC',
 'omt_co_flag_OMT_CO',
 'omt_co_flag_OMT_only',
 'segment_AA'}

In [84]:
for i in missing_cols:
    X_test[i] = 0

In [85]:
X_test = X_test[X_train.columns]

In [62]:
pd.unique(X_test['omt_co_flag_CO_only'])

array([0, 1], dtype=uint8)

In [86]:
df_out = pd.DataFrame(columns=['lower_wob_thresh', 'mean_pop_predicted', 'upper_wobble_thresh'])
for cat in demo_cats:
    #tic = time.perf_counter()
    idx = (df_test_final['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_test_final.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_test.columns) - set(demo_cols_excl))

    # create features and target
    X = X_test.loc[idx][feature_cols_cat]
    if cat == 'Race':
        X['segment_AA'] = 0
    y = y_test.loc[idx]
    #print(X)

    # create empty dataframe
    df_temp = pd.DataFrame()
    # predict using estimator

    #re-arrange features

    df_temp['lower_wob_thresh'] = pd.DataFrame(best_estimators[cat][0].predict(X[best_estimators[cat][0].feature_names_in_]), index=X_test.loc[idx].index)
    df_temp['mean_pop_predicted'] = pd.DataFrame(best_estimators[cat][1].predict(X[best_estimators[cat][1].feature_names_in_]), index=X_test.loc[idx].index)
    df_temp['upper_wobble_thresh'] = pd.DataFrame(best_estimators[cat][2].predict(X[best_estimators[cat][2].feature_names_in_]), index=X_test.loc[idx].index)

    df_out = pd.concat([df_out,df_temp], axis=0)

Core-Cume: 4638
Age: 4638
Race: 4598
Gender: 3122


In [45]:
df_out

Unnamed: 0,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh
1359,68.511838,82.960296,98.077451
1360,69.581628,82.973632,98.077451
1361,69.018166,83.449665,98.077451
1362,69.018166,83.449665,98.077451
1363,69.747890,83.639983,98.077451
...,...,...,...
518289,55.041558,73.456735,88.718066
519671,71.085823,79.251774,87.379395
519672,70.970772,79.251774,87.495749
519683,54.196172,57.071973,75.961581


#### score format alone without KIIS-FM

In [87]:
scoring_date = pd.to_datetime('2022-12-04')
test_idx_fmt = (df_test['week_dt'] >= scoring_date.date())

df_test_final_fmt = df_test.loc[test_idx_fmt][id_cols + ['total_respondents'] + feature_cols + target_col]
X_test_fmt = pd.get_dummies(df_test_final_fmt[feature_cols], columns=cat_cols)

missing_cols_fmt = list(set(X_train.columns) - set(X_test_fmt.columns))

y_test_fmt = df_test_final_fmt[target]

In [88]:
missing_cols_fmt

['omt_co_flag_OMT_only', 'omt_co_flag_OMT_CO']

In [89]:
for i in missing_cols_fmt:
    X_test_fmt[i] = 0

In [90]:
X_test_fmt = X_test_fmt[X_train.columns]

In [91]:
df_out_fmt = pd.DataFrame(columns=['lower_wob_thresh', 'mean_pop_predicted', 'upper_wobble_thresh'])
for cat in demo_cats:
    #tic = time.perf_counter()
    idx = (df_test_final_fmt['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_test_final_fmt.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_test.columns) - set(demo_cols_excl))

    # create features and target

    X = X_test_fmt.loc[idx][feature_cols_cat]
    X.dropna(inplace=True)
    idx_new = X.index
    print(cat + ': ' + str(len(idx_new)))
    y = y_test_fmt.loc[idx_new]
    #print(X)

    # create empty dataframe
    df_temp_fmt = pd.DataFrame()
    # predict using estimator

    #re-arrange features

    df_temp_fmt['lower_wob_thresh'] = pd.DataFrame(best_estimators[cat][0].predict(X[best_estimators[cat][0].feature_names_in_]), index=X_test_fmt.loc[idx_new].index)
    df_temp_fmt['mean_pop_predicted'] = pd.DataFrame(best_estimators[cat][1].predict(X[best_estimators[cat][1].feature_names_in_]), index=X_test_fmt.loc[idx_new].index)
    df_temp_fmt['upper_wobble_thresh'] = pd.DataFrame(best_estimators[cat][2].predict(X[best_estimators[cat][2].feature_names_in_]), index=X_test_fmt.loc[idx_new].index)

    df_out_fmt = pd.concat([df_out_fmt,df_temp_fmt], axis=0)

Core-Cume: 95115
Core-Cume: 95115
Age: 95144
Age: 95144
Race: 69062
Race: 69062
Gender: 76870
Gender: 76870


In [92]:
df_out_fmt

Unnamed: 0,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh
29,70.915061,92.427825,103.928914
30,72.044496,92.427825,103.928914
31,70.349429,92.427825,103.928914
32,72.044496,92.427825,103.928914
33,72.044496,92.427825,103.928914
...,...,...,...
659767,57.810650,76.479388,89.565672
659768,57.644389,76.416436,89.811995
659769,57.726578,76.128290,89.811995
659770,57.644389,76.416436,89.811995


In [93]:
df_out.to_pickle('df_out_stage_01222023_01292021.pkl')
df_out_fmt.to_pickle('df_out_fmt_stage_01222023_01292021.pkl')

### Process output & write to Excel

In [94]:
df_out_final = df_test_final[id_cols + ['taa_quintile']].join(df_out, how='left')

In [95]:
df_out_final['segment'] = df_out_final['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))

In [96]:
df_out_final_fmt = df_test_final_fmt[id_cols + ['taa_quintile']].join(df_out_fmt, how='left')

In [97]:
df_out_final_fmt['segment'] = df_out_final_fmt['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))

In [98]:
df_out_final

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,taa_quintile,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment
2267,1088410,3322022,2022-12-18,400756,Hispanic,Race,,,G,5,76.882964,98.076248,104.457429,Hispanic
2268,1088410,3322022,2022-12-25,400756,Hispanic,Race,,,G,5,73.364436,98.076248,104.123936,Hispanic
2269,1088410,3322022,2023-01-01,400756,Hispanic,Race,,,G,5,76.675495,99.530475,104.123936,Hispanic
2270,1088410,3322022,2023-01-08,400756,Hispanic,Race,,,G,5,74.410205,99.530475,104.457429,Hispanic
2271,1088410,3322022,2023-01-15,400756,Hispanic,Race,,,G,5,74.410205,99.530475,104.457429,Hispanic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659614,2754544,3322022,2022-12-25,414582,WAO,Race,,,R,2,59.185196,76.007226,92.769153,White
659615,2754544,3322022,2023-01-01,414582,WAO,Race,,,R,2,58.788332,75.987736,92.769153,White
659616,2754544,3322022,2023-01-08,414582,WAO,Race,,,R,2,58.607518,75.987736,92.769153,White
659617,2754544,3322022,2023-01-15,414582,WAO,Race,,,R,2,58.788332,76.007226,92.769153,White


In [99]:
# song-artist lookup
song_query = '''
Select mediabase_id, song_name, artist_name
from data.songs_v as sv
'''
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_song_lookup = pd.read_sql(song_query, con=conn)

In [100]:
station_query = '''
Select distinct station_id, call_letters
from data.stations_v as sv
'''

engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_station_lookup = pd.read_sql(station_query, con=conn)

In [101]:
df_song_lookup.set_index(['mediabase_id'], inplace=True)

In [102]:
df_station_lookup.set_index(['station_id'], inplace=True)

In [103]:
df_song_lookup['song_artist']  = df_song_lookup['song_name'] + ' (' + df_song_lookup['artist_name'] + ')'

In [104]:
df_song_lookup

Unnamed: 0_level_0,song_name,artist_name,song_artist
mediabase_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1929447,Burnin' It Down,STEVE EARLE,Burnin' It Down (STEVE EARLE)
1090491,Into The Mystic,VAN MORRISON,Into The Mystic (VAN MORRISON)
2318924,Young Black America,MEEK MILL,Young Black America (MEEK MILL)
2873870,Weightless,ARLO PARKS,Weightless (ARLO PARKS)
2131197,Hero Of The Day (Live),METALLICA,Hero Of The Day (Live) (METALLICA)
...,...,...,...
2620719,Get On My Wave,ANDREW MCMAHON IN WILDERNESS,Get On My Wave (ANDREW MCMAHON IN WILDERNESS)
2667991,Exaggeration,JUAN HAZE,Exaggeration (JUAN HAZE)
2438195,Every Week,DQ4E,Every Week (DQ4E)
2623051,Your Story Is Over!,AYREON,Your Story Is Over! (AYREON)


In [105]:
df_out_final['song_artist'] = df_out_final.join(df_song_lookup, on=['mediabase_id'], how='left')['song_artist']

In [106]:
df_out_final['call_letters'] = df_out_final.join(df_station_lookup, on=['station_id'], how='left')['call_letters']

In [107]:
df_out_final['wobble_flag'] = df_out_final.apply(lambda x: int((x['pop_co'] < np.floor(x['lower_wob_thresh'])) | (x['pop_co'] > np.ceil(x['upper_wobble_thresh']))), axis=1)

In [111]:
df_out_final.to_pickle('df_out_final_KIIS-FM_2022_12_04_2023_01_22.pkl')

In [108]:
df_out_final_fmt['song_artist'] = df_out_final_fmt.join(df_song_lookup, on=['mediabase_id'], how='left')['song_artist']

In [109]:
df_out_final_fmt['call_letters'] = df_out_final_fmt.join(df_station_lookup, on=['station_id'], how='left')['call_letters']

In [110]:
df_out_final_fmt['wobble_flag'] = df_out_final_fmt.apply(lambda x: int((x['pop_co'] < np.floor(x['lower_wob_thresh'])) | (x['pop_co'] > np.ceil(x['upper_wobble_thresh']))), axis=1)

In [112]:
df_out_final_fmt.to_pickle('df_out_final_H1_2022_12_04_2023_01_22.pkl')

### Read pre-written files

In [126]:
df_out_final = pd.read_pickle('df_out_final_KIIS-FM_2022_12_04_2023_01_22.pkl')

In [125]:
df_out_final_fmt = pd.read_pickle('df_out_final_H1_2022_12_04_2023_01_22.pkl')

In [113]:
df_out_final_fmt

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,taa_quintile,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment,song_artist,call_letters,wobble_flag
2,1085550,3322808,2022-12-04,412759,F (18-24),Gender,,,G,5,69.020207,89.997511,98.529371,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
3,1085550,3322808,2022-12-11,412759,F (18-24),Gender,,,G,5,71.647261,89.997511,98.699385,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
4,1085550,3322808,2022-12-18,412759,F (18-24),Gender,,,G,5,69.020207,90.223598,98.529371,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
5,1085550,3322808,2022-12-25,412759,F (18-24),Gender,,,G,5,71.647261,89.997511,98.699385,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
6,1085550,3322808,2023-01-01,412759,F (18-24),Gender,,,G,5,71.647261,89.997511,98.699385,Female_(18-24),In The End (LINKIN PARK),KZHT-FM,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659830,2754544,3322204,2022-12-25,401515,*Old*,Age,,,R,2,57.644843,79.531999,94.185463,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0
659831,2754544,3322204,2023-01-01,401515,*Old*,Age,,,R,2,57.428034,78.403516,93.442238,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0
659832,2754544,3322204,2023-01-08,401515,*Old*,Age,,,R,2,57.002017,78.105013,93.442238,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0
659833,2754544,3322204,2023-01-15,401515,*Old*,Age,,,R,2,57.002017,78.907983,93.787264,Old,Enemy From League Of Legends (IMAGINE DRAGONS),WHQC-FM,0


In [120]:
pd.unique(df_out_final['week_dt'])

array([datetime.date(2022, 12, 18), datetime.date(2022, 12, 25),
       datetime.date(2023, 1, 1), datetime.date(2023, 1, 8),
       datetime.date(2023, 1, 15), datetime.date(2023, 1, 22),
       datetime.date(2022, 12, 4), datetime.date(2022, 12, 11)],
      dtype=object)

#### wobbles report

In [114]:
df_instances = df_out_final[(df_out_final['breakout_id'] > 1) & (df_out_final['wobble_flag'] == 1)][['station_id', 'mediabase_id']].drop_duplicates()

In [115]:
df_instances

Unnamed: 0,station_id,mediabase_id
23666,3322022,1234685
222630,3322022,2073734
272809,3322022,2256903
418725,3322022,2582475
419919,3322022,2591406
423186,3322022,2597601
451907,3322022,2654543
487906,3322022,2703939
499536,3322022,2716380
505971,3322022,2720314


In [116]:
df_wobble = df_instances.join(df_out_final[(df_out_final['breakout_id'] > 0) & (~pd.isna(df_out_final['pop_co']))].set_index(['mediabase_id', 'station_id']), on = ['mediabase_id', 'station_id'], how='left')

In [129]:
len(df_wobble)

595

In [132]:
df_wobble = df_wobble.join(df_test.set_index(['mediabase_id', 'station_id', 'week_dt', 'breakout_id'])[['spins_non_on', 'market_spins']], on=['mediabase_id', 'station_id', 'week_dt', 'breakout_id'], how='left')

In [134]:
taa_query = '''
Select mediabase_id, station_id, week_dt, taa
from dbo.rr_scores_adds_from_prod as rsafp
where week_dt >= '2022-12-04'
and format='H1'
and station_id = 3322022
'''

engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_taa_lookup = pd.read_sql(taa_query, con=conn)

In [137]:
df_wobble = df_wobble.join(df_taa_lookup.set_index(['mediabase_id', 'station_id', 'week_dt'])['taa'], on=['mediabase_id', 'station_id', 'week_dt'], how='left')

In [145]:
df_wobble['score_date'] = df_wobble['week_dt'] + np.timedelta64(8, 'D')

In [146]:
df_wobble[['score_date', 'week_dt']].drop_duplicates()

Unnamed: 0,score_date,week_dt
23666,2023-01-23,2023-01-15
272809,2023-01-02,2022-12-25
272809,2023-01-09,2023-01-01
272809,2023-01-30,2023-01-22


In [147]:
df_wobble_out = df_wobble.pivot(index=['call_letters', 'song_artist', 'score_date', 'gcr', 'market_spins', 'spins_non_on', 'taa'], columns=['breakout_name'], values=['pop_co', 'wobble_flag'])

In [148]:
df_wobble_out

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,pop_co,pop_co,pop_co,pop_co,pop_co,pop_co,pop_co,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag,wobble_flag
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,breakout_name,*Core*,*Old*,*Young*,F (18-24),Hispanic,Total,WAO,*Core*,*Old*,*Young*,F (18-24),Hispanic,Total,WAO
call_letters,song_artist,score_date,gcr,market_spins,spins_non_on,taa,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
KIIS-FM,Anti-Hero (TAYLOR SWIFT),2023-01-02,C,213,83,7.9582,81.0,86.0,67.0,78.0,69.0,74.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KIIS-FM,Anti-Hero (TAYLOR SWIFT),2023-01-09,C,199,74,7.7473,69.0,94.0,56.0,56.0,74.0,68.0,55.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
KIIS-FM,Anti-Hero (TAYLOR SWIFT),2023-01-23,C,192,49,7.5557,70.0,88.0,55.0,57.0,66.0,66.0,69.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KIIS-FM,Anti-Hero (TAYLOR SWIFT),2023-01-30,C,161,29,7.5274,76.0,95.0,58.0,60.0,70.0,76.0,89.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
KIIS-FM,As It Was (HARRY STYLES),2023-01-02,C,203,81,9.2475,86.0,89.0,73.0,80.0,85.0,78.0,61.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KIIS-FM,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KIIS-FM,Woman (DOJA CAT),2023-01-02,R,58,40,8.9875,96.0,106.0,84.0,82.0,90.0,91.0,93.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KIIS-FM,Woman (DOJA CAT),2023-01-09,R,40,28,8.8104,79.0,88.0,74.0,71.0,90.0,78.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
KIIS-FM,Woman (DOJA CAT),2023-01-23,R,24,16,8.4411,76.0,103.0,73.0,74.0,88.0,83.0,67.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KIIS-FM,Woman (DOJA CAT),2023-01-30,R,31,23,8.4996,96.0,87.0,94.0,96.0,97.0,91.0,75.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:
df_wobble_out.columns

MultiIndex([(     'pop_co',    '*Core*'),
            (     'pop_co',     '*Old*'),
            (     'pop_co',   '*Young*'),
            (     'pop_co', 'F (18-24)'),
            (     'pop_co',  'Hispanic'),
            (     'pop_co',     'Total'),
            (     'pop_co',       'WAO'),
            ('wobble_flag',    '*Core*'),
            ('wobble_flag',     '*Old*'),
            ('wobble_flag',   '*Young*'),
            ('wobble_flag', 'F (18-24)'),
            ('wobble_flag',  'Hispanic'),
            ('wobble_flag',     'Total'),
            ('wobble_flag',       'WAO')],
           names=[None, 'breakout_name'])

#### percentage gaps report

In [149]:
out_cols = ['call_letters', 'song_artist', 'demo_category', 'segment','week_dt', 'taa_quintile', 'pop_co', 'pop_omt', 'wobble_flag', 'mean_pop_predicted']

In [150]:
df_to_report = df_out_final[out_cols]

In [151]:
df_to_report_fmt = df_out_final_fmt[out_cols]

In [152]:
df_to_report.sort_values(by=['call_letters', 'song_artist', 'week_dt', 'demo_category', 'segment'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [153]:
# define percentage gap lambdas
fn_diff_Age = lambda x: (x['Young'] - x['Old'])/x['Old']
fn_diff_Gender  = lambda x: (x['Female_Other'] - x['Female_(18-24)'])/x['Female_(18-24)']
fn_diff_Race_AA = lambda x: (x['AA'] - x['White'])/(x['White'])
fn_diff_Race_Hispanic = lambda x: (x['Hispanic'] - x['White'])/(x['White'])
fn_diff_Non_Core = lambda x: (x['Non-Core'] - x['Core'])/(x['Core'])

In [154]:
ref_demo = {'Age': 'Old', 'Race': 'White', 'Core-Cume': 'Core', 'Gender': 'Female_(18-24)'}
df_to_report_pvt = df_to_report.pivot_table(index=['call_letters', 'song_artist', 'week_dt'], columns=['segment'], values=[ 'mean_pop_predicted'])
df_to_report_pvt.columns = [i[1].replace(' ', '_') for i in df_to_report_pvt.columns]

In [155]:
df_to_report_pvt_fmt =  (df_to_report_fmt.groupby(['song_artist', 'segment', 'week_dt'])['mean_pop_predicted'].mean().reset_index()).pivot_table(index=['song_artist', 'week_dt'], columns=['segment'], values=[ 'mean_pop_predicted'])
df_to_report_pvt_fmt.columns = [i[1].replace(' ', '_') for i in df_to_report_pvt_fmt.columns]

In [156]:
df_perc_gaps = df_to_report_pvt.apply([fn_diff_Age, fn_diff_Gender, fn_diff_Non_Core, fn_diff_Race_Hispanic], axis=1).reset_index()
df_perc_gaps.columns = ['call_letters', 'song_artist', 'week_dt', 'age_gap', 'gender_gap', 'core_gap', 'race_gap']

In [157]:
df_perc_gaps_fmt = df_to_report_pvt_fmt.apply([fn_diff_Age, fn_diff_Gender, fn_diff_Non_Core, fn_diff_Race_Hispanic], axis=1).reset_index()
df_perc_gaps_fmt.columns = ['song_artist', 'week_dt', 'fmt_age_gap', 'fmt_gender_gap', 'fmt_core_gap', 'fmt_race_gap']

In [158]:
df_perc_gaps = df_perc_gaps.join(df_perc_gaps_fmt.set_index(['song_artist', 'week_dt']), on=['song_artist', 'week_dt'], how='left')

In [160]:
df_perc_gaps['score_date'] = df_perc_gaps['week_dt'] + np.timedelta64(8, 'D')

In [161]:
df_perc_gaps.columns

Index(['call_letters', 'song_artist', 'week_dt', 'age_gap', 'gender_gap',
       'core_gap', 'race_gap', 'fmt_age_gap', 'fmt_gender_gap', 'fmt_core_gap',
       'fmt_race_gap', 'score_date'],
      dtype='object')

In [164]:
out_cols_gaps = ['call_letters', 'song_artist', 'score_date', 'age_gap','fmt_age_gap', 'gender_gap','fmt_gender_gap',
       'core_gap', 'fmt_core_gap','race_gap',
       'fmt_race_gap' ]

#### WOW data

In [70]:
df_perc_gaps.pivot_table(index=['song_artist', 'call_letters'], columns=['week_dt'], values=['age_gap'])

Unnamed: 0_level_0,Unnamed: 1_level_0,age_gap,age_gap,age_gap,age_gap,age_gap,age_gap,age_gap
Unnamed: 0_level_1,week_dt,2022-11-27,2022-12-04,2022-12-11,2022-12-18,2022-12-25,2023-01-01,2023-01-08
song_artist,call_letters,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
24K Magic (BRUNO MARS),KIIS-FM,-0.012953,-0.023978,-0.020834,-0.030042,-0.020834,-0.020834,-0.020834
2step f/Lil Baby (ED SHEERAN),KIIS-FM,-0.222609,,,,,,
34+35 (ARIANA GRANDE),KIIS-FM,-0.046156,-0.046834,-0.046834,-0.046156,-0.046834,-0.055155,-0.018519
7 Rings (ARIANA GRANDE),KIIS-FM,0.004703,0.004703,0.004703,0.004703,0.004703,0.017950,0.004703
About Damn Time (LIZZO),KIIS-FM,-0.164564,-0.177611,-0.166288,-0.163729,-0.163729,-0.173263,-0.163729
...,...,...,...,...,...,...,...,...
ily (i love you baby) f/Emilee (SURF MESA),KIIS-FM,-0.110192,-0.110192,-0.102936,-0.110192,-0.106010,-0.107820,
pov (ARIANA GRANDE),KIIS-FM,-0.030285,-0.030991,-0.033136,-0.032433,-0.033136,-0.033136,-0.030991
she's all i wanna be (TATE MCRAE),KIIS-FM,-0.101500,-0.108290,-0.104688,-0.101480,-0.098199,-0.101847,-0.106149
"thank u, next (ARIANA GRANDE)",KIIS-FM,-0.036366,-0.036366,-0.036366,-0.036366,,,


### Write Output to Excel

In [162]:
df_perc_gaps.to_pickle('df_perc_gaps_01292023.pkl')
df_wobble_out.to_pickle('df_wobble_out_01292023.pkl')

In [165]:
with pd.ExcelWriter('RR_by_Demographic_KIIS-FM_01292023.xlsx') as writer:
    df_perc_gaps[out_cols_gaps].to_excel(writer, sheet_name='Percentage Gaps')
    df_wobble_out.to_excel(writer, sheet_name='Wobbles')