In [1]:
# import packages
import pandas as pd
import pickle as pkl
import os
import numpy as np

In [2]:
from sqlalchemy import create_engine

def postgresql_engine(user, pwd, host, port, dbname):
    # Need pyycopg2-binary package
    sql_engine = create_engine('postgres://' + user + ':' + pwd + '@' + host + ':' + port + '/' + dbname, echo=False)
    return sql_engine

In [3]:
# DB username and password
import getpass

user = getpass.getpass()
pwd = getpass.getpass()

In [4]:
# misc db parameters
host= 'adds-postgres-dev.cfgztrijqgvp.us-east-1.rds.amazonaws.com'
dbname= 'musiclab'
port= '5432'

In [5]:
# get callout research for songs released in the past 2 years
data_query_train = '''
Select *
from adds_temp.demo_rr_features_h1 as rdfh
where pop_all is not null
and breakout_name = 'Total'
'''

In [5]:
filter_rules = '''and song_weeks_since_last_spins <=13
and ((song_last_test_co_weeks <=26)
or (song_last_test_omt_weeks <=104)
or format_code in ('u4','l2'))
and (((station_test_1_plus=0 and station_test_1_id=1) or station_test_1_plus>0) or format_code in ('u4','l2'))
and (format_code<>'h1' or station_id<>3323403)
and (format_code<>'c1' or station_id<>3322825)
and (format_code<>'a2' or station_id<>3322799 or gcr<>'G')
and taa_quintile is not null
and gcr_adj is not null
'''

In [8]:
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as con:
    with con.connect():
        df_train = pd.read_sql(data_query_train + filter_rules, con=con)

In [7]:
df_train.sort_values(by=['station_id', 'mediabase_id', 'breakout_name', 'week_dt', ], inplace=True)

In [8]:
# define demo segments and categories
breakout_category = {'*Core*': 'Core-Cume', '*Old*': 'Age', '*Young*': 'Age', 'Total': 'Total', 'White': 'Race',
                     'Non-Core': 'Core-Cume',
                     'Hispanic': 'Race', 'AA': 'Race', 'F': 'Gender', 'M': 'Gender', 'WAO': 'Race',
                     'F (25-29)': 'Gender', 'F (20-24)': 'Gender', 'F (18-29)': 'Gender', 'F (17-29)': 'Gender',
                     'F (20-23)': 'Gender', 'F (18-39)' : 'Gender',
                     'F (16-24)': 'Gender', 'F (30-34)': 'Gender', 'F (18-34)': 'Gender', 'F (24-29)': 'Gender',
                     'F (17-19)': 'Gender', 'F (15-26)': 'Gender', 'F (15-19)': 'Gender', 'F (15-24)': 'Gender',
                     'F (18-24)': 'Gender', 'F (20-29)': 'Gender', 'F (25-34)': 'Gender', 'F (Other)': 'Gender'}

breakout_map = {'*Core*': 'Core', '*Old*': 'Old', '*Young*': 'Young', 'Total': 'Total', 'White': 'White',
                'Non-Core': 'Non-Core',
                'Hispanic': 'Hispanic', 'AA': 'AA', 'F': 'Female', 'M': 'Male', 'WAO': 'White', 'F (25-29)': 'Female',
                'F (20-24)': 'Female', 'F (18-29)': 'Female', 'F (17-29)': 'Female', 'F (20-23)': 'Female',
                'F (16-24)': 'Female', 'F (30-34)': 'Female', 'F (18-34)': 'Female', 'F (24-29)': 'Female',
                'F (17-19)': 'Female', 'F (18-39)' : 'Female',
                'F (15-26)': 'Female', 'F (15-19)': 'Female', 'F (15-24)': 'Female',
                'F (18-24)': 'Female_(18-24)', 'F (20-29)': 'Female', 'F (25-34)': 'Female',
                'F (Other)': 'Female_Other'}

In [9]:
# create segment and category fields
df_train['segment'] = df_train['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))
df_train['demo_category'] = df_train['breakout_name'].apply(func=(lambda x: breakout_category[x] if x in breakout_category.keys() else None))

In [10]:
# drop misc female breakouts
drop_idx = df_train[(df_train['segment'] == 'Female') | (pd.isna(df_train['segment']))].index
df_train.drop(drop_idx, inplace=True)

In [11]:
# Drop songs with just a single score in the past 2 years
df_train_week_ct = pd.DataFrame(df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])['week_dt'].count())
drop_idx = df_train.join(df_train_week_ct[df_train_week_ct['week_dt'] == 1], on=['station_id', 'mediabase_id', 'breakout_id'], how='right', rsuffix='_r').index
df_train.drop(index=drop_idx, inplace=True)

In [10]:
df_train.groupby(['demo_category', 'segment', 'taa_quintile'])['mediabase_id'].count()

demo_category  segment         taa_quintile
Age            Old             1                2252
                               2                6295
                               3                6168
                               4                8090
                               5               21146
               Young           1                2252
                               2                6295
                               3                6173
                               4                8088
                               5               21143
Core-Cume      Core            1                2251
                               2                6293
                               3                6169
                               4                8089
                               5               21150
               Non-Core        1                2242
                               2                6266
                               3                6147
  

#### isolate numeric and categorical columns

In [12]:
# constants
num_cols_like = ['artist_count', 'feat_artist', 'feat_artist_song', 'mscore', 'spins','pop_prior',
                 'pop_artist_prior', 'song_age_weeks', 'song_last_test']
cat_cols_like = ['Market_Name', 'taa_quintile', 'segment', 'gcr', 'gcr_adj', 'omt_co_flag']
target = ['pop_all']
id_cols = ['mediabase_id', 'station_id', 'week_dt', 'breakout_id', 'breakout_name', 'demo_category', 'pop_co', 'pop_omt', 'gcr']

exclude_cols_like = ['date','song_last_test_any_weeks','song_last_test_co_weeks', 'song_last_test_omt_weeks',
                     'std_pop_prior', 'std_pop_artist_prior']#,'_unv']#,'univ_spins', 'market_spins']

In [13]:
id_cols = id_cols
target_col = target
exclude_cols = df_train.columns[df_train.columns.str.contains('|'.join(exclude_cols_like), regex=True)]

cat_cols = list(set(df_train.columns[df_train.columns.str.contains('|'.join(cat_cols_like), regex=True)]) - set(
    id_cols) - set(exclude_cols))

num_cols = list(set(df_train.select_dtypes(exclude=['object', 'datetime64']).columns) & set(
df_train.columns[(df_train.columns.str.contains('|'.join(num_cols_like), regex=True))]) - set(id_cols) - set(cat_cols) - set(exclude_cols))

feature_cols = list(set(list(num_cols) + list(cat_cols)))

#### investigate & impute numeric columns

In [15]:
# Investigate Numeric Columns
num_cols_spins = [col for col in num_cols if 'spins' in col]
num_cols_pop = [col for col in num_cols if 'pop' in col]
num_cols_other = list(set(num_cols) - set(num_cols_spins) - set(num_cols_pop))
num_cols_spins_perc = [i for i in num_cols_spins if (('perc_diff_' in i) or ('per_diff' in i))]
num_cols_spins_nonperc = list(set(num_cols_spins) - set(num_cols_spins_perc))

In [21]:
# Backfill pop based data
df_train[num_cols_pop] = df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_pop].bfill()

In [22]:
# Fill missing perc spin diffs with 1.0
df_train[num_cols_spins_perc] = df_train[num_cols_spins_perc].transform(lambda x: x.fillna(1.0))

In [23]:
# Backfill non perc diff spins diff
df_train[num_cols_spins_nonperc] = df_train.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_spins_nonperc].bfill()

In [48]:
idx = df_train[cat_cols + num_cols].dropna(axis=1).index
df_train.loc[idx].shape

(377628, 110)

In [49]:
df_train.shape

(377628, 110)

### Prep Data and create train/test splits

In [16]:
# Extract train data
[np.min(df_train['week_dt']),pd.to_datetime(np.min(df_train['week_dt'])) + np.timedelta64(2,'Y'), np.max(df_train['week_dt'])]
scoring_date = pd.to_datetime('2022-11-22')
train_idx = df_train['week_dt'] < scoring_date.date()

df_train_final = df_train.loc[train_idx][id_cols + feature_cols + target_col]
X_train = pd.get_dummies(df_train_final[feature_cols], columns=cat_cols)
y_train = df_train_final[target]

In [17]:
demo_cats = list(set(breakout_category.values()) - set(['Total']))

In [30]:
X_train.shape

(369153, 125)

In [32]:
# imports for model training
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.metrics import make_scorer, mean_pinball_loss

In [34]:
low_alpha = 0.05
high_alpha = 0.95

param_grid = dict(
    learning_rate=[.2, .1, .05],
    n_estimators=[5, 10, 15],
    max_depth=[2, 4, 6],
    min_samples_leaf=[5, 10, 20],
    min_samples_split=[5, 10, 20]
)

n_iter = 50
n_splits = 5

In [18]:
import time
demo_cols_all = [col for col in X_train.columns if 'segment_' in col]

In [None]:
best_scores = {}
best_estimators = {}

for cat in demo_cats:
    tic = time.perf_counter()
    idx = (df_train_final['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_train_final.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_train.columns) - set(demo_cols_excl))

    # create features and target
    X = X_train.loc[idx][feature_cols_cat]
    y = y_train.loc[idx]

    # quantile regressor

    # gradient boosted quantile regressor
    group_kfold = GroupKFold(n_splits=n_splits)

    # train model for upper threshold given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    neg_mean_pinball_loss_high = make_scorer(
        mean_pinball_loss,
        alpha=high_alpha,
        greater_is_better=False,  # maximize the negative loss
    )

    model_high_thresh = GradientBoostingRegressor(loss="quantile", alpha=high_alpha,
                                                  random_state=0)

    rs_high_thresh = RandomizedSearchCV(
        model_high_thresh,
        param_grid,
        n_iter=n_iter,
        scoring=neg_mean_pinball_loss_high,
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_high_thresh.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for upper wobble threshold completed")

    # train model for lower threshold given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    neg_mean_pinball_loss_low = make_scorer(
        mean_pinball_loss,
        alpha=low_alpha,
        greater_is_better=False,  # maximize the negative loss
    )

    model_low_thresh = GradientBoostingRegressor(loss="quantile", alpha=low_alpha,
                                                 random_state=0)

    rs_low_thresh = RandomizedSearchCV(
        model_low_thresh,
        param_grid,
        n_iter=n_iter,
        scoring=neg_mean_pinball_loss_low,
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_low_thresh.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for lower wobble threshold completed")

    # train model for mean pop score given features
    cv = group_kfold.split(X.loc[idx], y.loc[idx], df_train_final.loc[idx]['mediabase_id'])
    model_mean = GradientBoostingRegressor(loss="squared_error")

    rs_mean = RandomizedSearchCV(
        model_mean,
        param_grid,
        n_iter=n_iter,
        scoring='neg_mean_absolute_error',
        cv=cv,
        verbose=1,
        random_state=0,
        n_jobs=-1
    )

    rs_mean.fit(X.loc[idx], np.ravel(y.loc[idx]))
    print(cat + ": Fitting for mean pop completed")

    toc = time.perf_counter()
    time_elapsed = toc-tic
    print('Total time elapsed for ' + cat + ': ' + '%.2f'%time_elapsed)

    best_scores[cat] = [rs_low_thresh.best_score_, rs_mean.best_score_, rs_high_thresh.best_score_]
    best_estimators[cat] = [rs_low_thresh.best_estimator_, rs_mean.best_estimator_, rs_high_thresh.best_estimator_]

In [51]:
import pickle
pickle.dump(best_scores, open('best_scores_all.pkl', "wb"))
pickle.dump(best_estimators, open('best_estimators_all.pkl', "wb"))

In [19]:
best_estimators = pd.read_pickle('best_estimators_all.pkl')

In [20]:
best_estimators

{'Age': [GradientBoostingRegressor(alpha=0.05, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0),
  GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15),
  GradientBoostingRegressor(alpha=0.95, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0)],
 'Gender': [GradientBoostingRegressor(alpha=0.05, learning_rate=0.2, loss='quantile',
                            max_depth=6, min_samples_leaf=10,
                            min_samples_split=20, n_estimators=15,
                            random_state=0),
  GradientBoostingRegressor(learning_rate=0.2, max_depth=6, min_samples_leaf=10,
  

### Prep scoring data & score


In [21]:
test_data_query = '''
Select *
from adds_temp.demo_rr_features_h1 as rdfh
where week_dt >= '2022-11-29'
and breakout_name = 'Total'
'''

In [22]:
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.connect():
        df_test = pd.read_sql(test_data_query + filter_rules, conn)

In [23]:
df_test.shape

(656836, 108)

In [24]:
# create segment and category fields
df_test['segment'] = df_test['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))
df_test['demo_category'] = df_test['breakout_name'].apply(func=(lambda x: breakout_category[x] if x in breakout_category.keys() else None))

In [25]:
# drop misc female breakouts
drop_idx = df_test[(df_test['segment'] == 'Female') | (pd.isna(df_test['segment']))].index
df_test.drop(drop_idx, inplace=True)

In [26]:
# Backfill pop based data
df_test[num_cols_pop] = df_test.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_pop].bfill()

In [27]:
# Fill missing perc spin diffs with 1.0
df_test[num_cols_spins_perc] = df_test[num_cols_spins_perc].transform(lambda x: x.fillna(1.0))

In [28]:
# Backfill non perc diff spins diff
df_test[num_cols_spins_nonperc] = df_test.groupby(['station_id', 'mediabase_id', 'breakout_id'])[num_cols_spins_nonperc].bfill()

In [36]:
df_test['omt_co_flag'] = df_test['omt_co_flag'].fillna(0)

In [38]:
# drop rows with missing features
drop_idx = list(set(df_test.index) - set(df_test[feature_cols].dropna().index))
df_test.drop(drop_idx, inplace=True)

#### score KIIS-FM

In [39]:
scoring_date = pd.to_datetime('2022-12-11')
test_idx = (df_test['week_dt'] >= scoring_date.date()) & (df_test['station_id'] == 3322022)

df_test_final = df_test.loc[test_idx][id_cols + feature_cols + target_col]
X_test = pd.get_dummies(df_test_final[feature_cols], columns=cat_cols)

missing_cols = list(set(X_train.columns) - set(X_test.columns))

y_test = df_test_final[target]

In [40]:
df_test_final

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,max_pop_artist_prior,...,song_market_weeks_since_first_spins,total_spins_song_station_prior,mr_pop_prior,mr_artist_univ_spins,segment,per_diff_market_spins_spins_prior,station_artist_spins,diff_market_artist_spins_prior,diff_spins_song_station_prior,pop_all
1691,1086587,3322022,2022-12-11,-2,F (Other),Gender,,,R,94.0,...,26.0,979.0,81.0,478.0,Female_Other,-1.000000,0,-49,-1,
1692,1086587,3322022,2022-12-18,-2,F (Other),Gender,,,R,94.0,...,27.0,979.0,81.0,369.0,Female_Other,1.000000,2,48,2,
1693,1086587,3322022,2022-12-25,-2,F (Other),Gender,,,R,94.0,...,28.0,981.0,81.0,339.0,Female_Other,-0.166667,5,-8,3,
1694,1086587,3322022,2023-01-01,-2,F (Other),Gender,,,R,94.0,...,29.0,986.0,81.0,678.0,Female_Other,-0.475000,2,-19,-3,
1695,1086587,3322022,2023-01-08,-2,F (Other),Gender,,,R,94.0,...,30.0,988.0,81.0,632.0,Female_Other,-1.000000,0,-21,-2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656536,2779283,3322022,2023-01-01,414582,WAO,Race,,,R,90.0,...,46.0,683.0,72.0,524.0,White,-0.500000,11,9,-1,
656537,2779283,3322022,2023-01-08,414582,WAO,Race,,,R,90.0,...,47.0,684.0,72.0,574.0,White,0.000000,7,-4,0,
656538,2779283,3322022,2023-01-15,414582,WAO,Race,,,R,90.0,...,48.0,685.0,72.0,474.0,White,-1.000000,0,-7,-1,
656539,2779283,3322022,2023-01-22,414582,WAO,Race,,,R,90.0,...,49.0,685.0,72.0,377.0,White,1.000000,0,0,0,


In [41]:
df_test_final[feature_cols].dropna()

Unnamed: 0,max_pop_artist_prior,max_pop_prior_unv,min_pop_prior_unv,diff_song_univ_spins_prior,avg_song_univ_spins_prior,total_station_artist_spins_prior,diff_spins_song_market_prior,total_song_univ_spins_prior,total_spins_non_on_song_station_prior,diff_artist_univ_spins_prior,...,count_pop_prior_unv,song_market_weeks_since_first_spins,total_spins_song_station_prior,mr_pop_prior,mr_artist_univ_spins,segment,per_diff_market_spins_spins_prior,station_artist_spins,diff_market_artist_spins_prior,diff_spins_song_station_prior
1691,94.0,99.0,50.0,-109.0,3233.481481,979.0,-1,87304.0,724.0,-109.0,...,292.0,26.0,979.0,81.0,478.0,Female_Other,-1.000000,0,-49,-1
1692,94.0,99.0,50.0,-30.0,3131.178571,979.0,2,87673.0,724.0,-30.0,...,292.0,27.0,979.0,81.0,369.0,Female_Other,1.000000,2,48,2
1693,94.0,99.0,50.0,339.0,3034.896552,981.0,3,88012.0,726.0,339.0,...,292.0,28.0,981.0,81.0,339.0,Female_Other,-0.166667,5,-8,3
1694,94.0,99.0,50.0,-46.0,2956.333333,986.0,-3,88690.0,731.0,-46.0,...,300.0,29.0,986.0,81.0,678.0,Female_Other,-0.475000,2,-19,-3
1695,94.0,99.0,50.0,-533.0,2881.354839,988.0,-2,89322.0,733.0,-533.0,...,300.0,30.0,988.0,81.0,632.0,Female_Other,-1.000000,0,-21,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656536,90.0,93.0,47.0,13.0,654.531915,692.0,-1,30763.0,473.0,50.0,...,33.0,46.0,683.0,72.0,524.0,White,-0.500000,11,9,-1
656537,90.0,93.0,47.0,-69.0,642.354167,703.0,0,30833.0,474.0,-100.0,...,33.0,47.0,684.0,72.0,574.0,White,0.000000,7,-4,0
656538,90.0,93.0,47.0,41.0,629.265306,710.0,-1,30834.0,475.0,-97.0,...,33.0,48.0,685.0,72.0,474.0,White,-1.000000,0,-7,-1
656539,90.0,93.0,47.0,-41.0,617.520000,710.0,0,30876.0,475.0,-149.0,...,33.0,49.0,685.0,72.0,377.0,White,1.000000,0,0,0


In [43]:
set(X_train.columns) - set(X_test.columns)

{'Market_Name_Atlanta',
 'Market_Name_Austin',
 'Market_Name_Baltimore',
 'Market_Name_Boston',
 'Market_Name_Charlotte',
 'Market_Name_Chicago',
 'Market_Name_Cincinnati',
 'Market_Name_Columbus, OH',
 'Market_Name_Dallas',
 'Market_Name_Denver',
 'Market_Name_Detroit',
 'Market_Name_Miami',
 'Market_Name_Minneapolis',
 'Market_Name_Nashville',
 'Market_Name_New York',
 'Market_Name_Orlando',
 'Market_Name_Philadelphia',
 'Market_Name_Phoenix',
 'Market_Name_Pittsburgh',
 'Market_Name_Portland, OR',
 'Market_Name_Raleigh',
 'Market_Name_Salt Lake City',
 'Market_Name_San Diego',
 'Market_Name_San Francisco',
 'Market_Name_Seattle',
 'Market_Name_St. Louis',
 'Market_Name_Tampa',
 'Market_Name_Washington, DC',
 'omt_co_flag_OMT_CO',
 'omt_co_flag_OMT_only',
 'segment_AA'}

In [44]:
for i in missing_cols:
    X_test[i] = 0

In [45]:
X_test = X_test[X_train.columns]

In [62]:
pd.unique(X_test['omt_co_flag_CO_only'])

array([0, 1], dtype=uint8)

In [46]:
df_out = pd.DataFrame(columns=['lower_wob_thresh', 'mean_pop_predicted', 'upper_wobble_thresh'])
for cat in demo_cats:
    #tic = time.perf_counter()
    idx = (df_test_final['demo_category'] == cat)
    print(cat + ': ' + str(sum(idx)))

    # extract relevant segment indicator columns
    demo_cols_cat = ['segment_' + i for i in list(pd.unique(df_test_final.loc[idx]['segment']))]
    demo_cols_excl = list(set(demo_cols_all) - set(demo_cols_cat))
    feature_cols_cat = list(set(X_test.columns) - set(demo_cols_excl))

    # create features and target
    X = X_test.loc[idx][feature_cols_cat]
    if cat == 'Race':
        X['segment_AA'] = 0
    y = y_test.loc[idx]
    #print(X)

    # create empty dataframe
    df_temp = pd.DataFrame()
    # predict using estimator

    #re-arrange features

    df_temp['lower_wob_thresh'] = pd.DataFrame(best_estimators[cat][0].predict(X[best_estimators[cat][0].feature_names_in_]), index=X_test.loc[idx].index)
    df_temp['mean_pop_predicted'] = pd.DataFrame(best_estimators[cat][1].predict(X[best_estimators[cat][1].feature_names_in_]), index=X_test.loc[idx].index)
    df_temp['upper_wobble_thresh'] = pd.DataFrame(best_estimators[cat][2].predict(X[best_estimators[cat][2].feature_names_in_]), index=X_test.loc[idx].index)

    df_out = pd.concat([df_out,df_temp], axis=0)

Gender: 3108
Race: 4583
Core-Cume: 4626
Age: 4626


In [47]:
df_out

Unnamed: 0,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh
1691,69.018166,83.449665,98.077451
1692,69.018166,83.449665,98.077451
1693,69.747890,83.639983,98.077451
1694,68.851904,83.449665,98.077451
1695,68.511838,83.212240,98.077451
...,...,...,...
656464,60.475703,74.660621,91.121738
656465,58.755278,73.899343,91.121738
656466,60.485782,74.325760,91.121738
656467,58.933434,73.899343,91.121738


In [54]:
df_out.to_pickle('df_out_stage_01292023_02052021.pkl')

### Process output & write to Excel

In [55]:
df_out_final = df_test_final[id_cols + ['taa_quintile']].join(df_out, how='left')

In [56]:
df_out_final['segment'] = df_out_final['breakout_name'].apply(func=(lambda x: breakout_map[x] if (x in breakout_map.keys()) else None))

In [59]:
df_out_final

Unnamed: 0,mediabase_id,station_id,week_dt,breakout_id,breakout_name,demo_category,pop_co,pop_omt,gcr,taa_quintile,lower_wob_thresh,mean_pop_predicted,upper_wobble_thresh,segment
1691,1086587,3322022,2022-12-11,-2,F (Other),Gender,,,R,5,69.018166,83.449665,98.077451,Female_Other
1692,1086587,3322022,2022-12-18,-2,F (Other),Gender,,,R,5,69.018166,83.449665,98.077451,Female_Other
1693,1086587,3322022,2022-12-25,-2,F (Other),Gender,,,R,5,69.747890,83.639983,98.077451,Female_Other
1694,1086587,3322022,2023-01-01,-2,F (Other),Gender,,,R,5,68.851904,83.449665,98.077451,Female_Other
1695,1086587,3322022,2023-01-08,-2,F (Other),Gender,,,R,5,68.511838,83.212240,98.077451,Female_Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656536,2779283,3322022,2023-01-01,414582,WAO,Race,,,R,1,59.051873,71.220858,86.694550,White
656537,2779283,3322022,2023-01-08,414582,WAO,Race,,,R,1,58.871058,70.994946,85.891615,White
656538,2779283,3322022,2023-01-15,414582,WAO,Race,,,R,1,57.882549,71.798682,85.891615,White
656539,2779283,3322022,2023-01-22,414582,WAO,Race,,,R,1,58.871058,70.994946,85.891615,White


In [60]:
# song-artist lookup
song_query = '''
Select mediabase_id, song_name, artist_name
from data.songs_v as sv
'''
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_song_lookup = pd.read_sql(song_query, con=conn)

In [61]:
station_query = '''
Select distinct station_id, call_letters
from data.stations_v as sv
'''

engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df_station_lookup = pd.read_sql(station_query, con=conn)

In [62]:
df_song_lookup.set_index(['mediabase_id'], inplace=True)

In [63]:
df_station_lookup.set_index(['station_id'], inplace=True)

In [64]:
df_song_lookup['song_artist']  = df_song_lookup['song_name'] + ' (' + df_song_lookup['artist_name'] + ')'

In [65]:
df_song_lookup

Unnamed: 0_level_0,song_name,artist_name,song_artist
mediabase_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1147620,Mr. Crowley (Live),OZZY OSBOURNE,Mr. Crowley (Live) (OZZY OSBOURNE)
1960216,Street Fighting Man (Live '13),ROLLING STONES,Street Fighting Man (Live '13) (ROLLING STONES)
2691117,Storybook Ending,BLUMES,Storybook Ending (BLUMES)
1804878,You Take My... (Live '76),QUEEN,You Take My... (Live '76) (QUEEN)
2131197,Hero Of The Day (Live),METALLICA,Hero Of The Day (Live) (METALLICA)
...,...,...,...
2437613,Valentine's Day,QUEEN OF THE MEADOW,Valentine's Day (QUEEN OF THE MEADOW)
2667991,Exaggeration,JUAN HAZE,Exaggeration (JUAN HAZE)
2438195,Every Week,DQ4E,Every Week (DQ4E)
2623051,Your Story Is Over!,AYREON,Your Story Is Over! (AYREON)


In [66]:
df_out_final['song_artist'] = df_out_final.join(df_song_lookup, on=['mediabase_id'], how='left')['song_artist']

In [67]:
df_out_final['call_letters'] = df_out_final.join(df_station_lookup, on=['station_id'], how='left')['call_letters']

In [68]:
df_out_final['wobble_flag'] = df_out_final.apply(lambda x: int((x['pop_co'] < np.floor(x['lower_wob_thresh'])) | (x['pop_co'] > np.ceil(x['upper_wobble_thresh']))), axis=1)

In [69]:
df_out_final.to_pickle('df_out_final_KIIS-FM_2022_12_11_2023_01_29.pkl')