In [1]:
# import packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sqlalchemy import create_engine

def postgresql_engine(user, pwd, host, port, dbname):
    # Need pyycopg2-binary package
    sql_engine = create_engine('postgres://' + user + ':' + pwd + '@' + host + ':' + port + '/' + dbname, echo=False)
    return sql_engine

In [3]:
# DB username and password
import getpass

user = getpass.getpass()
pwd = getpass.getpass()

In [4]:
# misc db parameters
host= 'adds-postgres-dev.cfgztrijqgvp.us-east-1.rds.amazonaws.com'
dbname= 'musiclab'
port= '5432'

In [5]:
# get callout research for songs released in the past 2 years
data_query = '''
Select f.format_code, sv.mediabase_id, c.cmm_station_calls, c.test_date as week_dt, c.pop, mss.spins_non_on
FROM
    data.songs_v sv
JOIN
(
Select
mss."SongID",
mss."C_Let",
mss."StartDate",
    mss."EndDate",
    COALESCE("DP1", 0) + COALESCE("DP2", 0) + COALESCE("DP3", 0) + COALESCE("DP4", 0) + COALESCE("DP5", 0) AS spins_total,
    COALESCE("DP2", 0) + COALESCE("DP3", 0) + COALESCE("DP4", 0) + COALESCE("DP5", 0) AS spins_non_on
    from dbo."MediabaseSongSpins" mss
) mss


ON
    sv.mediabase_id = mss."SongID"
JOIN data.cmm AS c
ON
    c.cmm_station_calls = mss."C_Let"
    AND c.song_id = sv.song_id
    AND c.project_type IN ('Callout', 'Omt')
    AND c.breakout_name = 'Total'
    and c.test_date = mss."StartDate" + interval '8 days'
join data.formats AS f
on f.format_id = c.format_id
WHERE
    ( sv.song_release_date::date <= '2022-08-31'::date
        AND sv.song_release_date::date >= '2018-09-01')
'''

In [6]:
engine = postgresql_engine(user, pwd, host, port, dbname)
with engine.connect() as conn:
    with conn.begin():
        df = pd.read_sql(data_query, con=conn)

In [7]:
id_cols = ['format_code', 'cmm_station_calls', 'mediabase_id']

In [8]:
df.sort_values(by=['format_code', 'cmm_station_calls', 'mediabase_id', 'week_dt'], inplace=True)

In [9]:
df['cuml_spins_non_on'] = df.groupby(id_cols)['spins_non_on'].cumsum()

In [10]:
df[(df['mediabase_id'] == 2436510) & (df['cmm_station_calls'] == 'KIIS-FM')]

Unnamed: 0,format_code,mediabase_id,cmm_station_calls,week_dt,pop,spins_non_on,cuml_spins_non_on
171857,H1,2436510,KIIS-FM,2018-11-05,60,13,13
142987,H1,2436510,KIIS-FM,2018-11-12,76,25,38
171919,H1,2436510,KIIS-FM,2018-11-19,74,66,104
143025,H1,2436510,KIIS-FM,2018-12-03,70,70,174
171354,H1,2436510,KIIS-FM,2018-12-03,70,70,244
143514,H1,2436510,KIIS-FM,2018-12-17,76,41,285
143634,H1,2436510,KIIS-FM,2019-01-07,88,29,314
172171,H1,2436510,KIIS-FM,2019-01-14,82,43,357
143946,H1,2436510,KIIS-FM,2019-01-28,83,51,408
171472,H1,2436510,KIIS-FM,2019-02-04,92,47,455


In [11]:
df['cuml_spins_non_on_bucket'] = pd.cut(df['cuml_spins_non_on'], bins=pd.interval_range(150, np.max(df['cuml_spins_non_on']), freq=50))

In [12]:
df[(df['mediabase_id'] == 2436510) & (df['cmm_station_calls'] == 'KIIS-FM')]

Unnamed: 0,format_code,mediabase_id,cmm_station_calls,week_dt,pop,spins_non_on,cuml_spins_non_on,cuml_spins_non_on_bucket
171857,H1,2436510,KIIS-FM,2018-11-05,60,13,13,
142987,H1,2436510,KIIS-FM,2018-11-12,76,25,38,
171919,H1,2436510,KIIS-FM,2018-11-19,74,66,104,
143025,H1,2436510,KIIS-FM,2018-12-03,70,70,174,"(150.0, 200.0]"
171354,H1,2436510,KIIS-FM,2018-12-03,70,70,244,"(200.0, 250.0]"
143514,H1,2436510,KIIS-FM,2018-12-17,76,41,285,"(250.0, 300.0]"
143634,H1,2436510,KIIS-FM,2019-01-07,88,29,314,"(300.0, 350.0]"
172171,H1,2436510,KIIS-FM,2019-01-14,82,43,357,"(350.0, 400.0]"
143946,H1,2436510,KIIS-FM,2019-01-28,83,51,408,"(400.0, 450.0]"
171472,H1,2436510,KIIS-FM,2019-02-04,92,47,455,"(450.0, 500.0]"


In [13]:
df_unique_comb = df[['format_code', 'mediabase_id']].drop_duplicates()

In [14]:
df_unique_comb['format_code']

167988    A1
137836    A1
167985    A1
146857    A1
45241     A1
          ..
121177    Y0
31383     Y0
177407    Y0
193726    Y0
85038     Y0
Name: format_code, Length: 3629, dtype: object

In [16]:
from kneed import KneeLocator
import os

df_knees = pd.DataFrame(columns=['format_code', 'mediabase_id', 'spins_threshold'])
spins_lb = 150
spins_ub = 1000
for i in df_unique_comb.index:
    # iso data by format and agg by spins bucket
    idx = (df['mediabase_id'] == df_unique_comb.loc[i]['mediabase_id']) & (df['format_code'] == df_unique_comb.loc[i]['format_code']) & (df['cuml_spins_non_on'] > spins_lb) & (df['cuml_spins_non_on'] <= spins_ub)
    df_temp = pd.DataFrame(df[idx].groupby(['format_code', 'mediabase_id', 'cuml_spins_non_on_bucket']).agg({'pop': np.nanmean}).dropna().reset_index().set_index(['cuml_spins_non_on_bucket'])['pop'])

    #identify knee
    if(len(df_temp) > 5):
        kl = KneeLocator([i.right for i in df_temp.index], df_temp['pop'], curve='concave')

    # collect data
        df_out = pd.DataFrame([(df_unique_comb.loc[i]['format_code'], df_unique_comb.loc[i]['mediabase_id'], kl.knee)], columns=['format_code', 'mediabase_id', 'spins_threshold'])
        df_knees = pd.concat([df_out, df_knees])
        if (df_unique_comb.loc[i]['format_code'] == 'H1'):
            kl.plot_knee(figsize=(4,4))
            plt.savefig(os.getcwd() + '/img/' + str(df_unique_comb.loc[i]['mediabase_id']) + '.jpeg', format='jpeg', dpi=300)
            plt.close()

In [111]:
df_knees.groupby(['format_code']).agg({'mediabase_id':len})

Unnamed: 0_level_0,mediabase_id
format_code,Unnamed: 1_level_1
A1,15
A2,73
C1,107
H1,169
L1,80
L3,42
R1,6
R2,7
R3,38
U1,154


In [112]:
df_knees.groupby(['format_code']).apply(lambda x: np.quantile(x['spins_threshold'], 0.9))

format_code
A1    280.0
A2    300.0
C1    300.0
H1    310.0
L1    300.0
L3    300.0
R1    275.0
R2    320.0
R3    300.0
U1    300.0
U2    265.0
Y0    310.0
dtype: float64