In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import random

  from pandas import MultiIndex, Int64Index


In [3]:
from sqlalchemy import create_engine, text

def postgresql_engine(user, pwd, host, port, dbname):
    """Create a SQLAlchemy engine for a PostgreSQL database.

    Needs the psycopg2-binary package.

    Args:
        user: Database user name.
        pwd: Database password.
        host: Server host name or address.
        port: Server port; a string or an integer is accepted.
        dbname: Name of the database to connect to.

    Returns:
        The engine produced by ``create_engine`` (statement echo disabled).
    """
    from urllib.parse import quote

    # "postgresql" is the dialect name SQLAlchemy registers; the legacy
    # "postgres" alias was removed in SQLAlchemy 1.4 and no longer loads.
    # Credentials are percent-encoded so characters such as "@", ":" or "/"
    # inside a password cannot be mistaken for URL delimiters.
    db_url = 'postgresql://{}:{}@{}:{}/{}'.format(
        quote(user, safe=''), quote(pwd, safe=''), host, port, dbname
    )
    sql_engine = create_engine(db_url, echo=False)
    return sql_engine

In [None]:
# DB username & password
# Prompted for interactively so that neither value is stored in the notebook.
import getpass

# Explicit prompts: without them both calls show the same default
# "Password: " text, so it is unclear which value is being asked for.
username = getpass.getpass(prompt='DB username: ')
password = getpass.getpass(prompt='DB password: ')

In [None]:
# misc db parameters
# Host, database name and port handed to postgresql_engine(); the port is
# kept as a string.
url = 'adds-postgres-dev.cfgztrijqgvp.us-east-1.rds.amazonaws.com'
database = 'musiclab'
port = '5432'

In [None]:
# SQL that selects every column of the EBW metric analysis table.
data_query = (
    '\n'
    'Select *\n'
    'from adds_temp.ebw_metric_analysis as ema\n'
)

In [None]:
# Run data_query inside a transaction and load the result into a DataFrame.
engine = postgresql_engine(username, password, url, port, database)
with engine.connect() as conn, conn.begin():
    df_ebw_metrics = pd.read_sql(sql=data_query, con=conn)

In [None]:
# Number of rows loaded (before duplicates are dropped in the next cell).
df_ebw_metrics.shape[0]

In [None]:
# Remove rows that are exact duplicates across every column.
df_ebw_metrics = df_ebw_metrics.drop_duplicates()

In [None]:
# Preview the first five rows of the metrics table.
df_ebw_metrics.head(n=5)

In [None]:
# song-artist lookup
# One (mediabase_id, song_name, artist_name) row per record of data.songs_v.
song_query = (
    '\n'
    'Select mediabase_id, song_name, artist_name\n'
    'from data.songs_v as sv\n'
)
engine = postgresql_engine(username, password, url, port, database)
with engine.connect() as conn, conn.begin():
    df_song_lookup = pd.read_sql(sql=song_query, con=conn)

In [None]:
# Key the lookup by mediabase_id so it can be joined onto the metrics later.
# Not re-runnable: once mediabase_id is the index it is no longer a column.
df_song_lookup = df_song_lookup.set_index(['mediabase_id'])

In [None]:
# Preview the first five rows of the song/artist lookup.
df_song_lookup.head(n=5)

In [None]:
# extract all formats
# Distinct format codes present in the metrics, in sorted order.
all_formats = sorted(df_ebw_metrics['format_code'].unique())
all_formats

In [None]:
# define major formats
# Format codes that the per-format plots below iterate over.
major_formats = [
    'C1',
    'H1',
    'U1',
]

#### Unique songs and unique stations per format

In [None]:
# Per format: (number of distinct songs, number of distinct stations).
df_ebw_metrics.groupby(['format_code']).apply(
    lambda rows: (
        len(rows['mediabase_id'].unique()),
        len(rows['station_id'].unique()),
    )
)

#### Unique song-station pairs per format

In [None]:
# Per format: number of distinct (mediabase_id, station_id) combinations.
df_ebw_metrics.groupby(['format_code']).apply(
    lambda rows: rows.drop_duplicates(subset=['mediabase_id', 'station_id']).shape[0]
)

### Calculated fields for analysis

In [None]:
# Look at two appearances in top quintile of callout research
# 80th percentile of `pop` for every (format, station, week) group; rows with
# a missing `pop` are excluded before grouping.
df_pop_quintile = (
    df_ebw_metrics
    .dropna(subset=['pop'])
    .groupby(['format_code', 'cmm_station_calls', 'week_dt'])['pop']
    .apply(lambda scores: np.quantile(scores, 0.80))
    .rename('top_quintile_cutoff')
    .to_frame()
)

In [None]:
# Display the top_quintile_cutoff table, indexed by
# (format_code, cmm_station_calls, week_dt).
df_pop_quintile

In [None]:
# 1 when a row's `pop` is at or above the cutoff of its
# (format, station, week) group, else 0.  Rows with a missing `pop` or with no
# matching cutoff compare as False and therefore get 0.
# The comparison is done column-wise instead of with a row-by-row
# apply(axis=1), which gives the same flags without a Python call per row.
df_ebw_metrics['is_top_quintile'] = (
    df_ebw_metrics
    .join(df_pop_quintile, on=['format_code', 'cmm_station_calls', 'week_dt'], rsuffix='_r')
    .pipe(lambda joined: joined['pop'] >= joined['top_quintile_cutoff'])
    .astype(int)
)

In [None]:
# Convert week_dt to datetimes; the release-date subtraction below needs it.
df_ebw_metrics['week_dt'] = df_ebw_metrics['week_dt'].pipe(pd.to_datetime)

In [None]:
# Order rows by week within each (format, station, song) so that the
# cumulative sums computed below accumulate in week order.
df_ebw_metrics = df_ebw_metrics.sort_values(
    by=['format_code', 'call_letters', 'mediabase_id', 'week_dt']
)

In [None]:
# Running count of top-quintile rows per (format, station, song), accumulated
# in the current row order.
df_ebw_metrics['num_top_quintile'] = (
    df_ebw_metrics
    .groupby(['format_code', 'call_letters', 'mediabase_id'])
    ['is_top_quintile']
    .cumsum()
)

In [None]:
# Running total of spins_non_on per (format, station, song), accumulated in
# the current row order.
df_ebw_metrics['cuml_spins_non_on'] = (
    df_ebw_metrics
    .groupby(['format_code', 'call_letters', 'mediabase_id'])
    ['spins_non_on']
    .cumsum()
)

In [None]:
# Upper edge of the 100-wide spin buckets: the first multiple of 100 strictly
# above the largest cumulative spin count.
100 * (int(df_ebw_metrics['cuml_spins_non_on'].max() / 100) + 1)

In [None]:
# Bucket cumulative spins into 100-wide intervals starting at 0 and ending at
# the first multiple of 100 above the maximum.
df_ebw_metrics['cuml_spins_bucket'] = pd.cut(
    df_ebw_metrics['cuml_spins_non_on'],
    bins=pd.interval_range(
        start=0,
        end=100 * (int(df_ebw_metrics['cuml_spins_non_on'].max() / 100) + 1),
        freq=100,
    ),
)

In [None]:
# Whole weeks between the song's release date and the row's week; int()
# truncates the fractional week toward zero.
# NOTE(review): int() raises on NaN, so a missing song_release_date would make
# this cell fail - confirm the column has no nulls.
df_ebw_metrics['weeks_since_release'] = (
    (df_ebw_metrics['week_dt'] - pd.to_datetime(df_ebw_metrics['song_release_date']))
    .div(np.timedelta64(1, 'W'))
    .map(int)
)

In [None]:
# Attach song_name / artist_name by matching mediabase_id against the lookup's
# index (left join, so every metrics row is kept).
df_ebw_metrics = df_ebw_metrics.join(
    df_song_lookup,
    on='mediabase_id',
    how='left',
)

In [None]:
# DDL thresholds: the integers 15 through 25.
ddl_range = np.arange(15, 26, 1)
# F2B thresholds: 0.85 through 1.15 in steps of 0.03.  Built from an integer
# grid and then scaled, because np.arange with a fractional step accumulates
# rounding error: the element count can vary, and a threshold can differ
# slightly from the two-decimal value used in its column name (e.g. "1.00"),
# which changes the result of the `<=` test for values exactly on the boundary.
f2b_range = np.arange(85, 118, 3) / 100

In [None]:
# Column names, one per threshold: "<metric>_geq/leq_<t>" holds the 0/1 flag
# and "<metric>_track_<t>" its running count.
ddl_geq_cols = ['ddl_geq_%d' % int(cutoff) for cutoff in ddl_range]
ddl_track_cols = ['ddl_track_%d' % int(cutoff) for cutoff in ddl_range]

f2b_leq_cols = ['f2b_leq_%.2f' % cutoff for cutoff in f2b_range]
f2b_track_cols = ['f2b_track_%.2f' % cutoff for cutoff in f2b_range]

In [None]:
# For every DDL / F2B threshold add a 0/1 flag column and its running count
# per (format, station, song).
# The flags come from a column-wise comparison instead of a Python lambda
# applied per element; missing values compare as False and so still give 0.
# zip() walks the parallel threshold and column-name lists together.
track_keys = ['format_code', 'call_letters', 'mediabase_id']
for ddl_cut, ddl_flag, ddl_count, f2b_cut, f2b_flag, f2b_count in zip(
    ddl_range, ddl_geq_cols, ddl_track_cols, f2b_range, f2b_leq_cols, f2b_track_cols
):
    # DDL at or above the threshold
    df_ebw_metrics[ddl_flag] = (df_ebw_metrics['ddl_metric'] >= ddl_cut).astype(int)
    df_ebw_metrics[ddl_count] = df_ebw_metrics.groupby(track_keys)[ddl_flag].cumsum()

    # F2B ratio at or below the threshold
    df_ebw_metrics[f2b_flag] = (df_ebw_metrics['f2b_ratio'] <= f2b_cut).astype(int)
    df_ebw_metrics[f2b_count] = df_ebw_metrics.groupby(track_keys)[f2b_flag].cumsum()

In [None]:
# Among rows where the running count of f2b_ratio <= 1.00 has reached 2, take
# the minimum week, cumulative spins and weeks since release per
# (format, station, song).
(
    df_ebw_metrics
    .loc[df_ebw_metrics['f2b_track_1.00'] >= 2]
    .groupby(['format_code', 'call_letters', 'mediabase_id'])
    .agg({
        'week_dt': np.min,
        'cuml_spins_non_on': np.min,
        'weeks_since_release': np.min,
    })
)

In [None]:
# Spot check: first 20 rows for song 2436510 on station KIIS-FM.
df_ebw_metrics.loc[
    (df_ebw_metrics['mediabase_id'] == 2436510)
    & (df_ebw_metrics['call_letters'] == 'KIIS-FM')
].head(20)

### Analyze Favorite, DDL and F2B ratio

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
pd.options.mode.chained_assignment = None  # default='warn'
import seaborn as sns
sns.set_theme(style='white')

In [None]:
# Look at the distribution of Favorite, DDL and F2B for the major formats
for fmt in major_formats:
    # Rows of this format that have a `pop` value.
    idx = (df_ebw_metrics['format_code'] == fmt) & (~pd.isna(df_ebw_metrics['pop']))
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12.5, 5))

    # NOTE(review): the first panel used to repeat the ddl_metric histogram of
    # the second panel; fav_metric is assumed to be what was meant (the
    # section covers Favorite, DDL and F2B) - confirm.
    df_ebw_metrics.loc[idx, 'fav_metric'].plot(ax=ax1, kind='hist', bins=np.arange(0, 45, 5))
    ax1.set_title('fav_metric')

    df_ebw_metrics.loc[idx, 'ddl_metric'].plot(ax=ax2, kind='hist', bins=np.arange(0, 45, 5))
    ax2.set_title('ddl_metric')

    df_ebw_metrics.loc[idx, 'f2b_ratio'].plot(ax=ax3, kind='hist', bins=np.arange(0, 20, 0.5))
    ax3.set_title('f2b_ratio')

    # Make each panel square regardless of its data range.
    for panel in (ax1, ax2, ax3):
        panel.set_aspect(1 / panel.get_data_ratio())

    fig.suptitle(fmt)
    fig.tight_layout()
    plt.show()

In [None]:
# plot out favorite, f2b ratio, and pop scores
def _annotated_heatmap(ax, table, title, fontsize):
    """Draw a pivot table as a heatmap with its value printed in every cell.

    Args:
        ax: Matplotlib axes to draw on.
        table: Pivot table; its index supplies the y tick labels and its
            columns are (value name, interval) pairs whose interval bounds
            supply the x tick labels.
        title: Title for the axes.
        fontsize: Font size of the per-cell annotations.
    """
    ax.imshow(table, origin='lower')

    # Show all ticks and label them with the respective bucket
    ax.set_yticks(ticks=np.arange(len(table.index)), labels=table.index)
    ax.set_xticks(
        ticks=np.arange(len(table.columns)),
        labels=['(' + str(bucket.left) + ',' + str(bucket.right) + ']' for (_, bucket) in table.columns],
    )

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Annotate every cell.  The returned Text artist is deliberately not bound
    # to a name: assigning it to `text` would overwrite the `text` imported
    # from sqlalchemy.
    for col in range(len(table.columns)):
        for row in range(len(table.index)):
            ax.text(col, row, "%.2f" % table.iloc[row, col],
                    ha="center", va="center", color="w", fontsize=fontsize)

    ax.set_title(title)
    ax.set_aspect(1 / ax.get_data_ratio())
    ax.grid(False)


for fmt in major_formats:
    # .copy() so that the bucket columns are added to an independent frame
    # rather than to a slice of df_ebw_metrics.
    df_temp = df_ebw_metrics[
        (df_ebw_metrics['format_code'] == fmt)
        & (~pd.isna(df_ebw_metrics['pop']))
        & (df_ebw_metrics['f2b_ratio'] <= 2)
    ].copy()

    # Discretize Favorite and DDL into 5-wide buckets from 0 to 40
    df_temp['fav_bucket'] = pd.cut(df_temp['fav_metric'], bins=pd.interval_range(start=0, end=40, freq=5))
    df_temp['ddl_bucket'] = pd.cut(df_temp['ddl_metric'], bins=pd.interval_range(start=0, end=40, freq=5))

    # Mean F2B ratio and median pop for every (fav bucket, ddl bucket) cell.
    df_temp_agg_f2b = pd.pivot_table(df_temp.groupby(['ddl_bucket', 'fav_bucket']).agg({'f2b_ratio': np.mean}).reset_index(), index=['fav_bucket'], columns=['ddl_bucket'])
    df_temp_agg_pop = pd.pivot_table(df_temp.groupby(['ddl_bucket', 'fav_bucket']).agg({'pop': np.median}).reset_index(), index=['fav_bucket'], columns=['ddl_bucket'])

    fig, (ax1, ax2) = plt.subplots(1, 2)
    _annotated_heatmap(ax1, df_temp_agg_f2b, "F2B ratio", 'small')
    # Titled "Median" because the pop table is aggregated with np.median.
    _annotated_heatmap(ax2, df_temp_agg_pop, "Median Pop", 'x-small')

    fig.tight_layout()
    # NOTE(review): this `break` stops after the first major format, so only
    # one figure is produced; remove it to plot every format.
    break

### Examples of Burnout and indicators (spins, market spins, streams)

In [None]:
# explore songs suggested by Marc
# Songs by the listed artists, limited to rows within 104 weeks of release:
# peak cumulative spins and earliest release date per song, sorted descending
# by artist name and then by spins.
artists = ['WALKER HAYES', 'SAM HUNT', 'GABBY BARRETT']
idx = df_ebw_metrics['artist_name'].isin(artists) & df_ebw_metrics['weeks_since_release'].le(104)
(
    df_ebw_metrics[idx]
    .groupby(['mediabase_id', 'song_name', 'artist_name'])
    .agg({'cuml_spins_non_on': np.max, 'song_release_date': np.min})
    .reset_index()
    .sort_values(by=['artist_name', 'cuml_spins_non_on'], ascending=False)
)

In [None]:
# Song titles used to filter the burnout examples below.
songs_sample = [
    'Fancy Like',
    'Hard To Forget',
    'Kinfolks',
    'Breaking Up Was Easy In The...',
    '23',
    'The Good Ones',
    'I Hope f/Charlie Puth',
]

In [None]:
# Sample songs on format C1 whose running count of ddl_metric >= 20 has
# reached 2.
idx = (
    df_ebw_metrics['ddl_track_20'].ge(2)
    & df_ebw_metrics['format_code'].eq('C1')
    & df_ebw_metrics['song_name'].isin(songs_sample)
)

# Earliest such week for every (format, station, song).
df_brn_temp = (
    df_ebw_metrics
    .loc[idx]
    .groupby(['format_code', 'call_letters', 'mediabase_id'])
    .agg({'week_dt': np.min})
)

In [None]:
# Keep every row of the (format, station, song) combinations found in
# df_brn_temp (inner join) and narrow to the columns used for inspection.
# The joined week_dt from df_brn_temp gets the "_r" suffix and is not kept.
df_brn_instances = (
    df_ebw_metrics
    .join(
        df_brn_temp,
        on=['format_code', 'call_letters', 'mediabase_id'],
        how='inner',
        rsuffix='_r',
    )
    .loc[:, [
        'mediabase_id', 'call_letters', 'week_dt', 'song_name', 'artist_name',
        'pop', 'ddl_metric', 'fav_metric', 'f2b_ratio',
        'spins_non_on', 'cuml_spins_non_on', 'MarketSpinsToDate', 'stream_count',
    ]]
)

In [None]:
# Display the rows selected into df_brn_instances.
df_brn_instances

In [1]:
# Inspect Data for Gabby Barrett (I Hope f/Charlie Puth)
# Rows for song 2466281 on WPGB-FM that have a `pop` value, indexed by week.
gabby_mask = (
    (df_brn_instances['call_letters'] == 'WPGB-FM')
    & (df_brn_instances['mediabase_id'] == 2466281)
    & (~pd.isna(df_brn_instances['pop']))
)
gabby_weekly = df_brn_instances[gabby_mask].set_index(['week_dt'])

# DDL on the left axis, 4-row rolling mean of stream_count on a twin right axis.
ax = plt.gca()
gabby_weekly['ddl_metric'].plot(ax=ax, marker='o')
ax1 = ax.twinx()
gabby_weekly['stream_count'].rolling(4).mean().plot(ax=ax1, marker='^')

NameError: name 'plt' is not defined

In [47]:
# Plot pop/ddl/fav, f2b, spins(weekly, market), streams

### Analyze candidate population for burnout analysis

### DDL and F2B analysis