In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import random

In [2]:
from sqlalchemy import create_engine, text

def postgresql_engine(user, pwd, host, port, dbname):
    # Need psycopg2-binary package
    sql_engine = create_engine('postgres://' + user + ':' + pwd + '@' + host + ':' + port + '/' + dbname, echo=False)
    return sql_engine

In [3]:
# DB username & password
import getpass

username = getpass.getpass()
password = getpass.getpass()

In [6]:
# misc db parameters
url= 'adds-postgres-dev.cfgztrijqgvp.us-east-1.rds.amazonaws.com'
database= 'musiclab'
port= '5432'

In [9]:
data_query = '''
Select *
from adds_temp.ebw_metric_analysis as ema
'''

In [10]:
engine = postgresql_engine(username, password, url, port, database)
with engine.connect() as conn:
    with conn.begin():
        df_ebw_metrics = pd.read_sql(data_query, con=conn)

In [11]:
df_ebw_metrics.head()

Unnamed: 0,week_dt,mediabase_id,song_id,call_letters,format_code,song_release_date,last_callout_date,first_spin_date,StartDate,EndDate,...,gcr,gcr_adj,station_id,cmm_station_calls,test_date,breakout_respondents,pop,fav_metric,ddl_metric,f2b_ratio
0,2018-10-01,2356990,290730733,WMIB-FM,U1,2018-10-01,2019-01-14,2018-10-08,2018-09-30,2018-10-06,...,,,3322924.0,WMIB-FM,2018-10-01,90.0,73.0,20.0,6.0,3.33
1,2018-10-01,2438895,554650644,WCOL-FM,C1,2018-10-01,2021-04-12,2019-01-28,NaT,NaT,...,,,,,,,,,,
2,2018-10-01,2438895,554650644,WFUS-FM,C1,2018-10-01,2021-03-29,2019-02-25,NaT,NaT,...,,,,,,,,,,
3,2018-10-01,2438895,554650644,WEBG-FM,C1,2018-10-01,2020-08-03,2019-02-04,NaT,NaT,...,,,,,,,,,,
4,2018-10-01,2439230,555200329,WDAS-FM,U2,2018-10-01,2020-04-06,2021-03-08,NaT,NaT,...,,,,,,,,,,


In [None]:
# song-artist lookup

In [None]:
# pull in data for analysis

In [33]:
all_formats = list(pd.unique(df_ebw_metrics['format_code']))
all_formats.sort()
all_formats

['A1', 'A2', 'C1', 'H1', 'L1', 'L3', 'O1', 'R1', 'R2', 'R3', 'U1', 'U2', 'Y0']

#### Unique songs

In [24]:
df_ebw_metrics.groupby(['format_code']).apply(lambda x:(len(pd.unique(x['mediabase_id'])), len(pd.unique(x['station_id']))))

format_code
A1     (92, 10)
A2     (272, 9)
C1    (272, 19)
H1    (452, 33)
L1     (263, 5)
L3      (86, 3)
O1      (29, 2)
R1     (160, 2)
R2     (128, 6)
R3     (253, 9)
U1    (499, 17)
U2     (133, 4)
Y0     (366, 7)
dtype: object

#### Unique songs and stations

In [20]:
df_ebw_metrics.groupby(['format_code']).apply(lambda x:len((x[['mediabase_id', 'station_id']].drop_duplicates())))

format_code
A1     490
A2    1149
C1    3408
H1    7259
L1     818
L3     228
O1      58
R1     319
R2     496
R3     969
U1    3125
U2     417
Y0    1390
dtype: int64

### Analyze Favorite, DDL and F2B ratio

In [26]:
# plot out favorite, f2b ratio, and pop scores
for fmt in all_formats:
    print(df_ebw_metrics[(df_ebw_metrics['format_code']==fmt) & (~pd.isna(df_ebw_metrics['pop']))].head())
    break

         week_dt  mediabase_id    song_id call_letters format_code  \
0     2018-10-01       2356990  290730733      WMIB-FM          U1   
2322  2018-10-01       2356990  290730733      WUSL-FM          U1   
2601  2018-10-08       2356990  290730733      WGCI-FM          U1   
2602  2018-10-08       2356990  290730733      KQBT-FM          U1   
2603  2018-10-08       2356990  290730733      WJLB-FM          U1   

     song_release_date last_callout_date first_spin_date  StartDate  \
0           2018-10-01        2019-01-14      2018-10-08 2018-09-30   
2322        2018-10-01        2018-10-01      2018-10-08 2018-09-30   
2601        2018-10-01        2019-03-18      2018-10-08 2018-10-07   
2602        2018-10-01        2019-06-03      2018-10-08 2018-10-07   
2603        2018-10-01        2020-03-09      2018-10-08 2018-10-07   

        EndDate  ...   gcr  gcr_adj  station_id  cmm_station_calls  \
0    2018-10-06  ...  None     None   3322924.0            WMIB-FM   
2322 2018-10