In [1]:
# Standard library
import os
import re

# Third-party
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide settings
pd.options.mode.chained_assignment = None  # turn off the chained-assignment warning (default='warn')
sns.set_theme(style='white')

#### Set up DB connection

In [2]:
from sqlalchemy import create_engine, text

def postgresql_engine(user, pwd, host, port, dbname):
    """Create a SQLAlchemy engine for a PostgreSQL database.

    Needs a PostgreSQL driver such as the psycopg2-binary package.

    Parameters
    ----------
    user, pwd : str
        Database credentials. Both are percent-encoded before being placed in
        the URL so that characters like '@', ':' or '/' cannot corrupt it.
    host : str
        Database server host name.
    port : str or int
        Database server port.
    dbname : str
        Name of the database to connect to.

    Returns
    -------
    The engine object returned by ``create_engine(..., echo=False)``.
    """
    from urllib.parse import quote

    # The dialect name must be 'postgresql'; SQLAlchemy 1.4+ rejects 'postgres://'.
    db_url = (
        f"postgresql://{quote(user, safe='')}:{quote(pwd, safe='')}"
        f"@{host}:{port}/{dbname}"
    )
    return create_engine(db_url, echo=False)

In [3]:
# DB username & password
import getpass

# Label each prompt: with the default prompt both calls ask for "Password: ",
# which makes it impossible to tell which value is being requested.
username = getpass.getpass(prompt='DB username: ')
password = getpass.getpass(prompt='DB password: ')

In [4]:
# Non-credential connection parameters
url = 'adds-postgres-dev.cfgztrijqgvp.us-east-1.rds.amazonaws.com'  # passed as the host
port = '5432'  # kept as a string
database = 'musiclab'

In [5]:
# Selects every column of adds_temp.ebw_metric_analysis
data_query = """
Select *
from adds_temp.ebw_metric_analysis as ema
"""

In [6]:
# Build the engine from the credentials and connection parameters, then load
# the query result into a DataFrame inside a transaction.
engine = postgresql_engine(
    user=username,
    pwd=password,
    host=url,
    port=port,
    dbname=database,
)
with engine.begin() as conn:  # opens a connection and begins a transaction
    df_ebw_metrics = pd.read_sql(data_query, con=conn)

#### POP Top Quintile Check

In [5]:
# Callout rows with test_date on or after 2019-01-01 and breakout_id 1,
# each paired with its format_code from data.formats.
query_top_quint_check = """
Select c.test_date, f.format_code, c.cmm_station_calls, c.song_id, c.pop
from data.cmm as c, data.formats as f
where c.project_type='Callout'
and c.test_date >= '2019-01-01'
and c.breakout_id=1
and f.format_id=c.format_id
"""

engine = postgresql_engine(
    user=username,
    pwd=password,
    host=url,
    port=port,
    dbname=database,
)
with engine.begin() as conn:  # opens a connection and begins a transaction
    df_top_quint_check = pd.read_sql(query_top_quint_check, con=conn)

In [6]:
# Aggregate by format, station and test date: the 80th percentile of `pop`
# is the cutoff for the top quintile.
# Uses the built-in SeriesGroupBy.quantile instead of groupby.apply + np.quantile:
# it is vectorized and avoids the pandas >= 2.2 deprecation warning about apply
# operating on the grouping columns. Both default to linear interpolation.
# NOTE: missing `pop` values are skipped here, whereas np.quantile returned NaN
# for any group that contained one.
df_top_quint_check_agg = (
    df_top_quint_check
    .groupby(['format_code', 'cmm_station_calls', 'test_date'])['pop']
    .quantile(0.80)
    .reset_index(name='top_quintile_cutoff')
)

In [7]:
# Write the cutoffs to adds_temp.ebw_top_quintile_cutoff_py, 5000 rows per batch.
# NOTE(review): to_sql runs with its defaults, so per the pandas docs it raises if
# the table already exists and also writes the DataFrame index as a column.
# Confirm that is intended before re-running this cell.
with engine.begin() as conn:  # opens a connection and begins a transaction
    df_top_quint_check_agg.to_sql(
        name='ebw_top_quintile_cutoff_py',
        con=conn,
        schema='adds_temp',
        chunksize=5000,
    )

#### Read in raw features from Postgres DB