In [1]:
import os
import sys

# Directory that is put on sys.path so that `nyp` can be imported.
# Set the NYP_ROOT environment variable to point at another checkout; the
# original author's location is kept as the default so existing setups still work.
NYP_ROOT = os.environ.get('NYP_ROOT', '/Users/drew/Desktop/nyp')

# Guard against piling up duplicate entries when this cell is re-run.
if NYP_ROOT not in sys.path:
    sys.path.append(NYP_ROOT)

In [2]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from nyp.models import Selection

# Engine for the SQLite file ../data/raw.db (a relative path).
engine = create_engine('sqlite:///../data/raw.db')

# Session factory bound to that engine, and the single session `s` used for
# the Selection lookups later in the notebook.
Session = sessionmaker(bind=engine)
s = Session()

In [8]:
from nyp.markov import Chain
import pandas as pd

# Load the tab-separated feature table; the first two columns form the row index.
df = pd.read_csv(
    '../data/testdata_20190125.txt',
    sep='\t',
    index_col=[0, 1],
)

# Replace missing values in these categorical columns with an explicit label.
for _column, _placeholder in (
    ('composer_country', 'Unknown'),
    ('composer_birth_century', 'Unknown'),
    ('work_type', 'Other'),
):
    df.loc[df[_column].isnull(), _column] = _placeholder

In [9]:
# One row per selection: drop the concert level and keep the first occurrence
# of every selection_id, indexed by selection_id.
score_df = (
    df.reset_index()
      .drop('concert_id', axis=1)
      .drop_duplicates('selection_id')
      .set_index('selection_id')
      .copy()
)

# Synthetic row whose every column holds the break token, stored under the
# index label 999999.
break_row = pd.Series({c: '___BREAK__' for c in score_df.columns}, name=999999)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# adds the same row. rename_axis keeps the index name ('selection_id'), which
# concat would otherwise drop because the one-row frame has an unnamed index.
score_df = pd.concat([score_df, break_row.to_frame().T]).rename_axis(score_df.index.name)

In [10]:
# Display the feature columns of df.
df.columns

Index(['has_opus', 'is_arrangement', 'work_type', 'composer_country',
       'composer_birth_century', 'composer_concert_selections', 'soloist_type',
       'selection_performances', 'percent_after_intermission_bin',
       'avg_percent_of_concert_bin'],
      dtype='object')

In [23]:
# Per-column keyword arguments that are unpacked into Chain(...) for each
# feature column. 'cull' switches culling on or off; 'cull_threshold' is only
# supplied for the columns where culling is enabled.
chain_params = {
    'has_opus': dict(cull=False),
    'is_arrangement': dict(cull=False),
    # work_type: a cull_threshold of 500 appears here only as a disabled alternative.
    'work_type': dict(cull=False),
    'composer_country': dict(cull=True, cull_threshold=150),
    'composer_birth_century': dict(cull=True, cull_threshold=300),
    'composer_concert_selections': dict(cull=False),
    'soloist_type': dict(cull=True, cull_threshold=300),
    'selection_performances': dict(cull=False),
    'percent_after_intermission_bin': dict(cull=False),
    'avg_percent_of_concert_bin': dict(cull=False),
}

In [24]:
# Number of previous items each chain's state tuple holds.
state_size = 3

# Build one Chain per configured column from the corresponding df column, then
# overwrite the matching score_df column with that chain's
# transform_scoring_series output.
chains = {}
for col, params in chain_params.items():
    chain = Chain(df[col], state_size=state_size, **params)
    chains[col] = chain
    score_df[col] = chain.transform_scoring_series(score_df[col])

In [25]:
def update_state(state, state_size, selection):
    """Slide every per-column state window forward by one selection.

    The row of the module-level ``score_df`` labelled ``selection.id`` supplies
    the new value for each key of ``state``. Each window drops its oldest
    entry and gains the new value at the end, so its length is unchanged.

    ``state`` is modified in place and also returned. ``state_size`` is
    accepted but not used by this function.
    """
    features = score_df.loc[selection.id]
    for key, window in state.items():
        state[key] = window[1:] + (features[key],)
    return state

def reset_state(chains, state_size):
    """Return a fresh state dict with one entry per key of ``chains``.

    Every value is a tuple of ``state_size`` copies of the '___BREAK__' token.
    """
    initial_window = ('___BREAK__',) * state_size
    # Sharing one tuple across keys is safe because tuples are immutable.
    return dict.fromkeys(chains.keys(), initial_window)

In [59]:
# Open questions / TODO:
#   * score_df is currently unweighted...right?
#   * need to train backwards
#   * chain-specific options (no culling for work and selection popularity)
#   * allow different state size per chain?

BREAK_ID = 999999      # index label of the synthetic '___BREAK__' row in score_df
INTERMISSION_ID = 4    # selection id this loop treats as the intermission marker
# Forwarded to Series.sample(random_state=...). None keeps the original
# unseeded behaviour; assign a numpy RandomState instance for a reproducible run.
RANDOM_STATE = None


def _draw(candidates):
    """Sample one row label from ``candidates``, weighted by its own values.

    The label is returned as a plain Python int so that it can be compared
    with ``==`` and handed to the database lookup directly, instead of relying
    on the truthiness of a one-element pandas Index.
    """
    picked = candidates.sample(weights=candidates, random_state=RANDOM_STATE)
    return int(picked.index[0])


state = reset_state(chains, state_size)
scorecols = [col + '_score' for col in df.columns]

# Relative importance of each column's chain score in the final score.
weights = {
    'has_opus': 1.0,
    'is_arrangement': 1.0,
    'work_type': 5.0,
    'composer_country': 10.0,
    'composer_birth_century': 10.0,
    'composer_concert_selections': 10.0,
    'soloist_type': 10.0,
    'selection_performances': 20.0,
    'percent_after_intermission_bin': 20.0,
    'avg_percent_of_concert_bin': 10.0
}
total_weight = sum(weights.values())  # loop-invariant normaliser

end = False
has_intermission = False
while not end:
    # Score every candidate row against the current state of each chain.
    for col in df.columns:
        score_df[col + '_score'] = chains[col].score_series(score_df[col], state[col]).values * weights[col]

    score_df['final_score'] = score_df[scorecols].sum(axis=1) / total_weight

    # A row is usable only when none of its per-column scores is exactly zero.
    selection_usable = (score_df[scorecols] == 0).sum(1) == 0
    selection_series = score_df.loc[selection_usable].final_score
    selection_idx = _draw(selection_series)

    if selection_idx == INTERMISSION_ID and has_intermission:
        # Only one intermission per program. Redraw from the usable rows with
        # both the intermission and the break row removed. This yields the same
        # distribution as resampling until neither comes up, but cannot spin
        # forever when those are the only usable rows.
        excluded = selection_series.index.isin([INTERMISSION_ID, BREAK_ID])
        redraw_pool = selection_series[~excluded]
        if redraw_pool.empty:
            # Nothing else can follow, so finish the program.
            selection_idx = BREAK_ID
        else:
            selection_idx = _draw(redraw_pool)

    if selection_idx == BREAK_ID:
        print('--- End ---')
        state = reset_state(chains, state_size)
        end = True
        continue

    selection = s.query(Selection).get(selection_idx)
    state = update_state(state, state_size, selection)

    if selection_idx == INTERMISSION_ID:
        # First intermission: it advances the state but prints as a marker.
        has_intermission = True
        print('--- Intermission ---')
    else:
        print(selection)

<Selection 6002: Selection(s) from <Work 5596: ENCHANTRESS, THE by <Composer 164: Tchaikovsky, Pyotr Ilyich>>>
--- Intermission ---
<Selection 6338: Full work of <Work 5915: CONCERTO, PIANO, NO. 2, OP. 16, G MINOR by <Composer 674: Prokofiev, Sergei>>>
<Selection 4800: Selection(s) from <Work 4473: FACADE [FAÇADE] by <Composer 953: Walton, William>>>
<Selection 3623: Full work of <Work 3396: RHAPSODY IN BLUE, PIANO, ORCHESTRA by <Composer 769: Gershwin, George>>>
<Selection 499: Full work of <Work 483: SYMPHONY NO. 4, E MINOR, OP.50 (ARCADIAN) by <Composer 41: Bristow, George Frederick>>>
<Selection 8688: Selection(s) from <Work 8075: SONGS OF SEPARATION by <Composer 914: Still, William Grant>>>
<Selection 3845: Full work of <Work 3600: PICKWICK PAPERS, THE by <Composer 849: Coates, Albert>>>
--- End ---
