In [1]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from nyp.models import Selection

# Open a SQLAlchemy session on the raw SQLite database.
# The URL is relative, so it resolves against the kernel's working directory.
engine = create_engine('sqlite:///../data/raw.db')
Session = sessionmaker(bind=engine)
s = Session()

In [5]:
from nyp.markov import Chain
import pandas as pd

# Load the tab-separated test data; the first two columns form the row index.
df = pd.read_csv('../data/testdata_20190125.txt', sep='\t', index_col=[0, 1])

# Replace missing categorical values with an explicit placeholder label,
# one placeholder per column.
null_placeholders = {
    'composer_country': 'Unknown',
    'composer_death_century': 'Unknown',
    'work_type': 'Other',
}
for column, placeholder in null_placeholders.items():
    df.loc[df[column].isnull(), column] = placeholder

In [6]:
# Build the scoring frame: one row per distinct selection_id, with the
# concert_id level discarded. Rebuilt from `df` each time, so re-running this
# cell is idempotent.
score_df = (
    df.reset_index()
      .drop(columns='concert_id')
      .drop_duplicates('selection_id')
      .set_index('selection_id')
      .copy()
)

# Sentinel row whose every column holds the break token, stored under index
# label 999999.
break_row = pd.Series('___BREAK__', index=score_df.columns, name=999999)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. rename_axis keeps the 'selection_id' index name, which concat
# would otherwise drop because the one-row frame's index is unnamed.
score_df = pd.concat(
    [score_df, break_row.to_frame().T.rename_axis(score_df.index.name)]
)

In [21]:
# Fit one Chain per feature column and re-encode the matching score_df column
# with that chain.
state_size = 3  # length of the per-column state tuples used by the cells below
chains = {}
for col in df.columns:
    # cull_threshold=200 is forwarded to Chain as-is; its meaning is defined in nyp.markov.
    chain = Chain(df[col], state_size=state_size, cull_threshold=200)
    chains[col] = chain
    score_df[col] = chain.transform_scoring_series(score_df[col])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [22]:
def update_state(state, state_size, selection):
    """Slide every per-column state window forward by one selection.

    For each key in ``state`` the oldest entry of its tuple is dropped and the
    value found in the global ``score_df`` at row ``selection.id`` (same key
    as column) is appended, so each tuple keeps its length.

    Args:
        state: dict mapping column name -> tuple of recent values. It is
            modified in place.
        state_size: accepted for signature compatibility; not used here.
        selection: object exposing an ``id`` attribute that is a row label
            of ``score_df``.

    Returns:
        The same ``state`` dict that was passed in.
    """
    row = score_df.loc[selection.id]
    # Only existing keys are reassigned, so iterating the dict while updating
    # it is safe (its size never changes).
    for key, window in state.items():
        state[key] = window[1:] + (row[key],)
    return state

def reset_state(chains, state_size):
    """Return a fresh state dict with every window filled with break tokens.

    Args:
        chains: mapping whose keys name the state entries to create.
        state_size: number of '___BREAK__' tokens in each tuple.

    Returns:
        dict mapping each key of ``chains`` to a ``state_size``-long tuple of
        '___BREAK__'.
    """
    blank_window = ('___BREAK__',) * state_size
    # Tuples are immutable, so sharing one instance across keys is safe.
    return dict.fromkeys(chains.keys(), blank_window)

In [35]:
# Open questions / TODO carried over from development:
#   - score_df is currently unweighted...right?
#   - need to train backwards
#   - chain-specific options (no culling for work and selection popularity)

BREAK_ID = 999999      # index label of the '___BREAK__' sentinel row in score_df
INTERMISSION_ID = 4    # selection id that this loop reports as the intermission


def draw_selection_id(scores):
    """Draw one index label from ``scores``, weighted by the scores themselves.

    ``Series.sample(...).index`` is a one-element Index, not a scalar, so the
    label is unwrapped and converted to a plain Python int. That keeps the
    ``==`` checks below ordinary boolean comparisons and gives the database
    lookup a built-in int rather than a pandas/numpy object.
    """
    return int(scores.sample(n=1, weights=scores).index[0])


state = reset_state(chains, state_size)
scorecols = [col + '_score' for col in df.columns]

# Relative importance of each feature column in the combined score.
weights = {
    'has_opus': 1,
    'is_arrangement': 1,
    'work_type': 2,
    'composer_country': 2,
    'composer_death_century': 2,
    'composer_concert_selections': 2,
    'soloist_type': 2
}
total_weight = sum(weights.values())  # loop-invariant normaliser

end = False
has_intermission = False
while not end:
    # Re-score every candidate row against the current per-column state.
    for col in df.columns:
        score_df[col + '_score'] = chains[col].score_series(score_df[col], state[col]).values * weights[col]

    score_df['final_score'] = score_df[scorecols].sum(axis=1) / total_weight

    # A row is usable only if none of its per-column scores is exactly zero.
    selection_usable = (score_df[scorecols] == 0).sum(1) == 0
    selection_series = score_df.loc[selection_usable].final_score
    selection_idx = draw_selection_id(selection_series)

    if selection_idx == INTERMISSION_ID and has_intermission:
        # Only one intermission per program: draw again from the candidates
        # that are neither the intermission nor the break row. Sampling the
        # filtered series has the same distribution as rejecting and retrying,
        # but cannot spin forever when nothing else is usable; in that case
        # the program simply ends.
        selection_series = selection_series.drop([INTERMISSION_ID, BREAK_ID], errors='ignore')
        if selection_series.empty:
            selection_idx = BREAK_ID
        else:
            selection_idx = draw_selection_id(selection_series)

    if selection_idx == BREAK_ID:
        print('--- End ---')
        state = reset_state(chains, state_size)
        end = True
        continue

    selection = s.query(Selection).get(selection_idx)
    state = update_state(state, state_size, selection)
    if selection_idx == INTERMISSION_ID:
        has_intermission = True
        print('--- Intermission ---')
    else:
        print(selection)

<Selection 9791: Full work of <Work 9095: FANFARE by <Composer 1944: Sheng, Bright>>>
--- Intermission ---
<Selection 6872: Full work of <Work 6409: OVERTURE FOR AN ENGLISH OPERA by <Composer 15: Haydn, Franz Joseph>>>
<Selection 190: Full work of <Work 186: SYMPHONY NO. 2, C MAJOR, OP. 61 by <Composer 51: Schumann, Robert>>>
<Selection 3099: Selection(s) from <Work 2907: MORNING, NOON, AND NIGHT IN VIENNA by <Composer 315: Suppe, Franz Von>>>
--- End ---
