In [1]:
# database imports
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session
from nyp.models import Selection

from dotenv import load_dotenv
from os import getenv

# load_dotenv() runs before the lookup below, presumably so MYSQL_CON_DEV can
# be supplied by a local .env file -- confirm against the project setup.
load_dotenv()

# Build the engine from the connection string held in MYSQL_CON_DEV, then wrap
# a session factory bound to it in a scoped_session registry.
engine = create_engine(getenv('MYSQL_CON_DEV'))
Session = scoped_session(sessionmaker(engine))

Read in our export set and set its index appropriately

In [2]:
import pandas as pd
# Load the gzip-compressed, tab-separated training export and index each row
# by the (concert_id, selection_id) pair.
df = pd.read_csv(
    '../data/train_export.txt.gz',
    compression='gzip',
    sep='\t',
).set_index(['concert_id', 'selection_id'])

Import ChainEnsemble and set our params per export column

In [3]:
from nyp.markov import ChainEnsemble, ChainEnsembleScorer

In [4]:
# Configuration passed to ChainEnsemble as `base_chain_config`.
base_params = {'cull': False, 'state_size': 4}

# Per-column configuration passed to ChainEnsemble as `chain_configs`, keyed by
# export column name. Empty dicts carry no column-specific settings
# (presumably they fall back to `base_params` -- confirm in nyp.markov).
chain_params = {
    # Columns with culling disabled and an explicit state size.
    'full_work': {'cull': False, 'state_size': 1},
    'has_opus': {'cull': False, 'state_size': 2},
    'is_arrangement': {'cull': False, 'state_size': 1},
    # Culling enabled with a threshold of 150.
    'composer_country': {'cull': True, 'cull_threshold': 150},
    'composer_concert_selections': {'cull': False, 'state_size': 2},
    # Culling enabled with a threshold of 300.
    'soloist_type': {'cull': True, 'cull_threshold': 300},
    'selection_performances': {'cull': False, 'state_size': 3},
    'work_type': {},
    'composer_birth_century': {'cull': True, 'cull_threshold': 300},
    'percent_after_intermission_bin': {},
    'avg_percent_of_concert_bin': {},
}

Train the ensemble model

In [6]:
# Build the ensemble from the per-column configs and the base config.
model = ChainEnsemble(
    chain_configs=chain_params,
    base_chain_config=base_params,
)

# Train on the indexed export with n_jobs=4; the trailing semicolon keeps the
# notebook from echoing the call's return value.
model.train(df, n_jobs=4);

Save the trained model so it can be used elsewhere

In [7]:
import pickle
# Persist the trained ensemble to disk. The context manager ensures the file
# handle is flushed and closed even if pickling raises, instead of leaving an
# open handle behind for the lifetime of the kernel.
with open('../data/model_v1.p', 'wb') as model_file:
    pickle.dump(model, model_file)

Create a scoring object and make a prediction using only the columns weighted below (four are active; `percent_after_intermission_bin` is commented out)

In [9]:
# Wrap the trained ensemble in a ChainEnsembleScorer; the prediction cell below
# calls generate_program() on this object.
scorer = ChainEnsembleScorer(model)

In [11]:
from nyp.markov import rescaled_power_weight, sum_weighted_log_odds

In [14]:
# Feature columns taking part in this prediction, mapped to their weights.
# Disabled for this run: 'percent_after_intermission_bin' with weight 2.5.
prediction_weights = {
    'work_type': 25.0,
    'composer_country': 1.5,
    'composer_birth_century': 8.5,
    'soloist_type': 15.0,
}

# Generate a program from the scorer. `summary_step` is set to
# rescaled_power_weight; sum_weighted_log_odds is the imported alternative.
program = scorer.generate_program(
    case_weight_exponent=0.5,
    weighted_average_exponent=2.5,
    feature_weights=prediction_weights,
    break_weight=1,
    summary_step=rescaled_power_weight,
)

# Look up each generated entry as a Selection by primary key and print it.
for selection_key in program:
    print(Session.query(Selection).get(selection_key))

<Selection 455: Full work of <Work 441: CONCERTO, PIANO, NO. 4, D MINOR, OP. 70 by <Composer 98: Rubinstein, Anton>>>
<Selection 4: Full work of <Work 4: Intermission by <Composer 4: No Composer>>>
<Selection 896: Full work of <Work 860: SYMPHONIC FUGUE, OP.8 by <Composer 234: Koch, Friedrich E.>>>
