Read in our training export and index it by (`concert_id`, `selection_id`)

In [1]:
import pandas as pd
# Load the gzipped, tab-separated training export and key every row by the
# (concert_id, selection_id) pair in a single chained expression.
df = pd.read_csv(
    '../data/train_export.txt.gz',
    compression='gzip',
    sep='\t',
).set_index(['concert_id', 'selection_id'])

Import ChainEnsemble and set our params per export column

In [2]:
from nyp.markov import ChainEnsemble, ChainEnsembleScorer

In [3]:
# Per-column chain settings, keyed by export column name. Each entry carries
# whichever of `cull`, `cull_threshold` and `state_size` that column overrides.
# NOTE(review): columns with an empty (or partial) config presumably fall back
# to `base_params` below — confirm against ChainEnsemble.
chain_params = dict(
    full_work=dict(cull=False, state_size=1),
    has_opus=dict(cull=False, state_size=2),
    is_arrangement=dict(cull=False, state_size=1),
    composer_country=dict(cull=True, cull_threshold=150, state_size=3),
    composer_concert_selections=dict(cull=False, state_size=2),
    soloist_type=dict(cull=True, cull_threshold=300, state_size=3),
    selection_performances=dict(cull=False, state_size=3),
    work_type=dict(),
    composer_birth_century=dict(cull=True, cull_threshold=300),
    percent_after_intermission_bin=dict(),
    avg_percent_of_concert_bin=dict(),
)

# Base chain configuration handed to the ensemble alongside the per-column one.
base_params = dict(cull=False, state_size=4)

Train the ensemble model

In [4]:
# Build the ensemble from the base config plus the per-column overrides, then
# train it on the indexed export using 4 parallel jobs. The trailing semicolon
# suppresses the cell's output.
model = ChainEnsemble(
    base_chain_config=base_params,
    chain_configs=chain_params,
)
model.train(df, n_jobs=4);

Save the model to use elsewhere

In [5]:
import pickle
# Persist the trained model so it can be reloaded elsewhere. The context
# manager guarantees the file handle is flushed and closed, even if pickling
# raises part-way through.
with open('../data/model_v1.p', 'wb') as model_file:
    pickle.dump(model, model_file)

Create a scoring object and generate a prediction using only the five columns below, weighted as listed

In [6]:
# Wrap the trained ensemble in a scorer, which is used below to generate a program.
scorer = ChainEnsembleScorer(model)

In [7]:
# Weight applied to each export column when generating a program; only these
# five columns are passed to the scorer.
prediction_weights = dict(
    work_type=3.0,
    composer_country=1.0,
    composer_birth_century=1.0,
    soloist_type=2.0,
    percent_after_intermission_bin=4.0,
)

In [8]:
# Generate a program from the weighted columns; as the cell's last expression,
# the returned value is displayed as the cell output.
# NOTE(review): the meaning of break_weight is not visible in this notebook —
# confirm against ChainEnsembleScorer.generate_program.
scorer.generate_program(prediction_weights, break_weight=10)

[8125, 4386, 4, 564, 1406]