In [5]:
from model_evaluation import *
from model_train import *

# fixed seed so cross-validation sampling is repeatable
seed = 0

# scorer produced by make_scorer from the balanced_accuracy function
scorer = make_scorer(balanced_accuracy)

# resolve the CSV locations first, then read each experiment dataset
dnames = ['wine', 'seismic']
p_wine = get_abspath('winequality.csv', 'data/experiments')
p_seismic = get_abspath('seismic-bumps.csv', 'data/experiments')
df_wine, df_seismic = map(pd.read_csv, (p_wine, p_seismic))

# datasets addressable by the names listed in dnames
dfs = dict(zip(dnames, (df_wine, df_seismic)))

# load pickled models
# One estimator file exists per (model, dataset) pair, so keep them in a nested
# mapping, estimators_by_data[dataset][model]. Keying by model name alone would
# let each dataset overwrite the models loaded for the previous one.
# NOTE(review): load_pickled_model presumably unpickles the file -- only point
# it at trusted model files.
mnames = ['KNN', 'DT']
estimators_by_data = {}
for dname in dnames:
    estimators_by_data[dname] = {}
    for m in mnames:
        mfile = '{}/{}_best_estimator.pkl'.format(m, dname)
        model = load_pickled_model(get_abspath(mfile, filepath='models'))
        estimators_by_data[dname][m] = model

# Flat name -> model view kept for later cells that look a model up by name
# only; it holds the models belonging to the last dataset in dnames.
estimators = dict(estimators_by_data[dnames[-1]])

# generate learning curves
for dname in dnames:
    # get train and test splits
    X_train, X_test, y_train, y_test = split_data(dfs[dname], seed=seed)
    # walk mnames (not the dict) so the curves are produced in a fixed order,
    # and pair every dataset with the estimator that was saved for it
    for name in mnames:
        estimator = estimators_by_data[dname][name]
        create_learning_curve(estimator, scorer, X_train, y_train,
                              data_name=dname, clf_name=name)

In [None]:
# Learning curve for a single classifier / dataset pair with 5-fold CV.
clf_name = 'DT'
data_name = 'seismic'

# look the model up through clf_name so editing the name above is sufficient
estimator = estimators[clf_name]

# Rebuild the split for data_name here rather than relying on whichever
# X_train / y_train a previously executed cell happened to leave in the kernel.
X_train, X_test, y_train, y_test = split_data(dfs[data_name], seed=seed)

create_learning_curve(estimator, scorer, X_train, y_train,
                      data_name=data_name, clf_name=clf_name, cv=5)

In [None]:
# Measure how fit and predict wall-clock time change with the training fraction.
clf_name = 'DT'
data_name = 'seismic'
# select the model and data through the names above instead of repeating
# the literals, so the two cannot drift apart
estimator = estimators[clf_name]
dataset = dfs[data_name]

# set training sizes and intervals
train_sizes = np.arange(0.01, 1.0, 0.025)

# initialise variables
train_time = []
predict_time = []
df_final = []  # rows of [train fraction, fit time, predict time]

# iterate through training sizes and capture training and predict times
for train_data in train_sizes:
    test_fraction = 1 - train_data
    # single-argument print(...) emits the same text on Python 2 and Python 3
    print("%s %s" % ("Train size is: ", train_data))
    print("%s %s" % ("Test size is:", test_fraction))
    # pass the notebook seed so every run times the same splits
    X_train, X_test, y_train, y_test = split_data(
        dataset, test_size=test_fraction, seed=seed)
    start_train = timeit.default_timer()
    estimator.fit(X_train, y_train)
    end_train = timeit.default_timer()
    estimator.predict(X_test)
    end_predict = timeit.default_timer()
    # predict starts right after end_train is taken, so end_train doubles as
    # the start timestamp of the predict measurement
    fit_seconds = end_train - start_train
    predict_seconds = end_predict - end_train
    train_time.append(fit_seconds)
    predict_time.append(predict_seconds)
    df_final.append([train_data, fit_seconds, predict_seconds])

In [None]:
# show the collected timings as a labelled table instead of a bare nested list;
# df_final itself is left untouched (still a list of 3-element rows)
pd.DataFrame(df_final, columns=['train_size', 'train_time', 'predict_time'])