In [49]:
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from skopt import BayesSearchCV
import numpy as np
import pickle

# Read the cached four-item payload: train embeddings, test embeddings,
# train labels and test labels (in that order).
# NOTE: pickle can execute arbitrary code while loading, so this file must
# come from a trusted source.
with open('../out/embeddings.pkl', 'rb') as pkl_file:
    payload = pickle.load(pkl_file)
embeddings_train, embeddings_test, y_train, y_test = payload

# NumPy versions of the training data, so the cross-validation code below can
# select rows with integer fold-index arrays.
X = np.array(embeddings_train)
y = y_train.to_numpy()

# Shared settings for both nested cross-validation runs below.
RANDOM_SEED = 153  # one seed for every splitter, estimator and search
N_FOLDS = 10       # number of folds in both the outer and the inner CV


def run_nested_cv(build_search, features, labels):
    """Run nested stratified cross-validation and print per-fold results.

    For each outer fold a fresh hyper-parameter search is fitted on the
    outer-training part (using its own inner stratified CV), and the refit
    best estimator is scored on the held-out outer-validation part. One
    progress line is printed per fold, followed by the mean and standard
    deviation of the outer-fold accuracies.

    Parameters
    ----------
    build_search : callable
        Called as ``build_search(inner_cv)``; must return an unfitted search
        object exposing ``fit``, ``best_estimator_``, ``best_score_`` and
        ``best_params_`` (e.g. ``GridSearchCV`` or ``BayesSearchCV``).
    features : numpy.ndarray
        Training samples, indexable by an integer index array.
    labels : numpy.ndarray
        Training labels aligned with ``features``.

    Returns
    -------
    tuple[list, list]
        ``(fold_scores, fold_models)``: the outer-fold accuracies and the
        corresponding best estimators, in fold order.
    """
    fold_scores = []
    fold_models = []

    outer_cv = StratifiedKFold(n_splits=N_FOLDS, random_state=RANDOM_SEED, shuffle=True)
    for train_ix, test_ix in outer_cv.split(features, labels):
        # split data
        train_x, validate_x = features[train_ix], features[test_ix]
        train_y, validate_y = labels[train_ix], labels[test_ix]
        # the inner CV only ever sees the outer-training part of this fold
        inner_cv = StratifiedKFold(n_splits=N_FOLDS, random_state=RANDOM_SEED, shuffle=True)
        # build and execute the search
        search = build_search(inner_cv).fit(train_x, train_y)
        # evaluate the best performing model on the hold out part
        accuracy = search.best_estimator_.score(validate_x, validate_y)
        fold_scores.append(accuracy)
        fold_models.append(search.best_estimator_)
        # report progress
        print('>acc=%.3f, est=%.3f, cfg=%s' % (accuracy, search.best_score_, search.best_params_))

    # summarize the estimated performance of the model
    print('Accuracy: %.3f (%.3f)' % (np.mean(fold_scores), np.std(fold_scores)))
    return fold_scores, fold_models


def make_svm_search(inner_cv):
    """Return an unfitted Bayesian search over C for a scaled LinearSVC."""
    pipe_svm = make_pipeline(StandardScaler(), LinearSVC(random_state=RANDOM_SEED))
    return BayesSearchCV(
        pipe_svm,
        {'linearsvc__C': (1e-6, 1e+6, 'log-uniform')},
        cv=inner_cv,
        n_jobs=-1,
        # Seed the optimizer as well; without it the sampled C values (and
        # therefore the reported scores) differ from run to run.
        random_state=RANDOM_SEED,
    )


def make_knn_search(inner_cv):
    """Return an unfitted exhaustive grid search for KNeighborsClassifier."""
    grid_params = {
        'n_neighbors': [3, 5, 11, 19],
        'weights': ['uniform', 'distance'],
        'metric': ['manhattan', 'euclidean'],
    }
    return GridSearchCV(
        KNeighborsClassifier(),
        grid_params,
        cv=inner_cv,
        n_jobs=-1,
    )


def score_best_fold_model(fold_scores, fold_models, test_x, test_y):
    """Score the model from the highest-scoring outer fold on the test set.

    ``np.argmax`` returns the first maximum, so ties resolve to the earliest
    fold. The chosen model was fitted on the outer-training part of that one
    fold only, not on the full training set.
    """
    best_fold = int(np.argmax(fold_scores))
    return fold_models[best_fold].score(test_x, test_y)


print('Linear Support Vector Classifier')

outer_results, best_model = run_nested_cv(make_svm_search, X, y)
print(score_best_fold_model(outer_results, best_model, embeddings_test, list(y_test)))

print('K Nearest Neighbours Classifier')

outer_results, best_model = run_nested_cv(make_knn_search, X, y)
print(score_best_fold_model(outer_results, best_model, embeddings_test, list(y_test)))

Linear Support Vector Classifier




>acc=0.850, est=0.820, cfg=OrderedDict([('linearsvc__C', 0.0006006520837316877)])




>acc=0.788, est=0.831, cfg=OrderedDict([('linearsvc__C', 0.0012929018682259253)])




>acc=0.832, est=0.823, cfg=OrderedDict([('linearsvc__C', 0.0007181391054191661)])




>acc=0.848, est=0.828, cfg=OrderedDict([('linearsvc__C', 0.0011812324996890674)])




>acc=0.804, est=0.834, cfg=OrderedDict([('linearsvc__C', 0.0007736907799804864)])




>acc=0.830, est=0.820, cfg=OrderedDict([('linearsvc__C', 0.0006220197106329326)])




>acc=0.839, est=0.823, cfg=OrderedDict([('linearsvc__C', 0.0007593276875726514)])




>acc=0.804, est=0.838, cfg=OrderedDict([('linearsvc__C', 0.001006088140006706)])




>acc=0.786, est=0.816, cfg=OrderedDict([('linearsvc__C', 0.00022055408313825496)])




>acc=0.830, est=0.825, cfg=OrderedDict([('linearsvc__C', 0.0008423422069955893)])
Accuracy: 0.821 (0.023)
0.8266666666666667
K Nearest Neighbours Classifier
>acc=0.788, est=0.782, cfg={'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
>acc=0.752, est=0.788, cfg={'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
>acc=0.770, est=0.780, cfg={'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
>acc=0.768, est=0.792, cfg={'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
>acc=0.777, est=0.791, cfg={'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
>acc=0.777, est=0.793, cfg={'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
>acc=0.768, est=0.792, cfg={'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
>acc=0.777, est=0.784, cfg={'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
>acc=0.777, est=0.798, cfg={'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
>acc=0.821, est=

In [4]:
# Scratch cell: build a two-entry dict and print its bound `get` method.
# `get` is referenced but not called, so the output is the method object's
# repr rather than a looked-up value.
diction = dict(something=1, what=2)
print(diction.get)

<built-in method get of dict object at 0x109631a00>
