In [85]:
import pandas as pd
import numpy as np

# example of a super learner using the mlens library
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from mlens.ensemble import SuperLearner


In [91]:
def get_models():
	models = list()
	models.append(MultinomialNB())
	models.append(BernoulliNB())
	# Commented out to save time:
	models.append(LogisticRegression(solver = 'liblinear', max_iter = 200))
	# models.append(DecisionTreeClassifier())
	# models.append(SVC(gamma = 'scale', probability = True))
	# models.append(GaussianNB())
	# models.append(KNeighborsClassifier())
	# models.append(AdaBoostClassifier())
	# models.append(BaggingClassifier())
	# models.append(RandomForestClassifier())
	# models.append(ExtraTreesClassifier())
	return models

In [93]:
seed = 2017
np.random.seed(seed)

def f1(y, p): return f1_score(y, p, average='micro')

# create the super learner
def get_super_learner(X):
	ensemble = SuperLearner(scorer=f1, folds=5, shuffle=True, random_state = seed)
	# add base models
	models = get_models()
	ensemble.add(models)
	# add the meta model
	ensemble.add_meta(LogisticRegression(max_iter = 400))
	return ensemble
 

In [94]:
# load press release data
train_data = pd.read_csv("../train_data.csv", index_col = 0)
train_labels = pd.read_csv("../train_labels.csv", index_col = 0)

X = train_data.values
# y = np.asarray([int(i) for i in r.train_labels])[idx]
y = np.asarray([int(i) for i in train_labels.values])

In [95]:
# create the inputs and outputs

# split
train, test, train_val, test_val = train_test_split(X, y, test_size = 0.20) # test_size = 0.50


In [96]:
print('Train', train.shape, train_val.shape, 'Test', test.shape, test_val.shape)

Train (2193, 3386) (2193,) Test (549, 3386) (549,)


In [113]:
# create the super learner
ensemble = get_super_learner(train)

# fit the super learner
ensemble.fit(train[:1000], train_val[:1000])

# summarize base learners
print(ensemble.data)

# make predictions on hold out set
test_hat = ensemble.predict(test)

print('Super Learner: %.3f' % (accuracy_score(test_val, test_hat) * 100))

TypeError: get_super_learner() got an unexpected keyword argument 'sample_size'

In [108]:
preds = ensemble.predict(train[:1000])

In [111]:
accuracy_score(train_val[:1000], preds)

In [112]:
vars(ensemble)

{'shuffle': True,
 'random_state': 2017,
 'scorer': <function __main__.f1(y, p)>,
 '_model_selection': False,
 '_verbose': False,
 'layers': [Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
     name='layer-1', propagate_features=None, raise_on_exception=True,
     random_state=9787, shuffle=True,
     stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
     indexer=FoldIndex(X=None, folds=5, raise_on_exception=True),
     learners=[Learner(attr='predict', backend='threading', dtype=<class 'numpy.float32'>,
      estimator=BernoulliNB(),
      indexer=FoldIndex(X=None, folds=5, raise_on_excep...n f1 at 0x7fa80fa0f400>)],
     n_jobs=-1, name='group-10', raise_on_exception=True, transformers=[])],
     verbose=0),
  Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
     name='layer-2', propagate_features=None, raise_on_exception=True,
     random_state=9787, shuffle=True,
     stack=[Group(backend='threading', dtype=<class 'numpy.floa

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

clf = BaggingClassifier(base_estimator=SVC(),).fit(train, train_val)

In [195]:
clfhat = clf.predict(test)

In [196]:
accuracy_score(y_val, clfhat) * 100

50.27322404371585

## Multinomial Textmodel

In [192]:
mnb = MultinomialNB()
y_pred = mnb.fit(X, y).predict(X_val)

In [193]:
accuracy_score(y_val, y_pred)

0.6612021857923497

In [69]:
print("Number of mislabeled points out of a total %d points : %d" % (y.shape[0], (y_val != y_pred).sum()))

Number of mislabeled points out of a total 549 points : 201


In [71]:
(y.shape[0] - (y_val != y_pred).sum()) / y.shape[0]

0.6338797814207651

In [None]:
accuracy_score(y_pred, clfhat) * 100