- (Gaussian) Multinomial Naive Bayes with Grid Search
- Logistic Regression
- Support Vector Machines
- Decision Tree
- Random Forest & Bagging Classifier
- Boosting XGBoost
- Neural Nets



In [28]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import sklearn.datasets as datasets
from sklearn.kernel_approximation import Nystroem
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings(action='once')

In [3]:
from LanguageModels.Word2Vec import Word2Vec
from Preprocessing.LemmatizerPreprocessor import LemmatizerPreprocessor
from Preprocessing.DataLoader import DataLoader

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Load Dataset

In [None]:
file = open("../data/processed/data.p",'rb')
data = pickle.load(file)
file.close()

file = open("../data/processed/labels.p",'rb')
labels = pickle.load(file)
file.close()

In [None]:
label = labels[:,0]
print('embeddings: {}'.format(data.shape))
print('label: {}'.format(label.shape))

In [None]:
# get rid of Nones
# for i in range(label.shape[0]):
#     print(label[i])
#     if label[i] == None:
#         print(i)
# 299 -> 448

In [None]:
label = np.concatenate((label[:298], label[449::]), 0)
label=label.astype('int')
data = np.concatenate((data[:298], data[449::]), 0)
print('embeddings: {}'.format(data.shape))
print('label: {}'.format(label.shape))

### Support Vector Machine: Multi-Class Classification

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=42, stratify=label)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)

### Test on Training Data

In [None]:
pred_train = clf.predict(X_train)
accuracy_score(y_train, pred_train)

In [None]:
confusion_matrix(y_train, pred_train)

### Test on Testing Data

In [None]:
pred_test = clf.predict(X_test)
accuracy_score(y_test, pred_test)

In [None]:
confusion_matrix(y_test, pred_test)

## Linear Regression & Evaluation of Different Types of Regularization

## Vanilla Linear Regression (No Regularization)

In [None]:
reg = LinearRegression().fit(X_train, y_train)
print("Training Accuracy: " + str(reg.score(X_train, y_train)))
print("Training Accuracy: " + str(reg.score(X_test, y_test)))

## Ridge Regression (L2 Regularization)

In [None]:
parameters = {'alpha':[1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5, 1, 2, 5, 10]}
rid = Ridge()
clf = GridSearchCV(rid, parameters, cv=10, verbose=1)
clf.fit(X_train, y_train)
print("Training Accuracy: " + str(clf.score(X_train, y_train)))
print("Testing Accuracy: " + str(clf.score(X_test, y_test)))
print(clf.best_params_)

## Lasso Regression (L1 Regularization)

In [None]:
parameters = {'alpha':[1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5, 1, 2, 5, 10, 100]}

las = Lasso(max_iter = 10000)
clf = GridSearchCV(las, parameters, cv=10, verbose=1)
clf.fit(X_train, y_train)
print("Training Accuracy: " + str(clf.score(X_train, y_train)))
print("Testing Accuracy: " + str(clf.score(X_test, y_test)))
print(clf.best_params_)

## ElasticNet Regression (L1 + L2 Regularization)

In [None]:
parameters = {'alpha':[1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5, 1, 2, 5, 10, 100], 'l1_ratio':[0.05, 0.1, 0.15, 0.25, 0.5, 0.75, 1]}

las = ElasticNet(max_iter = 10000)
clf = GridSearchCV(las, parameters, cv=10, verbose=1)
clf.fit(X_train, y_train)
print("Training Accuracy: " + str(clf.score(X_train, y_train)))
print("Testing Accuracy: " + str(clf.score(X_test, y_test)))
print(clf.best_params_)

## Train/Test on Multiple Classifiers

In [12]:
# names = ["Linear SVM", "RBF SVM", "Gaussian Process", "Logistic Regression"
#          "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
#          "Naive Bayes", "Bagging Classifier"]

parameters = {
    'SVM': {'kernel': ['linear', 'rbf'], 'C':[0.01, 0.1, 1], 'gamma': ['scale']},
    'GaussianProcess': {'kernel': [1.0 * RBF(1.0)]},
    'LogisticRegression': {'random_state': [0], 'max_iter': [5000]},
    'DecisionTree': {'max_depth': [5]},
    'RandomForest': {'max_depth': [5], 'n_estimators': [10], 'max_features': [1]},
    'MLP': {'alpha': [1], 'max_iter': [5000]},
    'AdaBoost': {'n_estimators': [100]},
    'NaiveBayes': {},
    'BaggingClassifier': {'base_estimator': [SVC()], 'n_estimators': [10], 'random_state': [0]}
}

classifiers = {
    'SVM': SVC(),
    'GaussianProcess': GaussianProcessClassifier(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'MLP': MLPClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'NaiveBayes': GaussianNB(),
    'BaggingClassifier': BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)
}

score_dict = {}
for name in classifiers.keys():
    print('\n*** Training {} ***'.format(name))
    clf = GridSearchCV(classifiers[name], parameters[name], cv=5, verbose=1)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print('Best Params: {}, Test Score: {}'.format(clf.best_params_, score))
    score_dict[name] = score
print(score_dict)


*** Training SVM ***
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=T

Best Params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}, Test Score: 0.631578947368421

*** Training GaussianProcess ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


KeyboardInterrupt: 

In [None]:
#yet another logistic regression 
logReg = LogisticRegression(random_state=0, max_iter = 50000)
logReg.fit(X_train, y_train)

test_training= logReg.score(X_train, y_train)
test_testing = logReg.score(X_test, y_test)

print("training data score", test_training, "\ntesting data score", test_testing)

In [None]:
#multiclass logistic regression
multiclassLogReg = LogisticRegression(multi_class='ovr')
multiclassLogReg.fit(X_train, y_train)

multi_test_training= multiclassLogReg.score(X_train, y_train)
multi_test_testing = multiclassLogReg.score(X_test, y_test)

print("training data score", multi_test_training, "\ntesting data score", multi_test_testing)

In [None]:
# Vanilla SVM, https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))

In [None]:
# Multinomial Naive Bayes
rng = np.random.RandomState(1)
X = rng.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])
clf = MultinomialNB()
clf.fit(X, y)
print(clf.predict(X[2:3]))

In [None]:
# Gaussian NB
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])
clf = GaussianNB()
clf.fit(X, Y)
print(clf.predict([[-0.8, -1]]))

clf_pf = GaussianNB()
clf_pf.partial_fit(X, Y, np.unique(Y))
print(clf_pf.predict([[-0.8, -1]]))

In [None]:
# Logistic Regression, https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0, max_iter = 50000).fit(X, y)
print(clf.predict(X[:2, :]))

print(clf.predict_proba(X[:2, :]))
print(clf.score(X, y))

In [None]:
# Linear SVM for large datasets, https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
X, y = datasets.load_digits(n_class=9, return_X_y=True)
data = X / 16
feature_map_nystroem = Nystroem(gamma=.2,
                                random_state=1,
                                n_components=300)
data_transformed = feature_map_nystroem.fit_transform(data)

clf = make_pipeline(StandardScaler(),
                  LinearSVC(random_state=0, tol=1e-5, max_iter = 50000))
clf.fit(data_transformed, y)
clf.score(data_transformed, y)

In [None]:
# Random Forest Classification, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.predict([[0, 0, 0, 0]]))

In [None]:
# Decision Tree Classifier, https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
iris = load_iris()
cross_val_score(clf, iris.data, iris.target, cv=10)

In [None]:
# Bagging Classifier, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
X, y = make_classification(n_samples=100, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
clf = BaggingClassifier(base_estimator=SVC(),
                        n_estimators=10, random_state=0).fit(X, y)
clf.predict([[0, 0, 0, 0]])

In [None]:
# Gradient Boosting Classifier, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
      max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# MLP Classifier, https://scikit-learn.org/stable/modules/neural_networks_supervised.html
X = [[0., 0.], [1., 1.]]
y = [0, 1]
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(X, y)
clf.predict([[2., 2.], [-1., -2.]])

### Using data pipeline for binary logistic regression

In [29]:
#Using data pipeline

data = DataLoader('../data/EMNLP2020.csv').load()

# Load preprocessor
lp = LemmatizerPreprocessor()

In [30]:
w2v = Word2Vec(path='../data/glove.6B/glove.6B.200d.txt')

In [31]:
# binary case
X, y = w2v.featurize(data, lp, mode='binary')

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1061, 200) (266, 200) (1061, 1) (266, 1)


In [33]:
#logistic regression with binary labels

logReg = LogisticRegression(random_state=0, max_iter = 50000)
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
logReg.fit(X_train, y_train)

test_training= logReg.score(X_train, y_train)
test_testing = logReg.score(X_test, y_test)

y_pred = logReg.predict(X_test)

print("training data score", test_training, "\ntesting data score", test_testing)
print(classification_report(y_test, y_pred))

training data score 0.9491046182846371 
testing data score 0.9135338345864662
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       126
           1       0.93      0.90      0.92       140

    accuracy                           0.91       266
   macro avg       0.91      0.91      0.91       266
weighted avg       0.91      0.91      0.91       266





### MultiClass with Negative Samples Case

In [34]:
# multiclass case
X, y = w2v.featurize(data, lp, mode='multiclass')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1061, 200) (266, 200) (1061, 1) (266, 1)


In [36]:
parameters = {
    'SVM': {'kernel': ['linear', 'rbf'], 'C':[0.01, 0.1, 1], 'gamma': ['scale']},
    'GaussianProcess': {'kernel': [1.0 * RBF(1.0)]},
    'LogisticRegression': {'random_state': [0], 'max_iter': [5000]},
    'DecisionTree': {'max_depth': [5]},
    'RandomForest': {'max_depth': [5], 'n_estimators': [10], 'max_features': [1]},
    'MLP': {'alpha': [1], 'max_iter': [5000]},
    'AdaBoost': {'n_estimators': [100]},
    'NaiveBayes': {},
    'BaggingClassifier': {'base_estimator': [SVC()], 'n_estimators': [10], 'random_state': [0]}
}

classifiers = {
    'SVM': SVC(),
    'GaussianProcess': GaussianProcessClassifier(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'MLP': MLPClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'NaiveBayes': GaussianNB(),
    'BaggingClassifier': BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)
}

score_dict = {}
for name in classifiers.keys():
    print('\n*** Training {} ***'.format(name))
    clf = GridSearchCV(classifiers[name], parameters[name], cv=5, verbose=1)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    print('Best Params: {}, Test Score: {}'.format(clf.best_params_, score))
    print(classification_report(y_test, y_pred))
    score_dict[name] = score
print(score_dict)


*** Training SVM ***
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=T

Best Params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}, Test Score: 0.6353383458646616
              precision    recall  f1-score   support

           1       0.33      0.29      0.31        14
           2       0.38      0.60      0.46        30
           3       0.00      0.00      0.00         6
           4       0.27      0.26      0.27        23
           5       0.18      0.18      0.18        11
           6       0.00      0.00      0.00        10
           7       1.00      0.20      0.33         5
           8       0.20      0.08      0.12        12
           9       0.00      0.00      0.00         2
          10       0.40      0.15      0.22        13
          11       0.84      0.96      0.90       140

    accuracy                           0.64       266
   macro avg       0.33      0.25      0.25       266
weighted avg       0.58      0.64      0.59       266


*** Training GaussianProcess ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.1min finished
  y = column_or_1d(y, warn=True)


Best Params: {'kernel': 1**2 * RBF(length_scale=1)}, Test Score: 0.650375939849624
              precision    recall  f1-score   support

           1       0.31      0.36      0.33        14
           2       0.36      0.93      0.52        30
           3       0.00      0.00      0.00         6
           4       0.50      0.09      0.15        23
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00        12
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        13
          11       0.82      0.99      0.90       140

    accuracy                           0.65       266
   macro avg       0.18      0.21      0.17       266
weighted avg       0.53      0.65      0.56       266


*** Training LogisticRegression ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.9s finished
  y = column_or_1d(y, warn=True)


Best Params: {'max_iter': 5000, 'random_state': 0}, Test Score: 0.6390977443609023
              precision    recall  f1-score   support

           1       0.50      0.29      0.36        14
           2       0.42      0.67      0.51        30
           3       0.00      0.00      0.00         6
           4       0.27      0.17      0.21        23
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        10
           7       1.00      0.20      0.33         5
           8       0.25      0.08      0.12        12
           9       0.00      0.00      0.00         2
          10       0.50      0.08      0.13        13
          11       0.76      0.99      0.86       140

    accuracy                           0.64       266
   macro avg       0.34      0.23      0.23       266
weighted avg       0.55      0.64      0.57       266


*** Training DecisionTree ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s finished
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Best Params: {'max_depth': 5}, Test Score: 0.518796992481203
              precision    recall  f1-score   support

           1       0.17      0.21      0.19        14
           2       0.26      0.60      0.36        30
           3       0.00      0.00      0.00         6
           4       0.09      0.09      0.09        23
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         5
           8       0.17      0.08      0.11        12
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        13
          11       0.78      0.81      0.79       140

    accuracy                           0.52       266
   macro avg       0.13      0.16      0.14       266
weighted avg       0.46      0.52      0.48       266


*** Training RandomForest ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params: {'max_depth': 5, 'max_f

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
  self.best_estimator_.fit(X, y, **fit_params)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.8s finished
  y = column_or_1d(y, warn=True)


Best Params: {'alpha': 1, 'max_iter': 5000}, Test Score: 0.6428571428571429
              precision    recall  f1-score   support

           1       0.33      0.29      0.31        14
           2       0.37      0.77      0.49        30
           3       0.00      0.00      0.00         6
           4       0.26      0.26      0.26        23
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00        12
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        13
          11       0.84      0.99      0.91       140

    accuracy                           0.64       266
   macro avg       0.16      0.21      0.18       266
weighted avg       0.52      0.64      0.57       266


*** Training AdaBoost ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.1s finished
  y = column_or_1d(y, warn=True)


Best Params: {'n_estimators': 100}, Test Score: 0.4398496240601504
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        14
           2       0.00      0.00      0.00        30
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        23
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00        12
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        13
          11       0.49      0.84      0.62       140

    accuracy                           0.44       266
   macro avg       0.04      0.08      0.06       266
weighted avg       0.26      0.44      0.32       266


*** Training NaiveBayes ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params: {}, Test Score: 0.5

  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
  y = column_or_1d(y, warn=True)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.9s finished
  y = column_or_1d(y, warn=True)


Best Params: {'base_estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False), 'n_estimators': 10, 'random_state': 0}, Test Score: 0.5263157894736842
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        14
           2       0.00      0.00      0.00        30
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        23
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00        12
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        13
          11       0.53      1.00      0.69       140

    

  'precision', 'predicted', average, warn_for)


In [37]:
# multiclass without neg samples case
X, y = w2v.featurize(data, lp, mode='multiclass', remove_neg_samples=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(501, 200) (126, 200) (501, 1) (126, 1)


In [38]:
parameters = {
    'SVM': {'kernel': ['linear', 'rbf'], 'C':[0.01, 0.1, 1], 'gamma': ['scale']},
    'GaussianProcess': {'kernel': [1.0 * RBF(1.0)]},
    'LogisticRegression': {'random_state': [0], 'max_iter': [5000]},
    'DecisionTree': {'max_depth': [5]},
    'RandomForest': {'max_depth': [5], 'n_estimators': [10], 'max_features': [1]},
    'MLP': {'alpha': [1], 'max_iter': [5000]},
    'AdaBoost': {'n_estimators': [100]},
    'NaiveBayes': {},
    'BaggingClassifier': {'base_estimator': [SVC()], 'n_estimators': [10], 'random_state': [0]}
}

classifiers = {
    'SVM': SVC(),
    'GaussianProcess': GaussianProcessClassifier(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'MLP': MLPClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'NaiveBayes': GaussianNB(),
    'BaggingClassifier': BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)
}

score_dict = {}
for name in classifiers.keys():
    print('\n*** Training {} ***'.format(name))
    clf = GridSearchCV(classifiers[name], parameters[name], cv=5, verbose=1)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    print('Best Params: {}, Test Score: {}'.format(clf.best_params_, score))
    print(classification_report(y_test, y_pred))
    score_dict[name] = score
print(score_dict)


*** Training SVM ***
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=T

Best Params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}, Test Score: 0.3888888888888889
              precision    recall  f1-score   support

           1       0.57      0.29      0.38        14
           2       0.40      0.73      0.52        30
           3       0.50      0.17      0.25         6
           4       0.34      0.48      0.40        23
           5       0.41      0.64      0.50        11
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         5
           8       0.50      0.08      0.14        12
           9       0.00      0.00      0.00         2
          10       0.27      0.23      0.25        13

    accuracy                           0.39       126
   macro avg       0.30      0.26      0.24       126
weighted avg       0.36      0.39      0.33       126


*** Training GaussianProcess ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   56.2s finished
  y = column_or_1d(y, warn=True)


Best Params: {'kernel': 1**2 * RBF(length_scale=1)}, Test Score: 0.373015873015873
              precision    recall  f1-score   support

           1       0.50      0.29      0.36        14
           2       0.42      0.73      0.53        30
           3       0.00      0.00      0.00         6
           4       0.31      0.35      0.33        23
           5       0.33      0.73      0.46        11
           6       0.00      0.00      0.00        10
           7       0.50      0.40      0.44         5
           8       0.33      0.08      0.13        12
           9       0.00      0.00      0.00         2
          10       0.25      0.15      0.19        13

    accuracy                           0.37       126
   macro avg       0.26      0.27      0.24       126
weighted avg       0.32      0.37      0.32       126


*** Training LogisticRegression ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s finished
  y = column_or_1d(y, warn=True)


Best Params: {'max_iter': 5000, 'random_state': 0}, Test Score: 0.3412698412698413
              precision    recall  f1-score   support

           1       0.57      0.29      0.38        14
           2       0.34      0.77      0.47        30
           3       1.00      0.17      0.29         6
           4       0.33      0.39      0.36        23
           5       0.31      0.36      0.33        11
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         5
           8       0.50      0.08      0.14        12
           9       0.00      0.00      0.00         2
          10       0.12      0.08      0.10        13

    accuracy                           0.34       126
   macro avg       0.32      0.21      0.21       126
weighted avg       0.34      0.34      0.29       126


*** Training DecisionTree ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
  self.best_estimator_.fit(X, y, **fit_params)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)


Best Params: {'max_depth': 5}, Test Score: 0.24603174603174602
              precision    recall  f1-score   support

           1       0.50      0.36      0.42        14
           2       0.29      0.53      0.38        30
           3       0.00      0.00      0.00         6
           4       0.28      0.35      0.31        23
           5       0.12      0.09      0.11        11
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00        12
           9       0.00      0.00      0.00         2
          10       0.06      0.08      0.07        13

    accuracy                           0.25       126
   macro avg       0.13      0.14      0.13       126
weighted avg       0.19      0.25      0.21       126


*** Training RandomForest ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params: {'max_depth': 5, 'max_features': 1, 'n_estimators': 10}, Test Score: 0.2539

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.5s finished
  y = column_or_1d(y, warn=True)


Best Params: {'alpha': 1, 'max_iter': 5000}, Test Score: 0.373015873015873
              precision    recall  f1-score   support

           1       0.56      0.36      0.43        14
           2       0.43      0.67      0.53        30
           3       1.00      0.17      0.29         6
           4       0.32      0.43      0.37        23
           5       0.32      0.64      0.42        11
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         5
           8       0.33      0.08      0.13        12
           9       0.00      0.00      0.00         2
          10       0.23      0.23      0.23        13

    accuracy                           0.37       126
   macro avg       0.32      0.26      0.24       126
weighted avg       0.36      0.37      0.33       126


*** Training AdaBoost ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.4s finished
  y = column_or_1d(y, warn=True)


Best Params: {'n_estimators': 100}, Test Score: 0.2619047619047619
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        14
           2       0.27      0.70      0.39        30
           3       0.00      0.00      0.00         6
           4       0.27      0.52      0.35        23
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00        12
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        13

    accuracy                           0.26       126
   macro avg       0.05      0.12      0.07       126
weighted avg       0.11      0.26      0.16       126


*** Training NaiveBayes ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params: {}, Test Score: 0.35714285714285715
              precision    recall  f1

  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
  y = column_or_1d(y, warn=True)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.2s finished
  y = column_or_1d(y, warn=True)


Best Params: {'base_estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False), 'n_estimators': 10, 'random_state': 0}, Test Score: 0.23809523809523808
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        14
           2       0.24      1.00      0.38        30
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        23
           5       0.00      0.00      0.00        11
           6       0.00      0.00      0.00        10
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00        12
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        13

    accuracy                           0.24       126
   

  'precision', 'predicted', average, warn_for)
