- (Gaussian) Multinomial Naive Bayes with Grid Search
- Logistic Regression
- Support Vector Machines
- Decision Tree
- Random Forest & Bagging Classifier
- Boosting XGBoost
- Neural Nets



In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import sklearn.datasets as datasets
from sklearn.kernel_approximation import Nystroem
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings(action='once')

In [2]:
from LanguageModels.Word2Vec import Word2Vec
from Preprocessing.LemmatizerPreprocessor import LemmatizerPreprocessor
from Preprocessing.DataLoader import DataLoader

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Load Dataset

In [None]:
file = open("../data/processed/data.p",'rb')
data = pickle.load(file)
file.close()

file = open("../data/processed/labels.p",'rb')
labels = pickle.load(file)
file.close()

In [None]:
label = labels[:,0]
print('embeddings: {}'.format(data.shape))
print('label: {}'.format(label.shape))

In [None]:
# get rid of Nones
# for i in range(label.shape[0]):
#     print(label[i])
#     if label[i] == None:
#         print(i)
# 299 -> 448

In [None]:
label = np.concatenate((label[:298], label[449::]), 0)
label=label.astype('int')
data = np.concatenate((data[:298], data[449::]), 0)
print('embeddings: {}'.format(data.shape))
print('label: {}'.format(label.shape))

### Support Vector Machine: Multi-Class Classification

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=42, stratify=label)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)

### Test on Training Data

In [None]:
pred_train = clf.predict(X_train)
accuracy_score(y_train, pred_train)

In [None]:
confusion_matrix(y_train, pred_train)

### Test on Testing Data

In [None]:
pred_test = clf.predict(X_test)
accuracy_score(y_test, pred_test)

In [None]:
confusion_matrix(y_test, pred_test)

## Linear Regression & Evaluation of Different Types of Regularization

## Vanilla Linear Regression (No Regularization)

In [None]:
reg = LinearRegression().fit(X_train, y_train)
print("Training Accuracy: " + str(reg.score(X_train, y_train)))
print("Training Accuracy: " + str(reg.score(X_test, y_test)))

## Ridge Regression (L2 Regularization)

In [None]:
parameters = {'alpha':[1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5, 1, 2, 5, 10]}
rid = Ridge()
clf = GridSearchCV(rid, parameters, cv=10, verbose=1)
clf.fit(X_train, y_train)
print("Training Accuracy: " + str(clf.score(X_train, y_train)))
print("Testing Accuracy: " + str(clf.score(X_test, y_test)))
print(clf.best_params_)

## Lasso Regression (L1 Regularization)

In [None]:
parameters = {'alpha':[1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5, 1, 2, 5, 10, 100]}

las = Lasso(max_iter = 10000)
clf = GridSearchCV(las, parameters, cv=10, verbose=1)
clf.fit(X_train, y_train)
print("Training Accuracy: " + str(clf.score(X_train, y_train)))
print("Testing Accuracy: " + str(clf.score(X_test, y_test)))
print(clf.best_params_)

## ElasticNet Regression (L1 + L2 Regularization)

In [None]:
parameters = {'alpha':[1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5, 1, 2, 5, 10, 100], 'l1_ratio':[0.05, 0.1, 0.15, 0.25, 0.5, 0.75, 1]}

las = ElasticNet(max_iter = 10000)
clf = GridSearchCV(las, parameters, cv=10, verbose=1)
clf.fit(X_train, y_train)
print("Training Accuracy: " + str(clf.score(X_train, y_train)))
print("Testing Accuracy: " + str(clf.score(X_test, y_test)))
print(clf.best_params_)

## Train/Test on Multiple Classifiers

In [12]:
# names = ["Linear SVM", "RBF SVM", "Gaussian Process", "Logistic Regression"
#          "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
#          "Naive Bayes", "Bagging Classifier"]

parameters = {
    'SVM': {'kernel': ['linear', 'rbf'], 'C':[0.01, 0.1, 1], 'gamma': ['scale']},
    'GaussianProcess': {'kernel': [1.0 * RBF(1.0)]},
    'LogisticRegression': {'random_state': [0], 'max_iter': [5000]},
    'DecisionTree': {'max_depth': [5]},
    'RandomForest': {'max_depth': [5], 'n_estimators': [10], 'max_features': [1]},
    'MLP': {'alpha': [1], 'max_iter': [5000]},
    'AdaBoost': {'n_estimators': [100]},
    'NaiveBayes': {},
    'BaggingClassifier': {'base_estimator': [SVC()], 'n_estimators': [10], 'random_state': [0]}
}

classifiers = {
    'SVM': SVC(),
    'GaussianProcess': GaussianProcessClassifier(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'MLP': MLPClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'NaiveBayes': GaussianNB(),
    'BaggingClassifier': BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)
}

score_dict = {}
for name in classifiers.keys():
    print('\n*** Training {} ***'.format(name))
    clf = GridSearchCV(classifiers[name], parameters[name], cv=5, verbose=1)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print('Best Params: {}, Test Score: {}'.format(clf.best_params_, score))
    score_dict[name] = score
print(score_dict)


*** Training SVM ***
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=T

Best Params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}, Test Score: 0.631578947368421

*** Training GaussianProcess ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


KeyboardInterrupt: 

## Rebalancing the Dataset

In [57]:
oversample = SMOTE()
def f(i):
    if i == 11:
        return 0
    else:
        return 1
    
a = np.where(label == 11)
print(data.shape)
print(label.shape)

data_pos = np.delete(data, a, axis=0)
label_pos = np.delete(label, a)
print(data_pos.shape)
print(label_pos.shape)

X_train, X_test, y_train, y_test = train_test_split(data_pos, label_pos, test_size=0.2, random_state=42, stratify=label_pos)

X_train_rebal, y_train_rebal = oversample.fit_resample(X_train, y_train)
print(Counter(y_train_rebal.tolist()))

(1299, 200)
(1299,)
(599, 200)
(599,)
Counter({1: 113, 4: 113, 6: 113, 2: 113, 8: 113, 3: 113, 5: 113, 7: 113, 10: 113, 9: 113})


## Multiclass classification, positive labels only, unbalanced

In [58]:
names = ["Linear SVM", "RBF SVM", "Logistic Regression",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "Bagging Classifier"]
         
classifiers = [
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    # GaussianProcessClassifier(1.0 * RBF(1.0), n_jobs=-1),
    LogisticRegression(random_state=0, max_iter = 5000),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(alpha=1, max_iter=5000),
    AdaBoostClassifier(),
    GaussianNB(),
    BaggingClassifier(base_estimator=SVC(),
                        n_estimators=10, random_state=0)]

score_dict = dict()
for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        score_dict[name] = score
        print(name)
        print(classification_report(clf.predict(X_test), y_test, zero_division=0))
print(score_dict)

Linear SVM
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       1.00      0.23      0.38       120
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0

    accuracy                           0.23       120
   macro avg       0.10      0.02      0.04       120
weighted avg       1.00      0.23      0.38       120

RBF SVM
              precision    recall  f1-score   support

           1       0.38      0.62      0.48         8
           2       0.89      0.31      0.46        80
           3       0.00      0.00      0.00         0
     

## Multiclass classification, positive labels only, balanced

In [59]:
names = ["Linear SVM", "RBF SVM", "Logistic Regression",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "Bagging Classifier"]
         
classifiers = [
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    # GaussianProcessClassifier(1.0 * RBF(1.0), n_jobs=-1),
    LogisticRegression(random_state=0, max_iter = 5000),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(alpha=1, max_iter=5000),
    AdaBoostClassifier(),
    GaussianNB(),
    BaggingClassifier(base_estimator=SVC(),
                        n_estimators=10, random_state=0)]

score_dict = dict()
for name, clf in zip(names, classifiers):
        clf.fit(X_train_rebal, y_train_rebal)
        score = clf.score(X_test, y_test)
        score_dict[name] = score
        print(name)
        print(classification_report(clf.predict(X_test), y_test, zero_division=0))
print(score_dict)

Linear SVM
              precision    recall  f1-score   support

           1       0.46      0.43      0.44        14
           2       0.36      0.59      0.44        17
           3       0.50      0.13      0.21        23
           4       0.18      0.44      0.26         9
           5       0.27      0.43      0.33         7
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         2
           8       0.33      0.19      0.24        21
           9       0.50      0.07      0.12        14
          10       0.38      0.45      0.42        11

    accuracy                           0.30       120
   macro avg       0.30      0.27      0.25       120
weighted avg       0.38      0.30      0.29       120

RBF SVM
              precision    recall  f1-score   support

           1       0.38      0.42      0.40        12
           2       0.64      0.33      0.44        54
           3       0.00      0.00      0.00         2
     

In [None]:
#yet another logistic regression 
logReg = LogisticRegression(random_state=0, max_iter = 50000)
logReg.fit(X_train, y_train)

test_training= logReg.score(X_train, y_train)
test_testing = logReg.score(X_test, y_test)

print("training data score", test_training, "\ntesting data score", test_testing)

In [None]:
#multiclass logistic regression
multiclassLogReg = LogisticRegression(multi_class='ovr')
multiclassLogReg.fit(X_train, y_train)

multi_test_training= multiclassLogReg.score(X_train, y_train)
multi_test_testing = multiclassLogReg.score(X_test, y_test)

print("training data score", multi_test_training, "\ntesting data score", multi_test_testing)

In [None]:
# Vanilla SVM, https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))

In [None]:
# Multinomial Naive Bayes
rng = np.random.RandomState(1)
X = rng.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])
clf = MultinomialNB()
clf.fit(X, y)
print(clf.predict(X[2:3]))

In [None]:
# Gaussian NB
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])
clf = GaussianNB()
clf.fit(X, Y)
print(clf.predict([[-0.8, -1]]))

clf_pf = GaussianNB()
clf_pf.partial_fit(X, Y, np.unique(Y))
print(clf_pf.predict([[-0.8, -1]]))

In [None]:
# Logistic Regression, https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0, max_iter = 50000).fit(X, y)
print(clf.predict(X[:2, :]))

print(clf.predict_proba(X[:2, :]))
print(clf.score(X, y))

In [None]:
# Linear SVM for large datasets, https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
X, y = datasets.load_digits(n_class=9, return_X_y=True)
data = X / 16
feature_map_nystroem = Nystroem(gamma=.2,
                                random_state=1,
                                n_components=300)
data_transformed = feature_map_nystroem.fit_transform(data)

clf = make_pipeline(StandardScaler(),
                  LinearSVC(random_state=0, tol=1e-5, max_iter = 50000))
clf.fit(data_transformed, y)
clf.score(data_transformed, y)

In [None]:
# Random Forest Classification, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.predict([[0, 0, 0, 0]]))

In [None]:
# Decision Tree Classifier, https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
iris = load_iris()
cross_val_score(clf, iris.data, iris.target, cv=10)

In [None]:
# Bagging Classifier, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
X, y = make_classification(n_samples=100, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
clf = BaggingClassifier(base_estimator=SVC(),
                        n_estimators=10, random_state=0).fit(X, y)
clf.predict([[0, 0, 0, 0]])

In [None]:
# Gradient Boosting Classifier, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
      max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# MLP Classifier, https://scikit-learn.org/stable/modules/neural_networks_supervised.html
X = [[0., 0.], [1., 1.]]
y = [0, 1]
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(X, y)
clf.predict([[2., 2.], [-1., -2.]])

### Using data pipeline for binary logistic regression

In [4]:
#Using data pipeline

data = DataLoader('../data/EMNLP2020.csv').load()

# Load preprocessor
lp = LemmatizerPreprocessor()

In [9]:
w2v = Word2Vec(path='../data/glove.6B/glove.6B.200d.txt')

In [10]:
# binary case
X, y = w2v.featurize(data, lp, mode='binary')

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1160, 200) (290, 200) (1160, 1) (290, 1)


In [14]:
#logistic regression with binary labels

logReg = LogisticRegression(random_state=0, max_iter = 50000)
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
logReg.fit(X_train, y_train)

test_training= logReg.score(X_train, y_train)
test_testing = logReg.score(X_test, y_test)

y_pred = logReg.predict(X_test)

print("training data score", test_training, "\ntesting data score", test_testing)
print(classification_report(y_test, y_pred))



training data score 0.9448275862068966 
testing data score 0.9172413793103448
              precision    recall  f1-score   support

           0       0.90      0.95      0.92       150
           1       0.94      0.89      0.91       140

    accuracy                           0.92       290
   macro avg       0.92      0.92      0.92       290
weighted avg       0.92      0.92      0.92       290



### MultiClass with Negative Samples Case

In [15]:
# multiclass case
X, y = w2v.featurize(data, lp, mode='multiclass')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1160, 200) (290, 200) (1160, 1) (290, 1)


In [16]:
parameters = {
    'SVM': {'kernel': ['linear', 'rbf'], 'C':[0.01, 0.1, 1], 'gamma': ['scale']},
    'GaussianProcess': {'kernel': [1.0 * RBF(1.0)]},
    'LogisticRegression': {'random_state': [0], 'max_iter': [5000]},
    'DecisionTree': {'max_depth': [5]},
    'RandomForest': {'max_depth': [5], 'n_estimators': [10], 'max_features': [1]},
    'MLP': {'alpha': [1], 'max_iter': [5000]},
    'AdaBoost': {'n_estimators': [100]},
    'NaiveBayes': {},
    'BaggingClassifier': {'base_estimator': [SVC()], 'n_estimators': [10], 'random_state': [0]}
}

classifiers = {
    'SVM': SVC(),
    'GaussianProcess': GaussianProcessClassifier(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'MLP': MLPClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'NaiveBayes': GaussianNB(),
    'BaggingClassifier': BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)
}

score_dict = {}
for name in classifiers.keys():
    print('\n*** Training {} ***'.format(name))
    clf = GridSearchCV(classifiers[name], parameters[name], cv=5, verbose=1)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    print('Best Params: {}, Test Score: {}'.format(clf.best_params_, score))
    print(classification_report(y_test, y_pred))
    score_dict[name] = score
print(score_dict)


*** Training SVM ***
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=T

Best Params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}, Test Score: 0.6206896551724138
              precision    recall  f1-score   support

           1       0.32      0.38      0.34        16
           2       0.43      0.61      0.51        33
           3       0.00      0.00      0.00         7
           4       0.22      0.14      0.17        28
           5       0.50      0.46      0.48        13
           6       0.00      0.00      0.00        11
           7       0.33      0.17      0.22         6
           8       0.29      0.14      0.19        14
           9       0.00      0.00      0.00         2
          10       0.30      0.30      0.30        20
          11       0.83      0.96      0.89       140

    accuracy                           0.62       290
   macro avg       0.29      0.29      0.28       290
weighted avg       0.55      0.62      0.58       290


*** Training GaussianProcess ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.5min finished
  y = column_or_1d(y, warn=True)


Best Params: {'kernel': 1**2 * RBF(length_scale=1)}, Test Score: 0.6206896551724138
              precision    recall  f1-score   support

           1       0.29      0.44      0.35        16
           2       0.42      0.55      0.47        33
           3       0.00      0.00      0.00         7
           4       0.38      0.39      0.39        28
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00        11
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00         2
          10       0.30      0.30      0.30        20
          11       0.79      0.99      0.88       140

    accuracy                           0.62       290
   macro avg       0.20      0.24      0.22       290
weighted avg       0.50      0.62      0.56       290


*** Training LogisticRegression ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.9s finished
  y = column_or_1d(y, warn=True)


Best Params: {'max_iter': 5000, 'random_state': 0}, Test Score: 0.6
              precision    recall  f1-score   support

           1       0.43      0.38      0.40        16
           2       0.41      0.55      0.47        33
           3       0.00      0.00      0.00         7
           4       0.19      0.11      0.14        28
           5       0.33      0.15      0.21        13
           6       0.00      0.00      0.00        11
           7       1.00      0.17      0.29         6
           8       0.50      0.07      0.12        14
           9       0.00      0.00      0.00         2
          10       0.33      0.25      0.29        20
          11       0.72      0.99      0.83       140

    accuracy                           0.60       290
   macro avg       0.36      0.24      0.25       290
weighted avg       0.52      0.60      0.53       290


*** Training DecisionTree ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s finished
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Best Params: {'max_depth': 5}, Test Score: 0.496551724137931
              precision    recall  f1-score   support

           1       0.33      0.19      0.24        16
           2       0.26      0.67      0.37        33
           3       0.00      0.00      0.00         7
           4       0.25      0.29      0.27        28
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00        11
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00         2
          10       0.14      0.15      0.15        20
          11       0.77      0.77      0.77       140

    accuracy                           0.50       290
   macro avg       0.16      0.19      0.16       290
weighted avg       0.45      0.50      0.46       290


*** Training RandomForest ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params: {'max_depth': 5, 'max_f

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
  self.best_estimator_.fit(X, y, **fit_params)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.3s finished
  y = column_or_1d(y, warn=True)


Best Params: {'alpha': 1, 'max_iter': 5000}, Test Score: 0.6206896551724138
              precision    recall  f1-score   support

           1       0.31      0.31      0.31        16
           2       0.40      0.67      0.50        33
           3       0.00      0.00      0.00         7
           4       0.30      0.25      0.27        28
           5       0.33      0.15      0.21        13
           6       0.00      0.00      0.00        11
           7       1.00      0.17      0.29         6
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00         2
          10       0.27      0.30      0.29        20
          11       0.84      0.98      0.90       140

    accuracy                           0.62       290
   macro avg       0.31      0.26      0.25       290
weighted avg       0.55      0.62      0.57       290


*** Training AdaBoost ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.2s finished
  y = column_or_1d(y, warn=True)


Best Params: {'n_estimators': 100}, Test Score: 0.4793103448275862
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        16
           2       0.25      0.03      0.05        33
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00        28
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00        11
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00        14
           9       0.33      0.50      0.40         2
          10       0.00      0.00      0.00        20
          11       0.49      0.98      0.66       140

    accuracy                           0.48       290
   macro avg       0.10      0.14      0.10       290
weighted avg       0.27      0.48      0.33       290


*** Training NaiveBayes ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params: {}, Test Score: 0.5

  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.8s finished
  y = column_or_1d(y, warn=True)


Best Params: {'base_estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False), 'n_estimators': 10, 'random_state': 0}, Test Score: 0.4827586206896552
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        16
           2       0.00      0.00      0.00        33
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00        28
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00        11
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        20
          11       0.48      1.00      0.65       140

    

  'precision', 'predicted', average, warn_for)


In [17]:
# multiclass without neg samples case
X, y = w2v.featurize(data, lp, mode='multiclass', remove_neg_samples=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(600, 200) (150, 200) (600, 1) (150, 1)


In [18]:
parameters = {
    'SVM': {'kernel': ['linear', 'rbf'], 'C':[0.01, 0.1, 1], 'gamma': ['scale']},
    'GaussianProcess': {'kernel': [1.0 * RBF(1.0)]},
    'LogisticRegression': {'random_state': [0], 'max_iter': [5000]},
    'DecisionTree': {'max_depth': [5]},
    'RandomForest': {'max_depth': [5], 'n_estimators': [10], 'max_features': [1]},
    'MLP': {'alpha': [1], 'max_iter': [5000]},
    'AdaBoost': {'n_estimators': [100]},
    'NaiveBayes': {},
    'BaggingClassifier': {'base_estimator': [SVC()], 'n_estimators': [10], 'random_state': [0]}
}

classifiers = {
    'SVM': SVC(),
    'GaussianProcess': GaussianProcessClassifier(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'MLP': MLPClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'NaiveBayes': GaussianNB(),
    'BaggingClassifier': BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)
}

score_dict = {}
for name in classifiers.keys():
    print('\n*** Training {} ***'.format(name))
    clf = GridSearchCV(classifiers[name], parameters[name], cv=5, verbose=1)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    print('Best Params: {}, Test Score: {}'.format(clf.best_params_, score))
    print(classification_report(y_test, y_pred))
    score_dict[name] = score
print(score_dict)


*** Training SVM ***
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=T

Best Params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}, Test Score: 0.3933333333333333
              precision    recall  f1-score   support

           1       0.35      0.38      0.36        16
           2       0.42      0.70      0.52        33
           3       0.00      0.00      0.00         7
           4       0.35      0.43      0.39        28
           5       0.50      0.38      0.43        13
           6       0.00      0.00      0.00        11
           7       1.00      0.33      0.50         6
           8       0.45      0.36      0.40        14
           9       0.00      0.00      0.00         2
          10       0.32      0.30      0.31        20

    accuracy                           0.39       150
   macro avg       0.34      0.29      0.29       150
weighted avg       0.36      0.39      0.36       150


*** Training GaussianProcess ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.4min finished
  y = column_or_1d(y, warn=True)


Best Params: {'kernel': 1**2 * RBF(length_scale=1)}, Test Score: 0.4066666666666667
              precision    recall  f1-score   support

           1       0.41      0.44      0.42        16
           2       0.51      0.70      0.59        33
           3       0.00      0.00      0.00         7
           4       0.36      0.43      0.39        28
           5       0.50      0.54      0.52        13
           6       0.00      0.00      0.00        11
           7       0.40      0.33      0.36         6
           8       0.33      0.29      0.31        14
           9       0.00      0.00      0.00         2
          10       0.29      0.30      0.29        20

    accuracy                           0.41       150
   macro avg       0.28      0.30      0.29       150
weighted avg       0.35      0.41      0.38       150


*** Training LogisticRegression ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s finished
  y = column_or_1d(y, warn=True)


Best Params: {'max_iter': 5000, 'random_state': 0}, Test Score: 0.35333333333333333
              precision    recall  f1-score   support

           1       0.40      0.38      0.39        16
           2       0.41      0.70      0.52        33
           3       0.00      0.00      0.00         7
           4       0.26      0.39      0.31        28
           5       0.57      0.31      0.40        13
           6       0.00      0.00      0.00        11
           7       0.00      0.00      0.00         6
           8       0.43      0.21      0.29        14
           9       0.00      0.00      0.00         2
          10       0.29      0.30      0.29        20

    accuracy                           0.35       150
   macro avg       0.24      0.23      0.22       150
weighted avg       0.31      0.35      0.31       150


*** Training DecisionTree ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
  self.best_estimator_.fit(X, y, **fit_params)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)


Best Params: {'max_depth': 5}, Test Score: 0.24666666666666667
              precision    recall  f1-score   support

           1       0.30      0.38      0.33        16
           2       0.27      0.33      0.30        33
           3       0.00      0.00      0.00         7
           4       0.32      0.32      0.32        28
           5       0.25      0.23      0.24        13
           6       0.00      0.00      0.00        11
           7       0.00      0.00      0.00         6
           8       0.25      0.14      0.18        14
           9       0.00      0.00      0.00         2
          10       0.16      0.30      0.21        20

    accuracy                           0.25       150
   macro avg       0.15      0.17      0.16       150
weighted avg       0.22      0.25      0.23       150


*** Training RandomForest ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params: {'max_depth': 5, 'max_features': 1, 'n_estimators': 10}, Test Score: 0.26
 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.7s finished
  y = column_or_1d(y, warn=True)


Best Params: {'alpha': 1, 'max_iter': 5000}, Test Score: 0.4066666666666667
              precision    recall  f1-score   support

           1       0.39      0.44      0.41        16
           2       0.54      0.67      0.59        33
           3       0.00      0.00      0.00         7
           4       0.39      0.46      0.43        28
           5       0.50      0.46      0.48        13
           6       0.00      0.00      0.00        11
           7       0.50      0.17      0.25         6
           8       0.29      0.36      0.32        14
           9       0.00      0.00      0.00         2
          10       0.35      0.35      0.35        20

    accuracy                           0.41       150
   macro avg       0.30      0.29      0.28       150
weighted avg       0.37      0.41      0.38       150


*** Training AdaBoost ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.6s finished
  y = column_or_1d(y, warn=True)


Best Params: {'n_estimators': 100}, Test Score: 0.19333333333333333
              precision    recall  f1-score   support

           1       0.18      0.12      0.15        16
           2       0.20      0.45      0.28        33
           3       0.00      0.00      0.00         7
           4       0.19      0.21      0.20        28
           5       0.33      0.15      0.21        13
           6       0.00      0.00      0.00        11
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00         2
          10       0.22      0.20      0.21        20

    accuracy                           0.19       150
   macro avg       0.11      0.11      0.10       150
weighted avg       0.16      0.19      0.16       150


*** Training NaiveBayes ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params: {}, Test Score: 0.3933333333333333
              precision    recall  f1

  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
  y = column_or_1d(y, warn=True)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.5s finished
  y = column_or_1d(y, warn=True)


Best Params: {'base_estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False), 'n_estimators': 10, 'random_state': 0}, Test Score: 0.22
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        16
           2       0.22      1.00      0.36        33
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00        28
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00        11
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00        14
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00        20

    accuracy                           0.22       150
   macro avg      

  'precision', 'predicted', average, warn_for)
