- (Gaussian) Multinomial Naive Bayes with Grid Search
- Logistic Regression
- Support Vector Machines
- Decision Tree
- Random Forest & Bagging Classifier
- Boosting XGBoost
- Neural Nets



In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import sklearn.datasets as datasets
from sklearn.kernel_approximation import Nystroem
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
from LanguageModels.Word2Vec import Word2Vec
from Preprocessing.LemmatizerPreprocessor import LemmatizerPreprocessor
from Preprocessing.DataLoader import DataLoader

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rhythmsyed/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Load Dataset

In [3]:
file = open("../data/processed/data.p",'rb')
data = pickle.load(file)
file.close()

file = open("../data/processed/labels.p",'rb')
labels = pickle.load(file)
file.close()

In [4]:
label = labels[:,0]
print('embeddings: {}'.format(data.shape))
print('label: {}'.format(label.shape))

embeddings: (1450, 200)
label: (1450,)


In [5]:
# get rid of Nones
# for i in range(label.shape[0]):
#     print(label[i])
#     if label[i] == None:
#         print(i)
# 299 -> 448

In [6]:
label = np.concatenate((label[:298], label[449::]), 0)
label=label.astype('int')
data = np.concatenate((data[:298], data[449::]), 0)
print('embeddings: {}'.format(data.shape))
print('label: {}'.format(label.shape))

embeddings: (1299, 200)
label: (1299,)


### Support Vector Machine: Multi-Class Classification

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=42, stratify=label)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1039, 200) (260, 200) (1039,) (260,)


In [8]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

### Test on Training Data

In [9]:
pred_train = clf.predict(X_train)
accuracy_score(y_train, pred_train)

0.6660250240615977

In [10]:
confusion_matrix(y_train, pred_train)

array([[ 15,  18,   0,   2,   0,   0,   0,   0,   0,   0,  19],
       [  0,  87,   0,   3,   0,   0,   0,   0,   0,   0,  23],
       [  0,  12,   0,   1,   0,   0,   0,   0,   0,   0,  10],
       [  0,  26,   0,  26,   0,   0,   0,   0,   0,   0,  34],
       [  0,  18,   0,   3,   6,   0,   0,   0,   0,   0,  17],
       [  1,  13,   0,   5,   0,   0,   0,   0,   0,   0,  17],
       [  0,   6,   0,   2,   0,   0,   1,   0,   0,   0,   9],
       [  0,  10,   0,   6,   0,   0,   0,   0,   0,   0,  31],
       [  0,   5,   0,   1,   0,   0,   0,   0,   0,   0,   0],
       [  1,  23,   0,   4,   0,   0,   0,   0,   0,   4,  20],
       [  1,   4,   0,   2,   0,   0,   0,   0,   0,   0, 553]])

### Test on Testing Data

In [11]:
pred_test = clf.predict(X_test)
accuracy_score(y_test, pred_test)

0.6384615384615384

In [12]:
confusion_matrix(y_test, pred_test)

array([[  6,   5,   0,   1,   0,   0,   0,   0,   0,   0,   1],
       [  0,  18,   0,   1,   0,   0,   0,   0,   0,   0,   9],
       [  0,   2,   0,   0,   1,   0,   0,   0,   0,   0,   3],
       [  0,   8,   0,   2,   0,   0,   0,   0,   0,   0,  12],
       [  0,   6,   0,   0,   1,   0,   0,   0,   0,   0,   4],
       [  0,   2,   0,   1,   0,   0,   0,   0,   0,   0,   6],
       [  0,   3,   0,   0,   0,   0,   0,   0,   0,   0,   1],
       [  0,   6,   0,   1,   0,   0,   0,   0,   0,   0,   5],
       [  0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   1],
       [  0,   4,   0,   1,   0,   0,   0,   0,   0,   0,   8],
       [  0,   1,   0,   0,   0,   0,   0,   0,   0,   0, 139]])

## Linear Regression & Evaluation of Different Types of Regularization

## Vanilla Linear Regression (No Regularization)

In [13]:
reg = LinearRegression().fit(X_train, y_train)
print("Training Accuracy: " + str(reg.score(X_train, y_train)))
print("Training Accuracy: " + str(reg.score(X_test, y_test)))

Training Accuracy: 0.6195824222295991
Training Accuracy: 0.4464624164733535


## Ridge Regression (L2 Regularization)

In [14]:
parameters = {'alpha':[1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5, 1, 2, 5, 10]}
rid = Ridge()
clf = GridSearchCV(rid, parameters, cv=10, verbose=1)
clf.fit(X_train, y_train)
print("Training Accuracy: " + str(clf.score(X_train, y_train)))
print("Testing Accuracy: " + str(clf.score(X_test, y_test)))
print(clf.best_params_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training Accuracy: 0.5454773795823153
Testing Accuracy: 0.5169477691965991
{'alpha': 2}


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


## Lasso Regression (L1 Regularization)

In [15]:
parameters = {'alpha':[1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5, 1, 2, 5, 10, 100]}

las = Lasso(max_iter = 10000)
clf = GridSearchCV(las, parameters, cv=10, verbose=1)
clf.fit(X_train, y_train)
print("Training Accuracy: " + str(clf.score(X_train, y_train)))
print("Testing Accuracy: " + str(clf.score(X_test, y_test)))
print(clf.best_params_)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Training Accuracy: 0.5869160764837922
Testing Accuracy: 0.4985604037169039
{'alpha': 0.001}


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    2.3s finished


## ElasticNet Regression (L1 + L2 Regularization)

In [16]:
parameters = {'alpha':[1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.5, 1, 2, 5, 10, 100], 'l1_ratio':[0.05, 0.1, 0.15, 0.25, 0.5, 0.75, 1]}

las = ElasticNet(max_iter = 10000)
clf = GridSearchCV(las, parameters, cv=10, verbose=1)
clf.fit(X_train, y_train)
print("Training Accuracy: " + str(clf.score(X_train, y_train)))
print("Testing Accuracy: " + str(clf.score(X_test, y_test)))
print(clf.best_params_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 84 candidates, totalling 840 fits
Training Accuracy: 0.5675894111134874
Testing Accuracy: 0.5207644959410633
{'alpha': 0.001, 'l1_ratio': 0.05}


[Parallel(n_jobs=1)]: Done 840 out of 840 | elapsed:   16.0s finished


## Train/Test on Multiple Classifiers

In [29]:
# names = ["Linear SVM", "RBF SVM", "Gaussian Process", "Logistic Regression"
#          "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
#          "Naive Bayes", "Bagging Classifier"]

parameters = {
    'SVM': {'kernel': ['linear', 'rbf'], 'C':[0.01, 0.1, 1], 'gamma': ['scale']},
    'GaussianProcess': {'kernel': [1.0 * RBF(1.0)]},
    'LogisticRegression': {'random_state': [0], 'max_iter': [5000]},
    'DecisionTree': {'max_depth': [5]},
    'RandomForest': {'max_depth': [5], 'n_estimators': [10], 'max_features': [1]},
    'MLP': {'alpha': [1], 'max_iter': [5000]},
    'AdaBoost': {'n_estimators': [100]},
    'NaiveBayes': {},
    'BaggingClassifier': {'base_estimator': [SVC()], 'n_estimators': [10], 'random_state': [0]}
}

classifiers = {
    'SVM': SVC(),
    'GaussianProcess': GaussianProcessClassifier(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'MLP': MLPClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'NaiveBayes': GaussianNB(),
    'BaggingClassifier': BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0)
}

score_dict = {}
for name in classifiers.keys():
    print('\n*** Training {} ***'.format(name))
    clf = GridSearchCV(classifiers[name], parameters[name], cv=5, verbose=1)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print('Best Params: {}, Test Score: {}'.format(clf.best_params_, score))
    score_dict[name] = score
print(score_dict)


*** Training SVM ***
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    5.9s finished


Best Params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}, Test Score: 0.6384615384615384

*** Training GaussianProcess ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.8min finished


Best Params: {'kernel': 1**2 * RBF(length_scale=1)}, Test Score: 0.6615384615384615

*** Training LogisticRegression ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.7s finished


Best Params: {'max_iter': 5000, 'random_state': 0}, Test Score: 0.6038461538461538

*** Training DecisionTree ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best Params: {'max_depth': 5}, Test Score: 0.49615384615384617

*** Training RandomForest ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params: {'max_depth': 5, 'max_features': 1, 'n_estimators': 10}, Test Score: 0.5384615384615384

*** Training MLP ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.0s finished


Best Params: {'alpha': 1, 'max_iter': 5000}, Test Score: 0.5692307692307692

*** Training AdaBoost ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.1s finished


Best Params: {'n_estimators': 100}, Test Score: 0.5384615384615384

*** Training NaiveBayes ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params: {}, Test Score: 0.5884615384615385

*** Training BaggingClassifier ***
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.4s finished


Best Params: {'base_estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False), 'n_estimators': 10, 'random_state': 0}, Test Score: 0.5384615384615384
{'SVM': 0.6384615384615384, 'GaussianProcess': 0.6615384615384615, 'LogisticRegression': 0.6038461538461538, 'DecisionTree': 0.49615384615384617, 'RandomForest': 0.5384615384615384, 'MLP': 0.5692307692307692, 'AdaBoost': 0.5384615384615384, 'NaiveBayes': 0.5884615384615385, 'BaggingClassifier': 0.5384615384615384}


## Rebalancing the Dataset

In [57]:
oversample = SMOTE()
def f(i):
    if i == 11:
        return 0
    else:
        return 1
    
a = np.where(label == 11)
print(data.shape)
print(label.shape)

data_pos = np.delete(data, a, axis=0)
label_pos = np.delete(label, a)
print(data_pos.shape)
print(label_pos.shape)

X_train, X_test, y_train, y_test = train_test_split(data_pos, label_pos, test_size=0.2, random_state=42, stratify=label_pos)

X_train_rebal, y_train_rebal = oversample.fit_resample(X_train, y_train)
print(Counter(y_train_rebal.tolist()))

(1299, 200)
(1299,)
(599, 200)
(599,)
Counter({1: 113, 4: 113, 6: 113, 2: 113, 8: 113, 3: 113, 5: 113, 7: 113, 10: 113, 9: 113})


## Multiclass classification, positive labels only, unbalanced

In [58]:
names = ["Linear SVM", "RBF SVM", "Logistic Regression",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "Bagging Classifier"]
         
classifiers = [
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    # GaussianProcessClassifier(1.0 * RBF(1.0), n_jobs=-1),
    LogisticRegression(random_state=0, max_iter = 5000),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(alpha=1, max_iter=5000),
    AdaBoostClassifier(),
    GaussianNB(),
    BaggingClassifier(base_estimator=SVC(),
                        n_estimators=10, random_state=0)]

score_dict = dict()
for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        score_dict[name] = score
        print(name)
        print(classification_report(clf.predict(X_test), y_test, zero_division=0))
print(score_dict)

Linear SVM
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       1.00      0.23      0.38       120
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0

    accuracy                           0.23       120
   macro avg       0.10      0.02      0.04       120
weighted avg       1.00      0.23      0.38       120

RBF SVM
              precision    recall  f1-score   support

           1       0.38      0.62      0.48         8
           2       0.89      0.31      0.46        80
           3       0.00      0.00      0.00         0
     

## Multiclass classification, positive labels only, balanced

In [59]:
names = ["Linear SVM", "RBF SVM", "Logistic Regression",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "Bagging Classifier"]
         
classifiers = [
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    # GaussianProcessClassifier(1.0 * RBF(1.0), n_jobs=-1),
    LogisticRegression(random_state=0, max_iter = 5000),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(alpha=1, max_iter=5000),
    AdaBoostClassifier(),
    GaussianNB(),
    BaggingClassifier(base_estimator=SVC(),
                        n_estimators=10, random_state=0)]

score_dict = dict()
for name, clf in zip(names, classifiers):
        clf.fit(X_train_rebal, y_train_rebal)
        score = clf.score(X_test, y_test)
        score_dict[name] = score
        print(name)
        print(classification_report(clf.predict(X_test), y_test, zero_division=0))
print(score_dict)

Linear SVM
              precision    recall  f1-score   support

           1       0.46      0.43      0.44        14
           2       0.36      0.59      0.44        17
           3       0.50      0.13      0.21        23
           4       0.18      0.44      0.26         9
           5       0.27      0.43      0.33         7
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         2
           8       0.33      0.19      0.24        21
           9       0.50      0.07      0.12        14
          10       0.38      0.45      0.42        11

    accuracy                           0.30       120
   macro avg       0.30      0.27      0.25       120
weighted avg       0.38      0.30      0.29       120

RBF SVM
              precision    recall  f1-score   support

           1       0.38      0.42      0.40        12
           2       0.64      0.33      0.44        54
           3       0.00      0.00      0.00         2
     

In [27]:
#yet another logistic regression 
logReg = LogisticRegression(random_state=0, max_iter = 50000)
logReg.fit(X_train, y_train)

test_training= logReg.score(X_train, y_train)
test_testing = logReg.score(X_test, y_test)

print("training data score", test_training, "\ntesting data score", test_testing)

training data score 0.6535129932627527 
testing data score 0.6192307692307693


In [28]:
#multiclass logistic regression
multiclassLogReg = LogisticRegression(multi_class='ovr')
multiclassLogReg.fit(X_train, y_train)

multi_test_training= multiclassLogReg.score(X_train, y_train)
multi_test_testing = multiclassLogReg.score(X_test, y_test)

print("training data score", multi_test_training, "\ntesting data score", multi_test_testing)

training data score 0.6323387872954764 
testing data score 0.6076923076923076


In [22]:
# Vanilla SVM, https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))

[1]


In [2]:
# Multinomial Naive Bayes
rng = np.random.RandomState(1)
X = rng.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])
clf = MultinomialNB()
clf.fit(X, y)
print(clf.predict(X[2:3]))

[3]


In [3]:
# Gaussian NB
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])
clf = GaussianNB()
clf.fit(X, Y)
print(clf.predict([[-0.8, -1]]))

clf_pf = GaussianNB()
clf_pf.partial_fit(X, Y, np.unique(Y))
print(clf_pf.predict([[-0.8, -1]]))

[1]
[1]


In [4]:
# Logistic Regression, https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0, max_iter = 50000).fit(X, y)
print(clf.predict(X[:2, :]))

print(clf.predict_proba(X[:2, :]))
print(clf.score(X, y))

[0 0]
[[9.81579028e-01 1.84209573e-02 1.44796627e-08]
 [9.71349907e-01 2.86500630e-02 3.01442199e-08]]
0.9733333333333334


In [32]:
# Linear SVM for large datasets, https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
X, y = datasets.load_digits(n_class=9, return_X_y=True)
data = X / 16
feature_map_nystroem = Nystroem(gamma=.2,
                                random_state=1,
                                n_components=300)
data_transformed = feature_map_nystroem.fit_transform(data)

clf = make_pipeline(StandardScaler(),
                  LinearSVC(random_state=0, tol=1e-5, max_iter = 50000))
clf.fit(data_transformed, y)
clf.score(data_transformed, y)

1.0

In [35]:
# Random Forest Classification, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.predict([[0, 0, 0, 0]]))

[1]


In [40]:
# Decision Tree Classifier, https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
iris = load_iris()
cross_val_score(clf, iris.data, iris.target, cv=10)

array([1.        , 0.93333333, 1.        , 0.93333333, 0.93333333,
       0.86666667, 0.93333333, 1.        , 1.        , 1.        ])

In [38]:
# Bagging Classifier, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
X, y = make_classification(n_samples=100, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
clf = BaggingClassifier(base_estimator=SVC(),
                        n_estimators=10, random_state=0).fit(X, y)
clf.predict([[0, 0, 0, 0]])

array([1])

In [43]:
# Gradient Boosting Classifier, https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
      max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

0.913

In [19]:
# MLP Classifier, https://scikit-learn.org/stable/modules/neural_networks_supervised.html
X = [[0., 0.], [1., 1.]]
y = [0, 1]
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(X, y)
clf.predict([[2., 2.], [-1., -2.]])

array([1, 0])

### Using data pipeline for binary logistic regression

In [9]:
#Using data pipeline

data = DataLoader('../data/EMNLP2020.csv').load()

# Load preprocessor
lp = LemmatizerPreprocessor()

In [10]:
w2v = Word2Vec(path='../data/glove.6B/glove.6B.200d.txt')

In [11]:
# binary case
X, y = w2v.featurize(data, lp, mode='binary')

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=l)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

NameError: name 'X' is not defined

In [21]:
#logistic regression with binary labels

logReg = LogisticRegression(random_state=0, max_iter = 50000)
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
logReg.fit(X_train, y_train)

test_training= logReg.score(X_train, y_train)
test_testing = logReg.score(X_test, y_test)

print("training data score", test_training, "\ntesting data score", test_testing)

training data score 0.9349670122525919 
testing data score 0.9135338345864662
