In [None]:

from time import perf_counter, time

def benchmark(clf):
    """Fit ``clf``, predict on the test split, and report timings and accuracy.

    The data is not passed in: ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` are read from the notebook's global namespace and must exist
    before this function is called.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time)``: ``clf_descr`` is the
        text of ``str(clf)`` before its first ``(``, ``score`` is
        ``accuracy_score(y_test, pred)``, and the two times are the elapsed
        seconds spent in ``fit`` and ``predict`` respectively.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    # perf_counter() is monotonic and high-resolution, so short runs are not
    # distorted by system clock adjustments or a coarse wall-clock tick.
    fit_start = perf_counter()
    clf.fit(X_train, y_train)
    train_time = perf_counter() - fit_start
    print("train time: %0.3fs" % train_time)

    predict_start = perf_counter()
    pred = clf.predict(X_test)
    test_time = perf_counter() - predict_start
    print("test time:  %0.3fs" % test_time)

    score = accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    print()
    # Keep only the class-name part of the estimator's printed form.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


def _announce(title):
    """Print an 80-character '=' rule followed by the section title."""
    print('=' * 80)
    print(title)


# Each benchmark() call contributes one
# (description, accuracy, train_time, test_time) tuple, in run order.
results = []

# Models that are each reported under their own heading.
baseline_models = [
    ("Ridge Classifier", RidgeClassifier(tol=1e-2, solver="sparse_cg")),
    ("Perceptron", Perceptron(max_iter=50)),
    ("Passive-Aggressive", PassiveAggressiveClassifier(max_iter=50)),
    ("kNN", KNeighborsClassifier(n_neighbors=10)),
    ("Random forest", RandomForestClassifier()),
]
for name, clf in baseline_models:
    _announce(name)
    results.append(benchmark(clf))

# The same penalty is tried on a Liblinear model and on an SGD model.
for penalty in ("l2", "l1"):
    _announce("%s penalty" % penalty.upper())

    liblinear_svc = LinearSVC(penalty=penalty, dual=False, tol=1e-3)
    results.append(benchmark(liblinear_svc))

    sgd = SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)
    results.append(benchmark(sgd))

# SGD with Elastic Net penalty
_announce("Elastic-Net penalty")
elastic_net_sgd = SGDClassifier(alpha=.0001, max_iter=50,
                                penalty="elasticnet")
results.append(benchmark(elastic_net_sgd))

# NearestCentroid without threshold
_announce("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Sparse Naive Bayes classifiers, all under one heading
_announce("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
results.append(benchmark(ComplementNB(alpha=.1)))

_announce("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
l1_selector = SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))
l1_selected_svc = Pipeline([
    ('feature_selection', l1_selector),
    ('classification', LinearSVC(penalty="l2")),
])
results.append(benchmark(l1_selected_svc))




In [None]:

# Rank the benchmark rows by accuracy on a sorted copy. `results` itself is
# left untouched so this cell can be re-run without repeating the benchmarks.
ranked_results = sorted(results, key=lambda row: row[1])
indices = np.arange(len(ranked_results))

# Transpose the rows into one list per column.
clf_names, score, training_time, test_time = (
    list(column) for column in zip(*ranked_results))

# Express both timings as a fraction of the slowest model so they share the
# accuracy bars' axis.
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Different classifiers score")
# Three thin bars per classifier, stacked at offsets 0, .3 and .6.
plt.barh(indices, score, .2, label="Accuracy", color='navy')
plt.barh(indices + .3, training_time, .2, label="training time",
         color='c')
plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25, top=.95, bottom=.05)

# Tick labels are hidden above; the classifier names are written as text at
# x = -.3, next to each group of bars.
for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)

plt.savefig(IMAGE_PATH + 'classifiers')
plt.show()



In [None]:

# Disabled experiment: everything below sits inside one triple-quoted string,
# so none of it executes; the cell only evaluates (and echoes) the string.
# The quoted code builds a TF-IDF matrix from training_set['full_text'],
# reduces it with a 700-component TruncatedSVD, calls generate_wordclouds,
# then splits the TF-IDF matrix 80/20 and fits an SGDClassifier and a
# LinearSVC.
# If re-enabled it would (re)define X_train, X_test, y_train and y_test, the
# same global names that benchmark() reads.
# NOTE(review): vectorizer.get_feature_names() has been replaced by
# get_feature_names_out() in newer scikit-learn releases -- confirm against
# the installed version before re-enabling.
'''
tokenizer = LemmaTokenizer()
vectorizer = TfidfVectorizer(tokenizer=tokenizer, stop_words=stopwords, ngram_range=(1,2), min_df=3)
X_tfidf = vectorizer.fit_transform(training_set['full_text'])

print(vectorizer.get_feature_names())


svd = TruncatedSVD(n_components=700, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

print(f"Total variance explained: {np.sum(svd.explained_variance_ratio_):.2f}")


word_positions = {v: k for k, v in vectorizer.vocabulary_.items()}
cluster_ids = generate_wordclouds(X_svd, X_tfidf, 2, word_positions)


X_train, X_test, y_train, y_test = train_test_split(X_tfidf, training_set['class'],test_size=0.2, random_state=42)
clf = SGDClassifier(verbose=2, tol=0.0001, alpha=0.00007, loss='modified_huber')
clf.fit(X_train, y_train)
clf.score(X_test, y_test)


clf = LinearSVC(C=0.9, random_state=42)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)'''



In [None]:


# Two-step pipeline: TF-IDF features tokenised by LemmaTokenizer, followed by
# a ClfSwitcher step whose `estimator` parameter is set by the grids below.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(tokenizer=LemmaTokenizer())),
    ("clf", ClfSwitcher()),
])

# min_df values tried for the vectorizer in every grid.
min_df_grid = [3, 5, 10, 20]

# One grid per candidate estimator; keys use the "<step>__<param>" syntax.
ridge_grid = {
    "clf__estimator": [RidgeClassifier(solver='auto', random_state=42)],
    "tfidf__min_df": min_df_grid,
    "clf__estimator__alpha": [1.0, 0.1, 0.01],
    "clf__estimator__tol": [1e-2, 1e-3, 1e-4],
}
multinomial_nb_grid = {
    "clf__estimator": [MultinomialNB()],
    "tfidf__min_df": min_df_grid,
    "clf__estimator__alpha": (1, 1e-1, 1e-2, 1e-3),
}
linear_svc_grid = {
    "clf__estimator": [LinearSVC(dual=False, random_state=42)],
    "tfidf__min_df": min_df_grid,
    "clf__estimator__tol": [1e-2, 1e-3, 1e-4],
    "clf__estimator__penalty": ('l2', 'l1'),
    "clf__estimator__C": [1, 1e-1, 1e-2, 1e-3],
}
parameters = [ridge_grid, multinomial_nb_grid, linear_svc_grid]

# Exhaustive 5-fold search over all three grids, using every available core.
gscv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    return_train_score=False,
    verbose=3,
)
gscv.fit(training_set['full_text'], training_set['class'])

gscv.cv_results_


In [None]:
# Parameter grids kept apart from the `parameters` list defined further down
# in this cell. Holding them in a named list keeps the cell syntactically
# valid; nothing searches them until `extra_parameters` is handed to
# GridSearchCV as its param_grid.
extra_parameters = [
    {
        'clf__estimator': [RidgeClassifier(solver='auto', random_state=42)],
        "tfidf__min_df": [1, 3],
        # The vectorizer option is spelled `stop_words`; a `stopwords` key is
        # rejected as an invalid parameter when the grid is applied.
        "tfidf__stop_words": (None, stopwords),
        "tfidf__ngram_range": [(1, 1), (1, 2)],
        'clf__estimator__alpha': [1.0, 1.5, 2.0],
        'clf__estimator__tol': [1e-1, 1e-2, 1e-3, 1e-4],
    },
    {
        'clf__estimator': [SGDClassifier(random_state=42, verbose=3)],
        "tfidf__min_df": [1],
        "tfidf__stop_words": [None],
        "tfidf__ngram_range": [(1, 2)],
        'clf__estimator__tol': [1e-2, 1e-3, 1e-4, 1e-5],
        'clf__estimator__alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
        # 'hinge' gives a linear SVM objective.
        'clf__estimator__loss': ('hinge', 'modified_huber'),
    },
    {
        'clf__estimator': [PassiveAggressiveClassifier(random_state=42)],
        "tfidf__min_df": [1],
        "tfidf__stop_words": [None],
        "tfidf__ngram_range": [(1, 2)],
        'clf__estimator__C': [0.5, 1.0, 1.5, 2.0],
        'clf__estimator__tol': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    },
]

# Binary bag-of-words pipeline. The vectorizer step is a CountVectorizer but
# keeps the step name "tfidf" because the grid keys below address it through
# the "tfidf__" prefix.
binary_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), binary=True)
pipeline = Pipeline(steps=[
    ("tfidf", binary_vectorizer),
    ("clf", ClfSwitcher()),
])

# Single grid: Bernoulli Naive Bayes with a sweep over its smoothing alpha.
bernoulli_nb_grid = {
    "clf__estimator": [BernoulliNB()],
    "tfidf__min_df": [1],
    "tfidf__stop_words": [None],
    "tfidf__ngram_range": [(1, 2)],
    "clf__estimator__alpha": [0.1, 0.3, 0.6, 1.0],
}
parameters = [bernoulli_nb_grid]