In [13]:
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
import joblib
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import StackingClassifier


from utilities import *

In [50]:
# Notebook-wide settings; later cells read these names directly.
random_state: int = 42         # seed handed to train_test_split
data_percentage: float = 0.1   # fraction of the loaded rows kept by df.sample
code_folder_path: str = ""     # prefix of the directory the fitted models are saved under

In [51]:
# Candidate models as (display name, unfitted estimator) pairs. Keeping each
# name next to its estimator means an entry can be disabled by commenting out
# a single line, without the two derived lists drifting out of alignment.
# Estimators whose training is randomised are seeded with `random_state` so
# their scores are reproducible from run to run.
_named_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    # ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=random_state)),
    (
        "Random Forest",
        RandomForestClassifier(max_depth=5, n_estimators=100, random_state=random_state),
    ),
    # ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),  # Too slow
    ("AdaBoost", AdaBoostClassifier(random_state=random_state)),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
]

# Parallel lists consumed by the training loop (same order as above).
names = [label for label, _estimator in _named_classifiers]
classifiers = [estimator for _label, estimator in _named_classifiers]

In [52]:
# Load the data via the project helper and keep only a random
# `data_percentage` fraction of its rows. The draw is seeded with
# `random_state` so the same subset is selected on every run.
# NOTE(review): the helper presumably returns a DataFrame with "Text" and
# "Region" columns (both are read by the split cell) -- confirm in utilities.
df = get_SMADC_folder_data().sample(frac=data_percentage, random_state=random_state)

In [53]:
%%time
X_train, X_test, y_train, y_test = train_test_split(df["Text"], df["Region"], random_state=random_state)
count_vectorizer = CountVectorizer()
X_vectorized = count_vectorizer.fit_transform(X_train)
# standard_scaler = StandardScaler(with_mean=False).fit(X_vectorized)

Wall time: 2.36 s


In [30]:
class DenseTransformer(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Intended to sit between a vectoriser that emits sparse output and an
    estimator that needs dense input. Inheriting from ``BaseEstimator``
    provides ``get_params``/``set_params`` so the step can be cloned together
    with the pipeline that contains it.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing (there is no state to learn) and return ``self``."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used rather than ``todense()`` because the latter
        yields a ``numpy.matrix``, which scikit-learn estimators no longer
        accept. Input that is already dense is passed through ``np.asarray``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

In [7]:
%%time
for name, clf in zip(names, classifiers):
    if name in ("Naive Bayes", "Gaussian Process", "QDA"):
        model = make_pipeline(count_vectorizer, DenseTransformer(), clf)
    else:
        model = make_pipeline(count_vectorizer, clf)
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(name, score, flush=True)
    joblib.dump(model, join(code_folder_path, f"models/other_models/{name}_acc={score.round(3)}.model"))

KeyboardInterrupt: 

In [None]:
# Stacked ensemble: three vectoriser+classifier pipelines as base learners,
# combined by a shallow decision tree. Every randomised estimator is seeded
# from the notebook-wide `random_state` so the reported score is reproducible.
estimators = [
    (
        'rf',
        make_pipeline(
            count_vectorizer,
            RandomForestClassifier(max_depth=5, n_estimators=100, random_state=random_state),
        ),
    ),
    ('svr', make_pipeline(count_vectorizer, SVC(gamma=2, C=1))),
    ("ada", make_pipeline(count_vectorizer, AdaBoostClassifier(random_state=random_state))),
]
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=DecisionTreeClassifier(max_depth=5, random_state=random_state),
)
# Last expression of the cell: accuracy on the held-out split.
clf.fit(X_train, y_train).score(X_test, y_test)