In [13]:
import os
from glob import glob

import numpy as np
import pandas as pd
import joblib
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import StackingClassifier


from utilities import *

In [15]:
# Notebook-wide settings.

# Fraction (0..1) of the loaded rows kept for the experiments below;
# 0.001 keeps 0.1% of the data.
data_percentage = 0.001

# Prefix joined in front of the relative "models/..." output path;
# the empty string leaves that path relative to the working directory.
code_folder_path = ""

# Seed handed to the train/test split and to seeded estimators.
random_state = 42

In [16]:
class DenseTransformer(TransformerMixin):
    """Pipeline step that converts a sparse feature matrix to a dense ndarray.

    It is placed in front of estimators that are fed dense input in this
    notebook (e.g. ``GaussianNB``). The step is stateless: ``fit`` learns
    nothing and ``transform`` only changes the container type.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op fit; returns ``self`` so the step can be chained in a pipeline."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()`` because the latter
        returns a ``numpy.matrix``, which recent scikit-learn versions refuse
        to validate. Input that is already dense (no ``toarray`` method) is
        passed through ``np.asarray`` instead of raising ``AttributeError``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

# Base learners of the stacking ensemble, as (label, estimator) pairs.
# GaussianNB is wrapped in a pipeline that densifies its input first; the
# other learners receive the feature matrix unchanged.
_dense_naive_bayes = make_pipeline(DenseTransformer(), GaussianNB())

stack_estimators = [
    ("NaiveBayes", _dense_naive_bayes),
    ("DecisionTree", DecisionTreeClassifier(max_depth=100)),
    ("SVM", SVC(C=0.6, kernel="linear")),
    (
        "LogisticRegression",
        LogisticRegression(random_state=random_state, max_iter=1000),
    ),
]

# Candidate models, each declared next to its display name.
#
# `names` and `classifiers` are consumed pairwise (zip) by the training loop,
# so they must stay index-aligned. Keeping them as two hand-maintained lists
# let them drift apart (the stacking model was reported as "Linear SVM", the
# linear SVC as "Decision Tree", the decision tree as "Random Forest").
# Defining the pairs once and deriving both lists removes that failure mode.
named_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(5)),
    (
        "StackClassifier",
        StackingClassifier(
            estimators=stack_estimators,
            final_estimator=DecisionTreeClassifier(max_depth=5),
        ),
    ),
    ("Linear SVM", SVC(kernel="linear", C=0.5)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=60)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    # Disabled experiments, kept for reference:
    # ("RBF SVM", SVC(gamma=2, C=1)),  # Bad performance
    # ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    # ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=100)),
    # ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),  # Too slow
    # ("QDA", QuadraticDiscriminantAnalysis()),  # Terrible performance
]

# Parallel views kept for the cells that iterate over zip(names, classifiers).
names = [label for label, _ in named_classifiers]
classifiers = [estimator for _, estimator in named_classifiers]

In [17]:
# Load the dataset (get_SMADC_folder_data is not defined in this notebook;
# presumably it comes from the `utilities` star import -- TODO confirm), then
# keep only a `data_percentage` fraction of the rows.
# The sample is seeded with `random_state` so a Restart & Run All draws the
# same subset; unseeded, every run trained and scored on different rows.
df = get_SMADC_folder_data()
df = df.sample(frac=data_percentage, random_state=random_state)

In [18]:
%%time
# Hold out a test split of the "Text" -> "Region" task (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    df["Text"],
    df["Region"],
    random_state=random_state,
)

# Text features: TF-IDF weights (CountVectorizer() was the earlier choice).
# The variable keeps its historical name because later cells refer to it.
count_vectorizer = TfidfVectorizer()
X_vectorized = count_vectorizer.fit_transform(X_train)

# Scale without centering (with_mean=False), fitted on the training
# features only.
standard_scaler = StandardScaler(with_mean=False)
standard_scaler.fit(X_vectorized)

Wall time: 297 ms


In [19]:
%%time
# Fit, score and persist one pipeline per (name, classifier) pair.

# zip() stops at the shorter list, so a length mismatch would silently skip
# models; fail loudly instead.
if len(names) != len(classifiers):
    raise ValueError(
        f"names ({len(names)}) and classifiers ({len(classifiers)}) must have the same length"
    )

for name, clf in zip(names, classifiers):
    # Note: the same vectorizer and scaler objects are placed in every
    # pipeline, so each model.fit() call below re-fits them on X_train.
    steps = [count_vectorizer, standard_scaler]
    if name in ("Naive Bayes", "Gaussian Process", "QDA"):
        # These models are routed through a densifying step first.
        steps.append(DenseTransformer())
    model = make_pipeline(*steps, clf)

    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(name, score, flush=True)

    # round(float(...)) works whether score() returned a NumPy scalar or a
    # plain Python float (the latter has no .round() method).
    model_path = os.path.join(
        code_folder_path,
        f"models/other_models/{name}_acc={round(float(score), 3)}.model",
    )
    # joblib.dump does not create missing directories.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(model, model_path)

Nearest Neighbors 0.4559909142532652
StackClassifier 0.6607041453719478
Linear SVM 0.5857467348097671
Decision Tree 0.5570698466780238
AdaBoost 0.6391254968767746




Naive Bayes 0.4514480408858603




In [None]:
# Second stacking experiment: every base learner is its own pipeline that
# starts from the shared text vectorizer; a shallow decision tree combines
# their predictions.
estimators = [
    (
        "rf",
        make_pipeline(
            count_vectorizer,
            RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
        ),
    ),
    # The label says "svr", but the estimator is an SVC classifier.
    ("svr", make_pipeline(count_vectorizer, SVC(C=1, gamma=2))),
    ("ada", make_pipeline(count_vectorizer, AdaBoostClassifier())),
]

clf = StackingClassifier(
    estimators=estimators,
    final_estimator=DecisionTreeClassifier(max_depth=5),
)

# fit() returns the estimator itself, so fitting and scoring in two steps is
# equivalent to the chained call; the last expression is the cell's output.
clf.fit(X_train, y_train)
clf.score(X_test, y_test)