In [4]:
import os
from glob import glob

import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from utilities import *

In [5]:
# Notebook-wide settings, gathered in one place so they are easy to tune.
code_folder_path = ""    # prefix joined in front of the "models/..." paths
data_percentage = 1e-3   # fraction (not percent) of the rows handed to DataFrame.sample
random_state = 42        # seed passed to train_test_split

In [6]:
class DenseTransformer(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that turns a sparse feature matrix into a dense array.

    Placed in front of estimators that cannot consume scipy sparse matrices.
    Inheriting from ``BaseEstimator`` (after the mixin) provides
    ``get_params``/``set_params`` and the estimator tags that scikit-learn
    expects from every pipeline step.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Sparse matrices are converted with ``toarray()``. The previous
        ``todense()`` call produced an ``np.matrix``, which scikit-learn's
        input validation rejects. Input that is already dense is passed
        through ``np.asarray`` instead of raising ``AttributeError``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

# Base learners handed to StackingClassifier(estimators=...).
stack_estimators = [
    # GaussianNB does not accept sparse matrices, so its input is densified first.
    ('NaiveBayes', make_pipeline(DenseTransformer(), GaussianNB())),
    # Seeded: tree construction breaks ties between equally good splits at random.
    ('DecisionTree', DecisionTreeClassifier(max_depth=100, random_state=random_state)),
    ('SVM', SVC(kernel="linear", C=0.6)),
    # Disabled. LogisticRegression is not imported by this notebook; add
    # `from sklearn.linear_model import LogisticRegression` before re-enabling.
    # ("LogisticRegression", LogisticRegression(max_iter=1000, random_state=random_state))
]

# Every label is declared right next to the estimator it describes. The two
# parallel lists used to be maintained by hand and had drifted apart ("Random
# Forest" was listed without an estimator, the stacking model had no label),
# so zip(names, classifiers) attached the wrong name to most models.
# Disabled candidates are kept as comments together with the reason.
named_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(5)),
    (
        "StackClassifier",
        StackingClassifier(
            estimators=stack_estimators,
            final_estimator=DecisionTreeClassifier(max_depth=5, random_state=random_state),
        ),
    ),
    ("Linear SVM", SVC(kernel="linear", C=0.5)),
    # ("RBF SVM", SVC(gamma=2, C=1)),  # Bad performance
    # ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    ("Decision Tree", DecisionTreeClassifier(max_depth=60, random_state=random_state)),
    # ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=100)),
    # ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),  # Too slow
    ("AdaBoost", AdaBoostClassifier(random_state=random_state)),
    ("Naive Bayes", GaussianNB()),
    # ("QDA", QuadraticDiscriminantAnalysis()),  # Terrible performance
]

# Derived views with the same names and order as before.
names = [label for label, _ in named_classifiers]
classifiers = [estimator for _, estimator in named_classifiers]

NameError: name 'LogisticRegression' is not defined

In [None]:
df = get_SMADC_folder_data()
# Keep only a fraction of the rows to make experiments fast. The draw is seeded
# so that a restart-and-run-all works on the same subset every time.
df = df.sample(frac=data_percentage, random_state=random_state)

In [None]:
%%time
# Seeded train/test split of the sampled texts and their region labels.
X_train, X_test, y_train, y_test = train_test_split(
    df["Text"],
    df["Region"],
    random_state=random_state,
)

# Despite its name, `count_vectorizer` currently holds a TF-IDF vectorizer.
# Alternative: count_vectorizer = CountVectorizer()
count_vectorizer = TfidfVectorizer()
X_vectorized = count_vectorizer.fit_transform(X_train)

# with_mean=False scales without centering, which keeps the matrix sparse.
standard_scaler = StandardScaler(with_mean=False)
standard_scaler.fit(X_vectorized)

Wall time: 297 ms


In [None]:
%%time
# Estimator types that cannot be fitted on scipy sparse matrices and therefore
# need a DenseTransformer in front of them. Checking the type (rather than the
# display label) keeps this correct even if a label is renamed.
dense_only_estimators = (GaussianNB, GaussianProcessClassifier, QuadraticDiscriminantAnalysis)

# zip() silently truncates, so make a length mismatch loud instead.
assert len(names) == len(classifiers), "names and classifiers must have the same length"

# Create the output folder up front so a missing directory cannot throw away a
# finished training run at dump time.
models_dir = os.path.join(code_folder_path, "models", "other_models")
os.makedirs(models_dir, exist_ok=True)

for name, clf in zip(names, classifiers):
    steps = [count_vectorizer, standard_scaler]
    if isinstance(clf, dense_only_estimators):
        steps.append(DenseTransformer())
    model = make_pipeline(*steps, clf)

    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(name, score, flush=True)

    # Built-in round() works for both numpy scalars and plain Python floats;
    # `score.round(3)` raises AttributeError for the latter.
    accuracy_tag = round(float(score), 3)
    joblib.dump(model, os.path.join(models_dir, f"{name}_acc={accuracy_tag}.model"))

Nearest Neighbors 0.4559909142532652
StackClassifier 0.6607041453719478
Linear SVM 0.5857467348097671
Decision Tree 0.5570698466780238
AdaBoost 0.6391254968767746




Naive Bayes 0.4514480408858603




In [None]:
# Second stacking experiment: every base learner gets the vectorizer as its
# first pipeline step, and a shallow decision tree combines their predictions.
base_learners = {
    'rf': RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42),
    'svr': SVC(gamma=2, C=1),
    "ada": AdaBoostClassifier(),
}
estimators = [
    (label, make_pipeline(count_vectorizer, learner))
    for label, learner in base_learners.items()
]

meta_learner = DecisionTreeClassifier(max_depth=5)
clf = StackingClassifier(estimators=estimators, final_estimator=meta_learner)

# fit() returns the estimator itself, so fitting and scoring can be split up;
# the held-out accuracy stays the cell's displayed value.
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

# Load saved models and evaluate them on the external datasets

In [7]:
# Saved pipelines to evaluate. Each filename has the form
# "<model name>_acc=<accuracy>.model".
loaded_models = [
    "MultinomialNaiveBayes_acc=0.865.model", "RandomForest_acc=0.76.model", "Linear SVM_acc=0.747.model",
]

In [8]:
# External evaluation sets, keyed by the dataset label reported in the results.
dfs = dict(
    annotated_data=get_annotated_data_folder_data(),
    arabic_dialects=get_arabic_dialects_dataset_folder_data(),
    dart=get_dart_folder_data(),
)

In [9]:
# One tuple per (model, dataset) pair:
# (model file, dataset label, SMADC accuracy, macro F1, macro precision, macro recall)
results = []
for model_path in loaded_models:
    print(model_path)
    # NOTE: joblib.load unpickles arbitrary Python objects - only load model
    # files that come from a trusted source.
    model = joblib.load(join(code_folder_path, "models", "other_models", model_path))

    # The accuracy is encoded in the filename between the last "=" and the
    # last "."; it depends on the model only, so parse it once per model.
    smadc_accuracy = model_path[model_path.rindex("=") + 1:model_path.rindex(".")]

    # Dedicated loop names so the notebook-level `df` is not overwritten.
    for dataset_name, dataset_df in dfs.items():
        labelled = dataset_df.dropna()
        preds = model.predict(labelled["Text"])

        # A single call yields all three macro-averaged metrics (the 4th
        # element, support, is None when averaging and is dropped).
        precision, recall, f1 = precision_recall_fscore_support(
            labelled["Region"], preds, average="macro"
        )[:3]

        results.append((
            model_path,
            dataset_name,
            smadc_accuracy,  # SMADC Accuracy
            f1,
            precision,
            recall,
        ))

MultinomialNaiveBayes_acc=0.865.model


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  _warn_prf(average, modifier, msg_start, len(result))


RandomForest_acc=0.76.model


  _warn_prf(average, modifier, msg_start, len(result))


Linear SVM_acc=0.747.model


  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
df_results = pd.DataFrame(
    results,
    columns=["Model name", "Dataset", "SMADC Accuracy", "Macro F1", "Macro precision", "Macro recall"],
)
# Reduce "<model name>_acc=<accuracy>.model" to the model name. Splitting on the
# "_acc=" suffix leaves names without it untouched and keeps names that contain
# an underscore intact (slicing at find("_") mangled both cases).
df_results["Model name"] = df_results["Model name"].str.rsplit("_acc=", n=1).str[0]

In [28]:
# Emit the complete results table as LaTeX, with the accuracy column placed
# directly after the model name.
latex_columns = ["Model name", "SMADC Accuracy", "Dataset", "Macro F1", "Macro precision", "Macro recall"]
latex_table = df_results[latex_columns].to_latex(index=False)
print(latex_table)

\begin{tabular}{lllrrr}
\toprule
           Model name & SMADC Accuracy &         Dataset &  Macro F1 &  Macro precision &  Macro recall \\
\midrule
MultinomialNaiveBayes &          0.865 &  annotated\_data &  0.552954 &         0.561883 &      0.611622 \\
MultinomialNaiveBayes &          0.865 & arabic\_dialects &  0.450884 &         0.464933 &      0.446934 \\
MultinomialNaiveBayes &          0.865 &            dart &  0.737389 &         0.742771 &      0.748739 \\
         RandomForest &           0.76 &  annotated\_data &  0.462519 &         0.497453 &      0.497691 \\
         RandomForest &           0.76 & arabic\_dialects &  0.357108 &         0.400919 &      0.357486 \\
         RandomForest &           0.76 &            dart &  0.667313 &         0.699270 &      0.681225 \\
           Linear SVM &          0.747 &  annotated\_data &  0.491982 &         0.524889 &      0.523377 \\
           Linear SVM &          0.747 & arabic\_dialects &  0.373722 &         0.418487 &      0

In [29]:
# Display the full results table (one row per model/dataset pair).
df_results

Unnamed: 0,Model name,Dataset,SMADC Accuracy,Macro F1,Macro precision,Macro recall
0,MultinomialNaiveBayes,annotated_data,0.865,0.552954,0.561883,0.611622
1,MultinomialNaiveBayes,arabic_dialects,0.865,0.450884,0.464933,0.446934
2,MultinomialNaiveBayes,dart,0.865,0.737389,0.742771,0.748739
3,RandomForest,annotated_data,0.76,0.462519,0.497453,0.497691
4,RandomForest,arabic_dialects,0.76,0.357108,0.400919,0.357486
5,RandomForest,dart,0.76,0.667313,0.69927,0.681225
6,Linear SVM,annotated_data,0.747,0.491982,0.524889,0.523377
7,Linear SVM,arabic_dialects,0.747,0.373722,0.418487,0.370232
8,Linear SVM,dart,0.747,0.656211,0.698349,0.66728


In [30]:
# For every dataset keep the single row with the highest macro F1.
# idxmax + .loc avoids groupby.apply on the grouping column, which is deprecated
# and drops the "Dataset" column on newer pandas, and avoids a lambda argument
# that shadows the notebook-level `df`.
best_row_labels = df_results.groupby("Dataset")["Macro F1"].idxmax()
df_displayed = df_results.loc[best_row_labels].copy()

cols = ["Model name", "SMADC Accuracy", "Dataset", "Macro F1", "Macro precision", "Macro recall"]
print(df_displayed[cols].to_latex(index=False))

\begin{tabular}{lllrrr}
\toprule
           Model name & SMADC Accuracy &         Dataset &  Macro F1 &  Macro precision &  Macro recall \\
MultinomialNaiveBayes &          0.865 &  annotated\_data &  0.552954 &         0.561883 &      0.611622 \\
\midrule
MultinomialNaiveBayes &          0.865 & arabic\_dialects &  0.450884 &         0.464933 &      0.446934 \\
MultinomialNaiveBayes &          0.865 &            dart &  0.737389 &         0.742771 &      0.748739 \\
\bottomrule
\end{tabular}

