In [1]:
from dotenv import dotenv_values
from pathlib import Path
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, fbeta_score, precision_score, accuracy_score, recall_score

# take environment variables from .env.
config = dotenv_values("./../../config/.env")
base_path = Path(config["BASE_PATH"])
data_path = base_path/"data"
path_10k = data_path/"raw"/"10k"
path_stoxx = data_path/"raw"/"stoxx"
writing_path = base_path/"writing"/"MSc-Thesis-Emerging-Risks"
table_path = writing_path/"tables"
figure_path = writing_path/"figures"/"plots"

In [2]:
df = pd.read_pickle(data_path/"labeling"/"active-learning-iteration-2.pkl")
df_test = pd.read_pickle(data_path/"labeling"/"GT.pkl")
df_test = df_test[df_test.strategy == "sequential"]
df_test.to_pickle(data_path/"evaluation"/"GT.pkl")

In [3]:
X_train = df[df.labeled]["text"].tolist()
y_train_l = df[df.labeled]["loss"].tolist()
y_train_u = df[df.labeled]["unexpected"].tolist()
X_test = df_test["text"].tolist()
y_test_l = df_test["loss"].tolist()
y_test_u = df_test["unexpected"].tolist()

Find Hyperparameters inspired by:
- https://github.com/MoritzLaurer/less-annotating-with-bert-nli/blob/master/analysis-classical-hyperparams.py
- https://stackoverflow.com/questions/44066264/how-to-choose-parameters-in-tfidfvectorizer-in-sklearn-during-unsupervised-clust

### Loss

In [4]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(lowercase=True, stop_words='english', norm="l2", use_idf=True, smooth_idf=True, analyzer="word")),
    ('clf', svm.SVC()),
])
parameters = {
    'tfidf__max_df': [0.9, 0.8, 0.7],
    'tfidf__min_df': [0.01, 0.03, 0.06],
    'tfidf__ngram_range': [(1, 2), (1, 3)],
    'clf__kernel': ["linear", "poly", "rbf", "sigmoid"],
    'clf__class_weight': ["balanced", None],
    'clf__C': [1, 10, 100, 1000],
    'clf__gamma': ["scale", "auto"],
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=1, scoring="f1")
grid_search_tune.fit(X_train, y_train_l)
print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 2 folds for each of 1152 candidates, totalling 2304 fits
Best parameters set:
[('tfidf', TfidfVectorizer(max_df=0.9, min_df=0.03, ngram_range=(1, 2),
                stop_words='english')), ('clf', SVC(C=100, class_weight='balanced', gamma='auto', kernel='sigmoid'))]


In [5]:
vectorizer_l = TfidfVectorizer(
    lowercase=True, 
    stop_words='english', 
    norm="l2", 
    use_idf=True, 
    smooth_idf=True, 
    analyzer="word",
    max_df=0.9,
    min_df=0.03,
    ngram_range=(1,2))

cls_l = svm.SVC(
    kernel="sigmoid",
    class_weight="balanced",
    C=100,
    gamma="auto"
)

X_train_counts_l = vectorizer_l.fit_transform(X_train)
X_test_counts_l = vectorizer_l.transform(X_test)
cls_l.fit(X_train_counts_l, y_train_l)
y_pred_l = cls_l.predict(X_test_counts_l)

print("F1", round(f1_score(y_test_l, y_pred_l, average="binary"),3))
print("Pr", round(precision_score(y_test_l, y_pred_l, average="binary", zero_division=0),3))
print("Re", round(recall_score(y_test_l, y_pred_l, average="binary"),3))
print("F2", round(fbeta_score(y_test_l, y_pred_l, beta=2, average="binary"),3))
print("Ac", round(accuracy_score(y_test_l, y_pred_l),3))

F1 0.367
Pr 0.272
Re 0.562
F2 0.463
Ac 0.904


### Unexpected

In [6]:
grid_search_tune_u = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=1, scoring="f1")
grid_search_tune_u.fit(X_train, y_train_u)
print("Best parameters set:")
print(grid_search_tune_u.best_estimator_.steps)

Fitting 2 folds for each of 1152 candidates, totalling 2304 fits
Best parameters set:
[('tfidf', TfidfVectorizer(max_df=0.9, min_df=0.03, ngram_range=(1, 2),
                stop_words='english')), ('clf', SVC(C=100, class_weight='balanced', gamma='auto'))]


In [8]:
vectorizer_u = TfidfVectorizer(
    lowercase=True, 
    stop_words='english', 
    norm="l2", 
    use_idf=True, 
    smooth_idf=True, 
    analyzer="word",
    max_df=0.9,
    min_df=0.03,
    ngram_range=(1,2))

cls_u = svm.SVC(
    kernel="sigmoid",
    class_weight="balanced",
    C=100,
    gamma="auto"
)

X_train_counts_u = vectorizer_u.fit_transform(X_train)
X_test_counts_u = vectorizer_u.transform(X_test)
cls_u.fit(X_train_counts_u, y_train_u)
y_pred_u = cls_u.predict(X_test_counts_u)

print("F1", round(f1_score(y_test_u, y_pred_u, average="binary"),3))
print("Pr", round(precision_score(y_test_u, y_pred_u, average="binary", zero_division=0),3))
print("Re", round(recall_score(y_test_u, y_pred_u, average="binary"),3))
print("F2", round(fbeta_score(y_test_u, y_pred_u, beta=2, average="binary"),3))
print("Ac", round(accuracy_score(y_test_u, y_pred_u),3))

F1 0.079
Pr 0.042
Re 0.762
F2 0.171
Ac 0.865
