 1. Baselines
 2. LogReg with target + annotator
 3. LogReg with context + annotator
 4. LogReg with target + context + similarity + annotator


 5. & 6. & 7.: kNN with 2., 3. and 4. 


### DONE: LogReg with Annotators

 - group by annotators -> take majority vote across contexts
 - add annotator (categorical variable) as a predictor
 
 
### SVD Grid Search

 - reduce tf-idf space to > 100 dims (256?) = M \in V x 256
 - loop over dimensions k and produce N \n V x k from M
 - fit LogReg on N
 
 
 
### DONE: use Word2Vec embeddings






### DONE: split train-test by target words

 - measure accuracy per annotator, then average (happens by default)
 - prediction for target is average  
 
 
 
### DONE: adapt baseline

 - old: majority vote on the most common class *across all samples*
 
 - new: majority vote on most common class *per annotator* -> annotator prior P(c|a)


---

=> P(c | w, a) = P(c | (x_1, ..., x_k), a)

In [23]:
from tqdm import tqdm
import joblib
from collections import Counter

import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from utils import get_DF, get_indices

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV, cross_validate

from sklearn.metrics import balanced_accuracy_score, classification_report, accuracy_score,\
                                    confusion_matrix, plot_roc_curve, precision_score, recall_score,\
                                    f1_score


In [2]:
df = get_DF()

In [3]:
tfidf = joblib.load(f'vectors/tfidf_0.15.pkl')
mat = pd.read_csv("vectors/svd_mat_0.15.tsv", delimiter="\t", header=None).to_numpy()
scaled = StandardScaler(with_mean=False).fit_transform(mat)

In [4]:
tfidf_words = tfidf.get_feature_names()
mat_inds = get_indices(df.target.unique(), tfidf_words)
df["mat_ind"] = df.target.apply(lambda w: mat_inds[w] if w in mat_inds else -1)
df = df[df.mat_ind > -1]

100%|██████████| 91/91 [00:00<00:00, 108.42it/s]


In [5]:
target_annotations = {x:list(subdf.y) for x, subdf in df.groupby(["annotator_x", "target"])}
xs, y = list(zip(*target_annotations.items()))
x_anno, x_target = list(zip(*xs))

mldf = pd.DataFrame([x_anno, x_target, y]).T
mldf.columns = ["x_anno", "target", "y_ls"]

mldf["ratio"] = mldf.y_ls.apply(lambda ls: sum(ls)/len(ls))
mldf = mldf[mldf.ratio != 0.5]
mldf["maj_vote"] = mldf.ratio.apply(lambda r: int(r > 0.5))

mldf = pd.concat([mldf, pd.get_dummies(mldf["x_anno"])], axis=1)

---

In [6]:
X = np.hstack([np.stack(mldf.target.apply(lambda w: scaled[mat_inds[w]])), 
               mldf.iloc[:, 5:].to_numpy()])
Y = mldf.maj_vote.to_numpy()

In [36]:
logregcv = LogisticRegression(penalty="elasticnet", max_iter=5000, solver="saga")
logregcv = LogisticRegression(penalty="none", max_iter=3000, solver="lbfgs")

prec = lambda x, y: precision_score(x, y, average=None)
rec = lambda x, y: recall_score(x, y, average=None)

scores = cross_validate(
            logregcv,
            X,
            Y,
            cv=30,
            n_jobs=8,
            scoring=['accuracy',
                     'balanced_accuracy',
                     'f1_weighted',
                     'precision_weighted',3
                     'recall_weighted', "roc_auc"],
            groups=mldf.target,
            verbose=1
        )  # parallel processing

scores

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  30 out of  30 | elapsed:  9.8min finished


{'fit_time': array([156.22908854, 159.58916569, 155.18634486, 158.04874444,
        161.35234809, 154.26207781, 151.87549663, 158.20490623,
        155.26145697, 154.13900995, 155.82272816, 156.33400655,
        156.37657928, 153.34250975, 153.01957703, 151.35069275,
        160.08963442, 159.63079119, 156.59928894, 156.99560738,
        157.63155627, 157.2330513 , 157.11628962, 156.68600726,
        115.6810472 , 114.17920399, 117.57854772, 110.54601145,
        116.22277141, 112.48255348]),
 'score_time': array([0.06067729, 0.02589893, 0.0295558 , 0.03180718, 0.04213858,
        0.03476596, 0.03742743, 0.0215559 , 0.02864194, 0.03003454,
        0.02835989, 0.07665181, 0.0270505 , 0.03226566, 0.02795029,
        0.02033901, 0.03510976, 0.02304363, 0.0307951 , 0.03810954,
        0.0198915 , 0.06029248, 0.02295423, 0.02596164, 0.01034284,
        0.01417041, 0.01023054, 0.01420212, 0.00566816, 0.01360798]),
 'test_accuracy': array([0.78525641, 0.59615385, 0.75961538, 0.78846154, 0.637

In [None]:
sns.distplot([0.71486957, 0.70836237, 0.70410114, 0.6949289 , 0.7141457 ])
sns.distplot([0.76886035, 0.74371322, 0.7693954 , 0.78491172, 0.76445396])

In [32]:
import scipy.stats as st

bal_acc = scores["test_balanced_accuracy"]
acc = scores["test_accuracy"]


print("balanced acc:", st.t.interval(0.95, len(bal_acc)-1, loc=np.mean(bal_acc), scale=st.sem(bal_acc)))
print("accuracy    :", st.t.interval(0.95, len(acc)-1, loc=np.mean(acc), scale=st.sem(acc)))

balanced acc: (0.5652904709850635, 0.6419416111582794)
accuracy    : (0.603115695930947, 0.6940707992459018)


In [40]:
for k, v in scores.items():
    print(k, "\t", tuple(map(lambda x: round(x, 3), st.t.interval(0.95, len(v)-1, loc=np.mean(v), scale=st.sem(v)))))

fit_time 	 (141.536, 154.402)
score_time 	 (0.024, 0.035)
test_accuracy 	 (0.602, 0.693)
test_balanced_accuracy 	 (0.566, 0.642)
test_f1_weighted 	 (0.607, 0.693)
test_precision_weighted 	 (0.629, 0.703)
test_recall_weighted 	 (0.602, 0.693)
test_roc_auc 	 (0.529, 0.645)


In [27]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

---

In [None]:
# hand-made confidence intervals

bal_accs = []
accs = []

for _ in tqdm(range(20)):
    words_shuffled = np.random.permutation(mldf.target.unique())

    ratio = int(mldf.target.unique().shape[0]*0.8)
    train_words = set(words_shuffled[:ratio])
    msk = mldf.target.isin(train_words)

    # msk = np.random.rand(len(mldf)) < 0.7

    train_df = mldf[msk]
    test_df = mldf[~msk]


    target_train, anno_train, Y_train = (np.stack(train_df.target.apply(lambda w: scaled[mat_inds[w]])), 
                                         train_df.iloc[:, 5:].to_numpy(), 
                                         train_df.maj_vote.to_numpy())

    target_test, anno_test, Y_test = (np.stack(test_df.target.apply(lambda w: scaled[mat_inds[w]])),
                                        test_df.iloc[:, 5:].to_numpy(),
                                        test_df.maj_vote.to_numpy())

    X_train, X_test = np.hstack([target_train, anno_train]), np.hstack([target_test, anno_test])
    
    
    
    
    logreg = LogisticRegression(penalty="none", max_iter=5000, solver="lbfgs")
    logreg.fit(X_train, Y_train)
    
    
    
    # Logistic Regression


    preds = logreg.predict(X_test)

    print(classification_report(Y_test, preds))
    b = balanced_accuracy_score(Y_test, preds)
    a = accuracy_score(Y_test,
                       
                       preds)
    bal_accs.append(b)
    accs.append(a)


    cm = confusion_matrix(Y_test, preds)

    print(cm)

    plot_roc_curve(logreg, X_test, Y_test)
    _=plt.plot([0,1], [0,1], "--", c="r")

---

In [None]:
# Logistic Regression

from sklearn.metrics import balanced_accuracy_score, classification_report, accuracy_score,\
                                confusion_matrix, plot_roc_curve

preds = logreg.predict(X_test)

print(classification_report(Y_test, preds))
print(balanced_accuracy_score(Y_test, preds))
print(accuracy_score(Y_test, preds))


cm = confusion_matrix(Y_test, preds)

print(cm)

plot_roc_curve(logreg, X_test, Y_test)
_=plt.plot([0,1], [0,1], "--", c="r")

In [None]:
sns.distplot(bal_accs)

sns.distplot(accs)