In [1]:
from tqdm import tqdm
import joblib
from collections import Counter

import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from utils import get_DF, get_indices

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

In [2]:
df = get_DF()

In [3]:
tfidf = joblib.load(f'vectors/tfidf_0.15.pkl')
mat = pd.read_csv("vectors/svd_mat_0.15.tsv", delimiter="\t", header=None).to_numpy()
scaled = StandardScaler(with_mean=False).fit_transform(mat)

tfidf_words = tfidf.get_feature_names()
mat_inds = get_indices(df.target.unique(), tfidf_words)
df["mat_ind"] = df.target.apply(lambda w: mat_inds[w] if w in mat_inds else -1)
df = df[df.mat_ind > -1]

100%|██████████| 91/91 [00:00<00:00, 114.25it/s]


In [4]:
target_annotations = {x:list(subdf.y) for x, subdf in df.groupby(["annotator_x", "target"])}
xs, y = list(zip(*target_annotations.items()))
x_anno, x_target = list(zip(*xs))

mldf = pd.DataFrame([x_anno, x_target, y]).T
mldf.columns = ["x_anno", "target", "y_ls"]

mldf["ratio"] = mldf.y_ls.apply(lambda ls: sum(ls)/len(ls))
mldf = mldf[mldf.ratio != 0.5]
mldf["maj_vote"] = mldf.ratio.apply(lambda r: int(r > 0.5))

mldf = pd.concat([mldf, pd.get_dummies(mldf["x_anno"])], axis=1)

---

In [31]:
import statsmodels.api as sm


pvals = []


for _ in tqdm(range(30)):

    words_shuffled = np.random.permutation(mldf.target.unique())

    ratio = int(mldf.target.unique().shape[0]*0.8)
    train_words = set(words_shuffled[:ratio])
    msk = mldf.target.isin(train_words)


    train_df = mldf[msk]
    test_df = mldf[~msk]


    target_train, anno_train, Y_train = (np.stack(train_df.target.apply(lambda w: scaled[mat_inds[w]])), 
                                             train_df.iloc[:, 5:].to_numpy(), 
                                             train_df.maj_vote.to_numpy())

    target_test, anno_test, Y_test = (np.stack(test_df.target.apply(lambda w: scaled[mat_inds[w]])),
                                            test_df.iloc[:, 5:].to_numpy(),
                                            test_df.maj_vote.to_numpy())

    X_train, X_test = np.hstack([target_train, anno_train]), np.hstack([target_test, anno_test])


    X = sm.add_constant(X_train)    

    results = sm.OLS(Y_train, X).fit()

    # results = sm.Probit(Y_train, X).fit()

    results.summary()
    
    pvals.append((results.pvalues[:100] < 0.05).sum())

100%|██████████| 30/30 [00:36<00:00,  1.23s/it]


In [34]:
import scipy.stats as st

st.t.interval(0.95, len(pvals)-1, loc=np.mean(pvals), scale=st.sem(pvals))

(69.04326342263703, 73.75673657736299)

In [None]:
bal_accs = []
accs = []

for _ in tqdm(range(5)):
    words_shuffled = np.random.permutation(mldf.target.unique())

    ratio = int(mldf.target.unique().shape[0]*0.8)
    train_words = set(words_shuffled[:ratio])
    msk = mldf.target.isin(train_words)

    # msk = np.random.rand(len(mldf)) < 0.7

    train_df = mldf[msk]
    test_df = mldf[~msk]


    target_train, anno_train, Y_train = (np.stack(train_df.target.apply(lambda w: scaled[mat_inds[w]])), 
                                         train_df.iloc[:, 5:].to_numpy(), 
                                         train_df.maj_vote.to_numpy())

    target_test, anno_test, Y_test = (np.stack(test_df.target.apply(lambda w: scaled[mat_inds[w]])),
                                        test_df.iloc[:, 5:].to_numpy(),
                                        test_df.maj_vote.to_numpy())

    X_train, X_test = np.hstack([target_train, anno_train]), np.hstack([target_test, anno_test])
    
    
    
    
    logreg = LogisticRegression(penalty="none", max_iter=5000, solver="lbfgs")

    logreg.fit(X_train, Y_train)
    
    
    
    # Logistic Regression

    from sklearn.metrics import balanced_accuracy_score, classification_report, accuracy_score,\
                                    confusion_matrix, plot_roc_curve

    preds = logreg.predict(X_test)

    print(classification_report(Y_test, preds))
    b = balanced_accuracy_score(Y_test, preds)
    a = accuracy_score(Y_test,
                       
                       preds)
    bal_accs.append(b)
    accs.append(a)


    cm = confusion_matrix(Y_test, preds)

    print(cm)

    plot_roc_curve(logreg, X_test, Y_test)
    _=plt.plot([0,1], [0,1], "--", c="r")

In [19]:
mldf.groupby("maj_vote").apply(len)

maj_vote
0    6474
1    2870
dtype: int64