## Explainability of LR

In [7]:
from utils import explain_label
import pandas as pd
from sklearn.model_selection import train_test_split

data_train = pd.read_csv("train.csv")
data, labels = data_train['text'].values, data_train['label'].values.astype(int)
_, data, _, labels = train_test_split(data, labels, test_size=0.1, random_state=1)
X_train, X_dev, y_train, y_dev = train_test_split(data, labels, test_size=0.3, random_state=1)

In [9]:
from models import LogisticRegressionITDI

model = LogisticRegressionITDI()
model.fit(X_train, y_train)

(0.9998019681321025, nan, None)

In [10]:
model.predict_proba("de buena cosa de l'Alto Aragón")[0]

array([0.01634345, 0.01442741, 0.01566705, 0.93947412, 0.01408797])

In [11]:
y_pred = model.predict(X_dev)
X_dev = model.vectorizer.transform(X_dev)
X_dev = model.scaler.transform(X_dev)

In [15]:
print(X_dev.shape)
print(y_dev.shape)
print(y_pred.shape)

(28134, 151300)
(28134,)
(28134,)


In [14]:
import numpy as np
vocab = model.vectorizer.get_feature_names_out()
most = []
for i, (sample, X, true, pred) in enumerate(zip(data['text'], X_dev, y_dev, y_pred)):
    
    if true != pred:

        # print(f"## Sample {i} ##")
        # print(sample)
        #vec_feat = np.exp([X_dev[i, feat]*model.coef_[9][feat] for feat in X.indices]) - 1
        wrong_feat = np.exp([X_dev[i, feat]*model.model.coef_[pred][feat] for feat in X.indices]) - 1
        
        for idx, feat in enumerate(wrong_feat):
            if feat > 0.2 or feat < -0.2:
                most.append((i, vocab[X.indices[idx]], explain_label(pred), explain_label(true), feat))

        # print(ind)
        # print([vocab[feat] for feat in X.indices])
        # print("VEC", vec_feat)
        # print("LMO", lmo_feat, "\n")

pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199
expl_df = pd.DataFrame(most, columns=['sample', 'word', 'pred', 'true', 'dev']).groupby(['word', 'pred', 'true']).agg(['unique'])


expl_df['sample', 'unique'] = expl_df['sample', 'unique'].apply(lambda x: x if len(x)>2 else np.nan)
expl_df.dropna(inplace=True)
expl_df



IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [6]:

sample = []

for x in expl_df['sample']['unique']:
    sample.extend(x)

np.unique(sample).shape

(120,)

In [77]:
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
i=0
colors = sns.color_palette('rocket_r')[0:11]
words = ['atride', 'che', 'chi', "chissu", "chisti", "con", "cossa", "di", "digo", "dui", "el", "erani", "fattu", "ghe", "li", "lu", "me", "no", "non", "per", "perchè", "quel", "saru", "se", "si", "spariss", "sì", "ve", "vegnir", "vo", "vogio"]
conf = ['PMS_VEC', 'LMO_VEC', 'SC_SCN', 'SCN_SC', 'SCN_SC', 'PMS_LMO', 'LMO_VEC', 'SCN_SC', 'LMO_VEC', 'SCN_SC', 'VEC_LMO', 'SCN_SC', 'SCN_SC', 'VEC_LMO', 'SCN_SC', 'SCN_SC', 'LMO_VEC', 'VEC_SCN', 'SC_LMOVEC', 'LMO_VEC', 'LMO_VEC', 'LMO_VEC', 'SC_SCN', 'VEC_LMO', 'SCN_SC', 'PMS_LMO', 'PMS_VEC', 'LMO_VEC', 'LMO_VEC', 'NAP_SCN', 'LMO_VEC']
for w, conf in tqdm(zip(words, conf), total=len(words)):

    train = []
    data_train = pd.read_csv("data/train.csv", ) 
    data_train['text'] = data_train['text'].apply(lambda x: x if w in x else np.nan)
    data_train.dropna(inplace=True)
    for label, num in zip(np.unique(data_train['label'].values, return_counts=True)[0], np.unique(data_train['label'].values, return_counts=True)[1]):
        train.extend([explain_label(label)]*num)

    val = [] 
    data_v = data.copy()
    data_v['text'] = data_v['text'].apply(lambda x: x if w in x else np.nan)
    data_v.dropna(inplace=True)
    for label, num in zip(np.unique(data_v['label'].values, return_counts=True)[0], np.unique(data_v['label'].values, return_counts=True)[1]):
        val.extend([label]*num)

    fig = plt.gcf()
    fig.set_size_inches(9, 6)
    
    labels_t, counts_t = np.unique(np.array(train), return_counts=True)
    labels, counts = np.unique(np.array(val), return_counts=True)
    c_train, c_val = [0]*11, [0]*11
    i1 = i2 = 0
    for j, l in enumerate(['EML', 'FUR', 'LIJ', 'LLD', 'LMO', 'NAP', 'PMS', 'RT', 'SC','SCN', 'VEC']):
        if i1 < len(labels_t) and l == labels_t[i1]: 
            c_train[j] = counts_t[i1]
            i1 += 1
    for j, l in enumerate(['EML', 'FUR', 'LIJ', 'LLD', 'LMO', 'NAP', 'PMS', 'RT', 'SC','SCN', 'VEC']):
        if i2 < len(labels) and  l == labels[i2]: 
            c_val[j] = counts[i2]
            i2 += 1

    c_train = c_train  / np.sum(c_train)
    c_val = c_val  / np.sum(c_val)

    # plt.bar(range(11), height=c_train, align='center', width=0.3, color=colors[0], label="Training distribution")
    # plt.bar(np.array(range(11))+0.3, height=c_val, align='center', width=0.3, color=colors[3], label = "Validation distribution")

    #plt.plot(c_train)
    from scipy.interpolate import interp1d
    xnew = np.linspace(0, 10, num=300, endpoint=True)
    f_cubic = interp1d(range(0,11), c_train, kind='nearest')
    f_cubic2 = interp1d(range(0,11), c_val, kind='nearest')
    plt.rcParams.update({'font.size': 18})

    plt.plot(xnew, f_cubic(xnew), label="training")
    plt.fill_between(xnew, f_cubic(xnew), color='#539ecd',alpha=0.2)

    plt.plot(xnew, f_cubic2(xnew), label="validation")
    plt.fill_between(xnew, f_cubic2(xnew), color='#FFA54C',alpha=0.2)

    plt.xticks(np.array(range(0,11)), ['EML', 'FUR', 'LIJ', 'LLD', 'LMO', 'NAP', 'PMS', 'RT', 'SC','SCN', 'VEC'])


    plt.ylim(bottom=0, top=1.1)
    plt.xlim(left=0, right=10)

    fig.tight_layout()
    plt.savefig(f"plots/confounding/{w}_{conf}.png")
    plt.close()

  c_val = c_val  / np.sum(c_val)
100%|██████████| 31/31 [00:30<00:00,  1.01it/s]
