[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dbamman/anlp25/blob/main/6.tests/PermutationTest.ipynb)

This notebook explores the use of the permutation test to assess the significance of coefficients learned in logistic regression (testing against the null hypothesis that each $\beta = 0$).

In [None]:
import copy
import sys
from random import choices, shuffle

import pandas as pd
import numpy as np
from sklearn import linear_model, preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm


In [None]:
# Download the LMRD train/dev/test splits into the working directory,
# saved as lmrd_train.tsv, lmrd_dev.tsv and lmrd_test.tsv.
!wget https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/lmrd/train.tsv -O lmrd_train.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/lmrd/dev.tsv -O lmrd_dev.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/lmrd/test.tsv -O lmrd_test.tsv

In [None]:
# Download the Convote train/dev/test splits into the working directory,
# saved as convote_train.tsv, convote_dev.tsv and convote_test.tsv.
!wget https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/convote/train.tsv -O convote_train.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/convote/dev.tsv -O convote_dev.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/convote/test.tsv -O convote_test.tsv

In [None]:
# Download the LoC train/dev/test splits into the working directory,
# saved as loc_train.tsv, loc_dev.tsv and loc_test.tsv.
!wget https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/loc/train.tsv -O loc_train.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/loc/dev.tsv -O loc_dev.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp25/refs/heads/main/data/loc/test.tsv -O loc_test.tsv

In [None]:
def read_data(filename):
    """Load a headerless, tab-separated file whose columns are (label, text).

    Returns a pair of parallel lists: (texts, labels).
    """
    frame = pd.read_csv(filename, sep="\t", names=["label", "text"])
    texts = frame["text"].to_list()
    labels = frame["label"].to_list()
    return texts, labels

# Change this to the name of the dataset you will be using.
# It is used as a filename prefix: the files read below are
# <data>_train.tsv and <data>_dev.tsv in the working directory.
data = "convote"

x_train, y_train = read_data("%s_train.tsv" % data)
x_dev, y_dev = read_data("%s_dev.tsv" % data)

In [None]:
def featurize(x_train, x_dev):
    """Convert raw documents into binary bag-of-words feature matrices.

    Documents are tokenized with str.split, and the vocabulary (at most
    10,000 entries) is fit on the training documents only, then reused
    unchanged for the dev documents.

    Returns (train_features, dev_features, fitted_vectorizer).
    """
    vectorizer = CountVectorizer(
        analyzer=str.split,
        binary=True,
        lowercase=False,
        max_features=10000,
        strip_accents=None,
    )

    train_features = vectorizer.fit_transform(x_train)
    dev_features = vectorizer.transform(x_dev)

    return train_features, dev_features, vectorizer

In [None]:
def train(x_train, y_train, le):
    """Fit an L2-penalized logistic regression classifier.

    Args:
        x_train: feature matrix for the training examples.
        y_train: raw labels, one per row of x_train; encoded with `le` before fitting.
        le: an already-fitted label encoder providing `transform`.

    Returns:
        The fitted LogisticRegression model (read `.coef_` for the learned weights).
    """
    y_encoded = le.transform(y_train)
    logreg = linear_model.LogisticRegression(C=100, solver='lbfgs', penalty='l2', max_iter=10000)
    logreg.fit(x_train, y_encoded)
    return logreg

In [None]:
def test(logreg, x_dev_feats, y_dev, le):
    y_dev = le.transform(y_dev)
    print("Accuracy: %.3f" % logreg.score(x_dev_feats, y_dev))

In [None]:
def analyze_weights(coefs, label_encoder, vocab, p_values, top_k=25):
    """Print the most strongly weighted features for each class.

    Features are ranked by coefficient. The `top_k` most negative ones are
    printed under the label encoded as 0 (most negative first), then the
    `top_k` most positive ones under the label encoded as 1 (most positive
    first). Each row is tab-separated: coefficient, feature, p-value.

    Args:
        coefs: 1-D array of coefficients, indexed by feature id.
        label_encoder: object whose `inverse_transform` maps encoded class ids
            back to label names.
        vocab: dict mapping feature string -> feature id.
        p_values: sequence of p-values, indexed by feature id.
        top_k: number of features to print per class (default 25, the
            previously hard-coded value).
    """
    reverse_vocab = {feature_id: feature for feature, feature_id in vocab.items()}

    ascending = np.argsort(coefs)

    def _print_rows(feature_ids):
        # One row per feature: coefficient, feature string, p-value.
        for k in feature_ids:
            print("%.5f\t%s\t%.4f" % (coefs[k], reverse_vocab[k], p_values[k]))

    print(label_encoder.inverse_transform([0])[0])
    _print_rows(ascending[:top_k])

    print(label_encoder.inverse_transform([1])[0])
    # Reverse before slicing so the largest coefficient comes first and
    # top_k=0 prints no rows (a [-0:] slice would select everything).
    _print_rows(ascending[::-1][:top_k])

In [None]:
# Fit the model on the true labels and report its dev accuracy.
x_train_feats, x_dev_feats, vectorizer = featurize(x_train, x_dev)
le = preprocessing.LabelEncoder()
le.fit(y_train)

logreg = train(x_train_feats, y_train, le)
test(logreg, x_dev_feats, y_dev, le)

true_coefficients = logreg.coef_[0]
true_magnitudes = np.abs(true_coefficients)

# We'll set P=100 here to finish running in class, but set higher (e.g., 10000) for real applications
P = 100

# Seed the permutations so that re-running the notebook reproduces the same p-values.
SEED = 0
rng = np.random.default_rng(SEED)

# extreme_counts[j] = number of permutations in which |coefficient j| was at
# least as large as the magnitude of the coefficient learned from the true labels.
extreme_counts = np.zeros(len(true_coefficients), dtype=int)

for trial in tqdm(range(P)):
    # permute the values of Y so that they're now attached to random data points in X
    # (rng.permutation returns a shuffled copy, so y_train itself is never modified)
    y_permuted = rng.permutation(y_train)

    # train logistic regression on that permuted dataset
    permuted_logreg = train(x_train_feats, y_permuted, le)
    coefficients = permuted_logreg.coef_[0]

    # test how often the coefficients learned from the permuted data are as extreme as
    # the coefficients from the true data (ties count as "as extreme", hence >=)
    extreme_counts += np.abs(coefficients) >= true_magnitudes

# Dividing the integer counts once avoids accumulating rounding error from repeated 1/P additions.
p_values = extreme_counts / P

In [None]:
# Map feature id -> feature string so each coefficient can be written with its feature.
inverse_vocab = {v: k for k, v in vectorizer.vocabulary_.items()}

# Write one tab-separated line per feature: coefficient, feature, p-value.
# The context manager closes the file even if a write fails, and the explicit
# encoding keeps non-ASCII tokens from depending on the platform default.
with open("weights.txt", "w", encoding="utf-8") as out:
    for idx, coef in enumerate(true_coefficients):
        out.write("%.3f\t%s\t%.5f\n" % (coef, inverse_vocab[idx], p_values[idx]))

In [None]:
# Print the lowest- and highest-weighted features (by true coefficient) for each class,
# each alongside its permutation-test p-value.
analyze_weights(true_coefficients, le, vectorizer.vocabulary_, p_values)