[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dbamman/anlp24/blob/main/6.tests/PermutationTest.ipynb)

This notebook explores the use of the permutation test to assess the significance of the coefficients learned in logistic regression (testing against the null hypothesis that each $\beta = 0$).

In [None]:
import sys
from sklearn import preprocessing
from sklearn import linear_model
from random import choices
from sklearn.feature_extraction.text import CountVectorizer
from random import shuffle
import numpy as np
import copy

In [None]:
# get LMRD data
# Downloads the train/dev/test TSV files from the anlp24 GitHub repository and
# saves them in the working directory with an "lmrd_" filename prefix.
!wget https://raw.githubusercontent.com/dbamman/anlp24/refs/heads/main/data/lmrd/train.tsv -O lmrd_train.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp24/refs/heads/main/data/lmrd/dev.tsv -O lmrd_dev.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp24/refs/heads/main/data/lmrd/test.tsv -O lmrd_test.tsv

In [None]:
# get Convote data
# Downloads the train/dev/test TSV files from the anlp24 GitHub repository and
# saves them in the working directory with a "convote_" filename prefix.
!wget https://raw.githubusercontent.com/dbamman/anlp24/refs/heads/main/data/convote/train.tsv -O convote_train.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp24/refs/heads/main/data/convote/dev.tsv -O convote_dev.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp24/refs/heads/main/data/convote/test.tsv -O convote_test.tsv

In [None]:
# get LoC data
# Downloads the train/dev/test TSV files from the anlp24 GitHub repository and
# saves them in the working directory with a "loc_" filename prefix.
!wget https://raw.githubusercontent.com/dbamman/anlp24/refs/heads/main/data/loc/train.tsv -O loc_train.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp24/refs/heads/main/data/loc/dev.tsv -O loc_dev.tsv
!wget https://raw.githubusercontent.com/dbamman/anlp24/refs/heads/main/data/loc/test.tsv -O loc_test.tsv

In [None]:
def read_data(filename):
    """Read a tab-separated file of labeled texts.

    Each non-blank line is expected to hold the label in the first column and
    the text in the second column; any further columns are ignored. Blank
    lines (e.g., an empty line at the end of the file) are skipped.

    Args:
        filename: path to a UTF-8 encoded TSV file.

    Returns:
        A pair (X, Y) of parallel lists in file order: X holds the texts and
        Y holds the corresponding labels (both as strings).

    Raises:
        IndexError: if a non-blank line has fewer than two tab-separated columns.
    """
    X=[]
    Y=[]
    with open(filename, encoding="utf-8") as file:
        for line in file:
            stripped=line.rstrip()
            # a blank line has no label/text columns; skip it instead of
            # crashing on the missing second column
            if not stripped:
                continue
            cols=stripped.split("\t")
            label=cols[0]
            text=cols[1]
            # assumes text is already tokenized
            X.append(text)
            Y.append(label)
    return X, Y

In [None]:
# Change this to the name of the dataset to properly read in files below.
# It is used as the filename prefix ("<data>_train.tsv", "<data>_dev.tsv"), so it
# should match one of the prefixes written by the download cells above:
# "lmrd", "convote", or "loc".
data="convote"

In [None]:
# Load the train and dev splits of the selected dataset; each call returns
# parallel lists of texts (X) and labels (Y).
trainX, trainY=read_data("%s_train.tsv" % data)
devX, devY=read_data("%s_dev.tsv" % data)

In [None]:
def featurize(trainX, devX):
    """Turn the training and dev texts into binary bag-of-words matrices.

    Texts are tokenized with `str.split` (whitespace), features are binary
    indicators, and the vocabulary is limited to at most 10000 features.
    The vocabulary is fit on the training texts only and then applied to
    the dev texts.

    Returns:
        (train matrix, dev matrix, fitted CountVectorizer)
    """
    bow = CountVectorizer(
        max_features=10000,
        analyzer=str.split,
        lowercase=False,
        strip_accents=None,
        binary=True,
    )

    # fit the vocabulary on train, then reuse it unchanged for dev
    train_matrix = bow.fit_transform(trainX)
    dev_matrix = bow.transform(devX)

    return train_matrix, dev_matrix, bow

In [None]:
def train(X_train, trainY, le):
    """Fit an L2-regularized logistic regression classifier.

    Args:
        X_train: feature matrix for the training examples.
        trainY: labels for the training examples, in the form accepted by
            `le.transform`.
        le: fitted label encoder used to map `trainY` to integer classes.

    Returns:
        The fitted LogisticRegression model; callers read the learned
        weights from its `coef_` attribute.
    """
    Y_train=le.transform(trainY)
    logreg = linear_model.LogisticRegression(C=100, solver='lbfgs', penalty='l2', max_iter=10000)
    logreg.fit(X_train, Y_train)
    return logreg

In [None]:
def test(logreg, devX_feat, devY, le):
    Y_dev=le.transform(devY)
    print("Accuracy: %.3f" % logreg.score(devX_feat, Y_dev))

In [None]:
def analyze_weights(coefs, label_encoder, vocab, p_values):
    """Print the most extreme coefficients for each of the two classes.

    For class 0, prints the (up to) 25 smallest coefficients in ascending
    order; for class 1, the (up to) 25 largest in descending order. Each
    section starts with the class name obtained from
    `label_encoder.inverse_transform`, followed by one tab-separated line per
    feature: coefficient, term, p-value.

    Args:
        coefs: one coefficient per feature index.
        label_encoder: object whose `inverse_transform` maps class ids to names.
        vocab: mapping from term to feature index.
        p_values: one p-value per feature index, aligned with `coefs`.
    """
    # feature index -> term
    index_to_term = dict((index, term) for term, index in vocab.items())

    ascending = np.argsort(coefs)
    row_format = "%.5f\t%s\t%.4f"

    def show(class_id, feature_indices):
        print(label_encoder.inverse_transform([class_id])[0])
        for j in feature_indices:
            print(row_format % (coefs[j], index_to_term[j], p_values[j]))

    # most negative weights first for class 0
    show(0, ascending[:25])
    # most positive weights first for class 1
    show(1, ascending[-25:][::-1])

In [None]:
# Build the feature matrices and encode the string labels as integer classes.
X_train, X_dev, vectorizer=featurize(trainX, devX)
le = preprocessing.LabelEncoder()
le.fit(trainY)

# Train on the true labels and report dev accuracy.
logreg=train(X_train, trainY, le)
test(logreg, X_dev, devY, le)

# Observed test statistics: one coefficient per feature.
true_coefficients=logreg.coef_[0]

# We'll set P=100 here to finish running in class, but set higher (e.g., 10000) for real applications
P=100

# Seed the permutations so that re-running the notebook reproduces the same p-values.
SEED=1
rng=np.random.default_rng(SEED)

abs_true_coefficients=np.abs(true_coefficients)

# exceed_counts[j] = number of permutations in which |coefficient j| learned from the
# permuted labels was at least as large as |coefficient j| learned from the true labels
exceed_counts=np.zeros(len(true_coefficients), dtype=int)

for i in range(P):
    if i % 10 == 0:
        print(i)

    # permute the values of Y so that they're now attached to random data points in X
    order=rng.permutation(len(trainY))
    permutedY=[trainY[j] for j in order]

    # train logistic regression on that permuted dataset
    permuted_logreg=train(X_train, permutedY, le)
    coefficients=permuted_logreg.coef_[0]

    # test how often the coefficients learned from the permuted data are at least as
    # extreme as the coefficients from the true data (two-sided, hence the absolute values)
    exceed_counts+=(np.abs(coefficients) >= abs_true_coefficients)

# p-value = fraction of permutations that were at least as extreme as the observed value;
# dividing the integer counts once avoids accumulating floating-point error
p_values=exceed_counts/P

In [None]:
# Write every feature's coefficient, term and permutation p-value to weights.txt,
# one tab-separated line per feature, in feature-index order.
inverse_vocab = {v: k for k, v in vectorizer.vocabulary_.items()}
# The context manager closes the file even if a write fails; UTF-8 is set explicitly
# because the terms come from text that was read as UTF-8.
with open("weights.txt", "w", encoding="utf-8") as out:
    for idx, coef in enumerate(true_coefficients):
        out.write("%.3f\t%s\t%.5f\n" % (coef, inverse_vocab[idx], p_values[idx]))

In [None]:
# Print the 25 most negative and 25 most positive coefficients, each with its
# term and permutation p-value, grouped under the corresponding class label.
analyze_weights(true_coefficients, le, vectorizer.vocabulary_, p_values)