This notebook contextualizes accuracy against a majority class baseline, and analyzes the most important features for classification.

In [None]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn import linear_model
import numpy as np

In [None]:
def read_data(filename):
    """Read a tab-separated file of labeled texts.

    Each non-blank line is expected to hold the label in the first column
    and the (already tokenized) text in the second column; any further
    columns are ignored.

    Args:
        filename: path of the UTF-8 encoded TSV file to read.

    Returns:
        A pair (X, Y) where X is the list of texts and Y the list of labels,
        both in file order.

    Raises:
        IndexError: if a non-blank line has no tab-separated second column.
    """
    X=[]
    Y=[]
    with open(filename, encoding="utf-8") as file:
        for line in file:
            stripped=line.rstrip()
            # Blank lines (e.g. a trailing empty line at the end of the file)
            # carry no example; skip them instead of failing on cols[1].
            if not stripped:
                continue
            cols=stripped.split("\t")
            label=cols[0]
            # sample text is already tokenized; if yours is not, do so here
            text=cols[1]
            X.append(text)
            Y.append(label)
    return X, Y

In [None]:
# Change this to the directory with your data (from the CheckData_TODO.ipynb exercise).
# The directory should contain train.tsv, dev.tsv and test.tsv.
# The path is joined with "/<split>.tsv" when the data is loaded, so leave off a trailing slash.
directory="../data/text_classification_sample_data"

In [None]:
# Load the training and development splits as parallel lists of texts (X) and labels (Y).
trainX, trainY=read_data("%s/train.tsv" % directory)
devX, devY=read_data("%s/dev.tsv" % directory)

Q1: Implement, for your data, the majority class baseline that we went over in `Hyperparameters.ipynb`.

In [None]:
def majority_class(trainY, devY):
    """Evaluate the majority class baseline.

    The most frequent label in the training data is predicted for every
    development item. If several labels are tied for most frequent, the one
    that appears first in trainY is used.

    Args:
        trainY: sequence of training labels.
        devY: sequence of gold development labels.

    Returns:
        A pair (predictions, accuracy): the list of predicted labels (one per
        item in devY) and the fraction of them that match devY. The accuracy
        is 0.0 when devY is empty.

    Raises:
        IndexError: if trainY is empty, since there is no majority label.
    """
    # Find the most frequent label in the training data
    label_counts = Counter(trainY)
    majority_label = label_counts.most_common(1)[0][0]

    # Predict the majority label for all items in dev set
    predictions = [majority_label] * len(devY)

    # Nothing to score: avoid dividing by zero below
    if not predictions:
        return predictions, 0.0

    # Calculate accuracy
    correct = sum(p == y for p, y in zip(predictions, devY))
    accuracy = correct / len(predictions)

    return predictions, accuracy

In [None]:
# p holds the baseline predictions for the dev set, a the baseline accuracy.
p, a = majority_class(trainY,devY)
# Report the baseline so it can be compared with the classifier's dev accuracy.
print("Majority class accuracy: %.3f" % a)

Q2: After experimenting with hyperparameter choices in class, what is the best accuracy that you uncovered on your development data?  Which hyperparameter choices led to that accuracy?  Plug in the values here and execute the cell to yield the accuracy. 

In [None]:
# Map the string labels to integer ids. The encoder is fit on the training labels only,
# so transforming devY fails if dev contains a label that never occurs in training.
le = preprocessing.LabelEncoder()
le.fit(trainY)
Y_train=le.transform(trainY)
Y_dev=le.transform(devY)

# split the string on whitespace because we assume it has already been tokenized
# binary=True records whether a term is present rather than how often it occurs;
# max_features caps the vocabulary at the 10000 most frequent terms.
vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, lowercase=False, strip_accents=None, binary=True)

# Learn the vocabulary from the training text only, then reuse it to featurize dev.
X_train = vectorizer.fit_transform(trainX)
X_dev = vectorizer.transform(devX)
# C is the inverse regularization strength: smaller values mean a stronger L2 penalty.
logreg = linear_model.LogisticRegression(C=0.1, solver='lbfgs', penalty='l2')
# fit() returns the estimator itself, so `model` and `logreg` name the same fitted object.
model=logreg.fit(X_train, Y_train)
print("Accuracy: %.3f" % logreg.score(X_dev, Y_dev))

Q3: For binary classification using logistic regression, the parameters of the learned model are given in `model.coef_[0]`.  Print out the 25 features that are most associated with each class (i.e., the 25 parameters that have the largest positive values and the 25 parameters that have the largest negative values).  For reference, consider the `inverse_transform` method of [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder.transform) to get the class labels that correspond to positive (=1) and negative (=0), and the `vocabulary_` attribute of [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html), which maps each vocabulary term to its feature index.


In [None]:
def analyze_weights(learned_model, label_encoder, count_vectorizer, num_features=25):
    """Print the features most strongly associated with each class.

    Reads the weight vector `learned_model.coef_[0]` and prints the
    `num_features` terms with the largest weights (labeled with encoded
    class 1) followed by the `num_features` terms with the smallest weights
    (labeled with encoded class 0), each with its weight.

    Args:
        learned_model: fitted model exposing `coef_`; only `coef_[0]` is used.
        label_encoder: object whose `inverse_transform([0, 1])` yields the
            class labels for encoded ids 0 and 1.
        count_vectorizer: object whose `vocabulary_` maps each term to its
            feature index.
        num_features: how many features to print per class (default 25).
            If fewer features exist, all of them are printed.

    Returns:
        None; the results are printed.

    Raises:
        ValueError: if num_features is negative.
    """
    if num_features < 0:
        raise ValueError("num_features must be non-negative")

    # Get the learned coefficients for class 1 (positive class)
    coefs = learned_model.coef_[0]

    # Get vocabulary mapping index -> feature
    vocab = np.array([term for term, idx in sorted(count_vectorizer.vocabulary_.items(), key=lambda x: x[1])])

    # Identify the class labels (0 and 1)
    class_labels = label_encoder.inverse_transform([0, 1])

    # Sort once, ascending by weight, and read both ends of the ordering.
    order = np.argsort(coefs)

    # Most positive weights first (associated with class 1). Reversing before
    # slicing keeps num_features=0 empty (order[-0:] would select everything).
    top_pos_idx = order[::-1][:num_features]
    top_pos_terms = vocab[top_pos_idx]

    # Most negative weights first (associated with class 0)
    top_neg_idx = order[:num_features]
    top_neg_terms = vocab[top_neg_idx]

    # Print results
    print(f"\nTop {num_features} features for class '{class_labels[1]}' (positive weights):")
    for term, weight in zip(top_pos_terms, coefs[top_pos_idx]):
        print(f"{term:20s} {weight:.4f}")

    print(f"\nTop {num_features} features for class '{class_labels[0]}' (negative weights):")
    for term, weight in zip(top_neg_terms, coefs[top_neg_idx]):
        print(f"{term:20s} {weight:.4f}")

In [None]:
# Print the highest- and lowest-weighted features of the fitted model, using the
# label encoder and vectorizer that were fit on the training data.
analyze_weights(model, le, vectorizer)