[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dbamman/anlp24/blob/main/1.words/EvaluateTokenizationForSentiment.ipynb)

This notebook evaluates different methods for tokenization and stemming/lemmatization
and assesses the impact on binary sentiment classification, using a train/dev sample of 1000 reviews from the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/).  Each tokenization method is evaluated with the same learning algorithm ($\ell_2$-regularized logistic regression); the only difference is the tokenization process. For more, see: http://sentiment.christopherpotts.net/tokenizing.html

In [None]:
# Download the code and data used by this notebook into the working directory:
# the Potts tokenizer module (imported below as `happyfuntokenizing`) and the
# train/dev files that are passed to TokenizationTest.
!wget https://raw.githubusercontent.com/dbamman/anlp24/main/1.words/happyfuntokenizing.py
!wget https://raw.githubusercontent.com/dbamman/anlp24/main/data/sentiment.1000.train.txt
!wget https://raw.githubusercontent.com/dbamman/anlp24/main/data/sentiment.1000.dev.txt

In [None]:
import nltk
nltk.download('punkt')
import spacy
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn import linear_model
from happyfuntokenizing import Tokenizer as potts

In [None]:
class TokenizationTest():
    """Measure how a tokenizer affects binary sentiment classification.

    Every tokenizer is run through the same pipeline: the texts of the
    training and dev files are tokenized, turned into binary bag-of-words
    features, and fed to an L2-regularized logistic regression.  Only the
    tokenizer varies between runs, so accuracy differences are attributable
    to tokenization.
    """

    def __init__(self, trainFile, devFile):
        """Store the paths of the training and dev files.

        Args:
            trainFile: path to the training data (one ``label<TAB>text``
                record per line).
            devFile: path to the dev data, in the same format.
        """
        self.trainFile = trainFile
        self.devFile = devFile

    def read_data(self, filename, tokenizer):
        """Read a ``label<TAB>text`` file and tokenize every text.

        Args:
            filename: path of the UTF-8 file to read.
            tokenizer: callable taking the text of one record and returning
                an iterable of string tokens.

        Returns:
            A ``(corpus, Y)`` pair of equal-length lists: ``corpus`` holds
            each record's tokens joined by single spaces, ``Y`` holds the
            corresponding labels.

        Raises:
            ValueError: if a non-blank line has no tab separating the label
                from the text.
        """
        corpus = []
        Y = []
        with open(filename, encoding="utf-8") as file:
            for line_number, line in enumerate(file, start=1):
                record = line.rstrip()
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no record; skip them instead of failing on a missing column.
                if not record:
                    continue

                # Split on the first tab only so that any tab inside the text
                # stays part of the text instead of truncating it.
                label, separator, text = record.partition("\t")
                if not separator:
                    raise ValueError(
                        "%s:%d: expected 'label<TAB>text', got %r"
                        % (filename, line_number, record)
                    )

                corpus.append(' '.join(tokenizer(text)))
                Y.append(label)

        return corpus, Y

    def evaluate(self, tokenizer):
        """Train on the training file, score on the dev file, print accuracy.

        Args:
            tokenizer: callable taking a text and returning an iterable of
                string tokens; passed through to ``read_data``.
        """
        train_corpus, train_labels = self.read_data(self.trainFile, tokenizer)
        dev_corpus, dev_labels = self.read_data(self.devFile, tokenizer)

        # read_data already joined the tokens with spaces, so str.split as the
        # analyzer recovers them without any further normalization
        # (no lowercasing, no accent stripping).  binary=True records token
        # presence rather than counts.
        vectorizer = CountVectorizer(max_features=10000, analyzer=str.split, lowercase=False, strip_accents=None, binary=True)
        X_train = vectorizer.fit_transform(train_corpus)
        X_dev = vectorizer.transform(dev_corpus)

        # The label encoding is fit on the training labels only.
        le = preprocessing.LabelEncoder()
        le.fit(train_labels)

        Y_train = le.transform(train_labels)
        Y_dev = le.transform(dev_labels)

        logreg = linear_model.LogisticRegression(C=1.0, solver='lbfgs', penalty='l2')
        logreg.fit(X_train, Y_train)

        # Not every callable has __name__ (e.g. functools.partial objects or
        # instances defining __call__); fall back to repr for those.
        tokenizer_name = getattr(tokenizer, "__name__", repr(tokenizer))
        print("Function '%s' Accuracy: %.3f" % (tokenizer_name, logreg.score(X_dev, Y_dev)))

In [None]:
# spaCy lemmatization needs the tagger, but NER and the parser are not needed.
# Each component name must be its own list element: a single 'ner,parser'
# string matches no component, so nothing would be disabled at load time.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# load NLTK porter stemmer
stemmer = PorterStemmer()

# load Potts sentiment tokenizer
potts_tokenizer=potts()

In [None]:
def spacy_tokenizer(data):
    """Run the spaCy pipeline on ``data`` and return each token's text."""
    return [tok.text for tok in nlp(data)]

def spacy_lemmatizer(data):
    """Run the spaCy pipeline on ``data`` and return each token's lemma."""
    return [tok.lemma_ for tok in nlp(data)]

In [None]:
# One shared harness: every tokenizer below is trained and scored on the same
# train/dev files.
tester=TokenizationTest("sentiment.1000.train.txt", "sentiment.1000.dev.txt")

In [None]:
# Whitespace tokenization with Python's built-in str.split.
tester.evaluate(str.split)

In [None]:
# PorterStemmer.stem expects a single word and returns a single string.
# Passing it directly as the tokenizer stems the whole review as one "word",
# and the ' '.join in read_data then breaks that string into characters.
# Tokenize first, then stem token by token.
def porter_stemmer(data):
    """Tokenize ``data`` with NLTK and return the Porter stem of each token."""
    return [stemmer.stem(token) for token in nltk.word_tokenize(data)]

tester.evaluate(porter_stemmer)

In [None]:
# NLTK's word_tokenize.
tester.evaluate(nltk.word_tokenize)

In [None]:
# spaCy tokenization, using each token's text.
tester.evaluate(spacy_tokenizer)

In [None]:
# spaCy tokenization, using each token's lemma instead of its text.
tester.evaluate(spacy_lemmatizer)

In [None]:
# Potts sentiment tokenizer from happyfuntokenizing.py.
tester.evaluate(potts_tokenizer.tokenize)