This notebook evaluates different methods for tokenization and stemming/lemmatization
and assesses their impact on binary sentiment classification, using a train/dev sample of 1000 reviews from the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/).  Each tokenization method is evaluated with the same learning algorithm ($\ell_2$-regularized logistic regression); the only difference is the tokenization process. For more, see: http://sentiment.christopherpotts.net/tokenizing.html

In [None]:
import nltk
import spacy
from nltk.stem.porter import *
from TokenizationTest import TokenizationTest
from happyfuntokenizing import Tokenizer as potts

In [None]:
# spaCy lemmatization needs the tagger, so keep it and skip the named-entity
# recognizer and the dependency parser.  `disable` expects one list entry per
# component name; a single comma-joined string such as 'ner,parser' matches
# no component and therefore disables nothing.  With the names listed
# separately the components are excluded at load time, so no follow-up
# remove_pipe() calls are needed.
nlp = spacy.load('en', disable=['ner', 'parser'])

# load NLTK porter stemmer
stemmer = PorterStemmer()

# load Potts sentiment tokenizer
potts_tokenizer = potts()

In [None]:
def spacy_tokenizer(data):
    """Return the surface text of every token spaCy finds in ``data``.

    The text is run through the module-level ``nlp`` pipeline and the
    ``text`` attribute of each resulting token is collected in order.
    """
    surface_forms = []
    for tok in nlp(data):
        surface_forms.append(tok.text)
    return surface_forms

def spacy_lemmatizer(data):
    """Return the lemma of every token spaCy finds in ``data``.

    The text is run through the module-level ``nlp`` pipeline and the
    ``lemma_`` attribute of each resulting token is collected in order.
    """
    lemmas = []
    for tok in nlp(data):
        lemmas.append(tok.lemma_)
    return lemmas

In [None]:
# Build the evaluation harness from the train and dev files; every
# tester.evaluate(...) call in this notebook reuses this one object.
tester=TokenizationTest("../data/sentiment.1000.train.txt", "../data/sentiment.1000.dev.txt")

In [None]:
# Baseline: Python's built-in str.split as the tokenizer (splits on runs of
# whitespace when called with only the text).
tester.evaluate(str.split)

In [None]:
# Porter stemmer.
# NOTE(review): PorterStemmer.stem stems a single word and returns one string,
# not a list of tokens -- confirm that evaluate() tokenizes the text before
# applying this callable; otherwise the whole review is stemmed as one "word".
tester.evaluate(stemmer.stem)

In [None]:
# NLTK's word_tokenize as the tokenizer.
tester.evaluate(nltk.word_tokenize)

In [None]:
# spaCy tokenization: the text of each token produced by the nlp pipeline.
tester.evaluate(spacy_tokenizer)

In [None]:
# spaCy lemmatization: the lemma of each token produced by the nlp pipeline.
tester.evaluate(spacy_lemmatizer)

In [None]:
# Potts sentiment tokenizer (happyfuntokenizing.Tokenizer).
tester.evaluate(potts_tokenizer.tokenize)