In [1]:
from Corpora import MovieReviewCorpus
from Lexicon import SentimentLexicon
from Statistics import SignTest
from Classifiers import NaiveBayesText, SVMText
# from Extensions import SVMDoc2Vec

In [2]:
# retrieve corpus
corpus=MovieReviewCorpus(stemming=False,pos=False)

# use sign test for all significance testing
signTest=SignTest()

print("--- classifying reviews using sentiment lexicon  ---")

# read in lexicon
lexicon=SentimentLexicon()

# on average there are more positive than negative words per review (~7.13 more positive than negative per review)
# to take this bias into account will use threshold (roughly the bias itself) to make it harder to classify as positive
threshold=8

# NOTE: from katie to self -- play w/ changing the threshold value! 

# question 0.1
lexicon.classify(corpus.reviews,threshold,magnitude=False)
token_preds=lexicon.predictions
print(f"token-only results: {lexicon.getAccuracy():.2f}")

lexicon.classify(corpus.reviews,threshold,magnitude=True)
magnitude_preds=lexicon.predictions
print(f"magnitude results: {lexicon.getAccuracy():.2f}")

# question 0.2
p_value=signTest.getSignificance(token_preds,magnitude_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"magnitude lexicon results are {significance} with respect to token-only")

num train: 1800, num test: 200
tot num reviews: 2000
--- classifying reviews using sentiment lexicon  ---
token-only results: 0.68
magnitude results: 0.69
magnitude lexicon results are not significant with respect to token-only


In [3]:
# question 1.0
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus.train)
NB.test(corpus.test)
# store predictions from classifier
non_smoothed_preds=NB.predictions
print(f"Accuracy without smoothing: {NB.getAccuracy():.2f}")

--- classifying reviews using Naive Bayes on held-out test set ---


  cond_prob += np.log(self.condProb[token][sent])


Accuracy without smoothing: 0.51


In [4]:
NB.vocabulary

{('beats', 'NNS'),
 ('Ming-Liang', 'NNP'),
 ('shirt', 'NN'),
 ('tantrum', 'NN'),
 ('goldmine', 'NN'),
 ('Levis', 'NNP'),
 ('organize', 'VB'),
 ('broadside', 'NN'),
 ('enlightenment', 'NN'),
 ('depth', 'NN'),
 ('Pebble', 'NNP'),
 ('sanctuary', 'NN'),
 ('all-American', 'JJ'),
 ('implausible', 'JJ'),
 ('canoes', 'NNS'),
 ('head-shaving', 'JJ'),
 ('reeling', 'VBG'),
 ('Vicki', 'NNP'),
 ('agenda', 'NN'),
 ('Boston', 'NNP'),
 ('meticulous', 'JJ'),
 ('dominate', 'VBP'),
 ('Fingal', 'JJ'),
 ('Fear', 'NN'),
 ('sneers', 'VBZ'),
 ('drizzling', 'VBG'),
 ('Salma', 'NNP'),
 ('warmly', 'RB'),
 ('interpret', 'VBP'),
 ('FORTRESS', 'NNP'),
 ('trigger-happy', 'JJ'),
 ('ditzy', 'JJ'),
 ('choir', 'NN'),
 ('Zsigmond', 'NNP'),
 ('inundate', 'VB'),
 ('murderer-on-the-loose', 'NN'),
 ('8/10', 'CD'),
 ('beset', 'VBD'),
 ('Quiz', 'NN'),
 ('Palestinians', 'NNPS'),
 ("O'Daniel", 'NNP'),
 ('relocate', 'VB'),
 ('boring', 'VBG'),
 ('Charlie', 'NNP'),
 ('phony', 'JJ'),
 ('west', 'NN'),
 ('Tahoe', 'NNP'),
 ('cringed', 

In [5]:
# question 2.0
# use smoothing
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus.train)
NB.test(corpus.test)
smoothed_preds=NB.predictions
# saving this for use later
num_non_stemmed_features=len(NB.vocabulary)
print(f"Accuracy using smoothing: {NB.getAccuracy():.2f}")


# question 2.1
# see if smoothing significantly improves results
p_value=signTest.getSignificance(non_smoothed_preds,smoothed_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing are {significance} with respect to no smoothing")

Accuracy using smoothing: 0.51
results using smoothing are not significant with respect to no smoothing


In [6]:
# question 3.0
print("--- classifying reviews using 10-fold cross-evaluation ---")
# using previous instantiated object
NB.crossValidate(corpus)
# using cross-eval for smoothed predictions from now on
smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.3f}")
print(f"Std. Dev: {NB.getStdDeviation()}")

--- classifying reviews using 10-fold cross-evaluation ---


ZeroDivisionError: float division by zero

In [None]:
# question 4.0
print("--- stemming corpus ---")
# retrieve corpus with tokenized text and stemming (using porter)
stemmed_corpus=MovieReviewCorpus(stemming=True,pos=False)
print("--- cross-validating NB using stemming ---")
NB.crossValidate(stemmed_corpus)
stemmed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.3f}")
print(f"Std. Dev: {NB.getStdDeviation():.3f}")

# TODO Q4.1
# see if stemming significantly improves results on smoothed NB

# TODO Q4.2
print("--- determining the number of features before/after stemming ---")

In [None]:
# question Q5.0
# cross-validate model using smoothing and bigrams
print("--- cross-validating naive bayes using smoothing and bigrams ---")
NB=NaiveBayesText(smoothing=True,bigrams=True,trigrams=False,discard_closed_class=False)
NB.crossValidate(corpus)
smoothed_and_bigram_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.2f}") 
print(f"Std. Dev: {NB.getStdDeviation():.2f}")


# see if bigrams significantly improves results on smoothed NB only
p_value=signTest.getSignificance(smoothed_preds,smoothed_and_bigram_preds)
signifance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing and bigrams are {signifance} with respect to smoothing only")


# TODO Q5.1

In [None]:
# TODO Q6 and 6.1
print("--- classifying reviews using SVM 10-fold cross-eval ---")

In [None]:
# TODO Q7
print("--- adding in POS information to corpus ---")
print("--- training svm on word+pos features ----")
print("--- training svm discarding closed-class words ---")

In [None]:
# question 8.0
print("--- using document embeddings ---")