In [1]:
!pip3 install gensim



In [2]:
from Corpora import MovieReviewCorpus
from Lexicon import SentimentLexicon
from Statistics import SignTest
from Classifiers import NaiveBayesText, SVMText
from Extensions import SVMDoc2Vec

In [3]:
# load the movie review corpus with stemming and POS tagging both switched off
corpus = MovieReviewCorpus(stemming=False, pos=False)

# a single SignTest instance is reused for every significance test in this notebook
signTest = SignTest()

print("--- classifying reviews using sentiment lexicon  ---")

# load the sentiment lexicon
lexicon = SentimentLexicon()

# reviews contain on average ~7.13 more positive than negative words, so a threshold of
# roughly that size is handed to classify() to make a positive label harder to reach
threshold = 8

# NOTE: from katie to self -- play w/ changing the threshold value!

# question 0.1 -- first run: magnitude=False
lexicon.classify(corpus.reviews, threshold, magnitude=False)
token_preds = lexicon.predictions
token_accuracy = lexicon.getAccuracy()
print(f"token-only results: {token_accuracy:.2f}")

# question 0.1 -- second run: magnitude=True
lexicon.classify(corpus.reviews, threshold, magnitude=True)
magnitude_preds = lexicon.predictions
magnitude_accuracy = lexicon.getAccuracy()
print(f"magnitude results: {magnitude_accuracy:.2f}")

# question 0.2 -- sign test between the two prediction sets, judged at the 0.05 level
p_value = signTest.getSignificance(token_preds, magnitude_preds)
if p_value < 0.05:
    significance = "significant"
else:
    significance = "not significant"
print(f"magnitude lexicon results are {significance} with respect to token-only")

num train: 1800, num test: 200
tot num reviews: 2000
--- classifying reviews using sentiment lexicon  ---
token-only results: 0.68
magnitude results: 0.69
magnitude lexicon results are not significant with respect to token-only


In [4]:
# question 1.0 -- Naive Bayes with every optional feature switched off,
# trained on corpus.train and evaluated on corpus.test
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB = NaiveBayesText(
    smoothing=False,
    bigrams=False,
    trigrams=False,
    discard_closed_class=False,
)
NB.train(corpus.train)
NB.test(corpus.test)
# keep the classifier's predictions under their own name for later use
non_smoothed_preds = NB.predictions
unsmoothed_accuracy = NB.getAccuracy()
print(f"Accuracy without smoothing: {unsmoothed_accuracy:.2f}")

--- classifying reviews using Naive Bayes on held-out test set ---
smoothing?  False  POS:  719037  NEG:  646842
{'POS': -3411.8404205609168, 'NEG': -3483.9242124898633}
{'POS': -3904.4050612516685, 'NEG': -3933.3041185215397}
{'POS': -5426.519556480591, 'NEG': -5416.484788757298}
{'POS': -1756.2657019089006, 'NEG': -1704.2820986463998}
{'POS': -6134.861559654649, 'NEG': -6100.001666783998}
{'POS': -1706.3584458514013, 'NEG': -1628.4310033821298}
{'POS': -4053.005790207271, 'NEG': -4027.7831086014266}
{'POS': -5637.70533362308, 'NEG': -5608.626191360286}
{'POS': -5224.421753868531, 'NEG': -4874.846820071161}
{'POS': -7735.000668824807, 'NEG': -7899.664025281486}
{'POS': -2706.05311862347, 'NEG': -2634.0381788795735}
{'POS': -1986.5811683610252, 'NEG': -1984.3682359086813}
{'POS': -7473.914957020433, 'NEG': -7526.203427480133}
{'POS': -3209.75893993538, 'NEG': -3200.9560147973}
{'POS': -2714.283829636839, 'NEG': -2679.5783104135558}
{'POS': -5286.052054221414, 'NEG': -5282.378999525864}

In [5]:
# question 2.0 -- same train/test run as before, this time with smoothing=True
NB = NaiveBayesText(
    smoothing=True,
    bigrams=False,
    trigrams=False,
    discard_closed_class=False,
)
NB.train(corpus.train)
NB.test(corpus.test)
smoothed_preds = NB.predictions
# vocabulary size of the classifier trained on the non-stemmed corpus, kept for later use
num_non_stemmed_features = len(NB.vocabulary)
smoothed_accuracy = NB.getAccuracy()
print(f"Accuracy using smoothing: {smoothed_accuracy:.2f}")


# question 2.1 -- sign test: unsmoothed vs. smoothed predictions, judged at the 0.05 level
p_value = signTest.getSignificance(non_smoothed_preds, smoothed_preds)
if p_value < 0.05:
    significance = "significant"
else:
    significance = "not significant"
print(f"results using smoothing are {significance} with respect to no smoothing")

smoothing?  True  POS:  2084916  NEG:  2012721
{'POS': -3933.480224718453, 'NEG': -4044.683746086524}
{'POS': -4528.2436841829185, 'NEG': -4601.902023963708}
{'POS': -6252.62667490151, 'NEG': -6291.67831184035}
{'POS': -2028.7958853054845, 'NEG': -1991.472709982339}
{'POS': -7078.071491253788, 'NEG': -7097.79039154406}
{'POS': -1959.7263507279154, 'NEG': -1889.5133773238945}
{'POS': -4701.329546803055, 'NEG': -4715.678406900162}
{'POS': -6539.396995095384, 'NEG': -6563.279567382131}
{'POS': -6020.720883480434, 'NEG': -5692.148164584504}
{'POS': -8902.835087520334, 'NEG': -9164.21082776467}
{'POS': -3127.6232460650594, 'NEG': -3075.6079330680327}
{'POS': -2283.596485422149, 'NEG': -2301.0725069076034}
{'POS': -8633.232807484841, 'NEG': -8765.777133325373}
{'POS': -3686.6867608794055, 'NEG': -3709.49907360561}
{'POS': -3142.241383251785, 'NEG': -3132.4994721647004}
{'POS': -6095.1260361800505, 'NEG': -6139.410270508621}
{'POS': -6341.296581867678, 'NEG': -6033.489019071698}
{'POS': -2328

In [6]:
# question 3.0
print("--- classifying reviews using 10-fold cross-evaluation ---")
# NOTE(review): the output saved with this cell is "ZeroDivisionError: float division by
# zero", raised after the banner above and before the accuracy line was printed. The
# traceback is not saved, so which call divides by zero still has to be confirmed.
# reuse the NB object created in the question 2.0 cell (smoothing=True, no bigrams/trigrams)
NB.crossValidate(corpus)
# from here on smoothed_preds holds the cross-evaluation predictions, replacing the
# held-out test predictions stored under the same name in the question 2.0 cell
smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.3f}")
# report the standard deviation with the same three-decimal precision as the accuracy
print(f"Std. Dev: {NB.getStdDeviation():.3f}")

--- classifying reviews using 10-fold cross-evaluation ---


ZeroDivisionError: float division by zero

In [None]:
# question 4.0
print("--- stemming corpus ---")
# rebuild the corpus with tokenized text and stemming switched on (using porter); POS stays off
stemmed_corpus = MovieReviewCorpus(stemming=True, pos=False)
print("--- cross-validating NB using stemming ---")
# cross-validate whichever classifier the earlier cells left bound to NB
NB.crossValidate(stemmed_corpus)
stemmed_preds = NB.predictions
stemmed_accuracy = NB.getAccuracy()
print(f"Accuracy: {stemmed_accuracy:.3f}")
stemmed_std_dev = NB.getStdDeviation()
print(f"Std. Dev: {stemmed_std_dev:.3f}")

# TODO Q4.1
# test whether stemming significantly improves results on smoothed NB

# TODO Q4.2
print("--- determining the number of features before/after stemming ---")

In [None]:
# question Q5.0
# cross-validate a Naive Bayes model with smoothing=True and bigrams=True
print("--- cross-validating naive bayes using smoothing and bigrams ---")
NB=NaiveBayesText(smoothing=True,bigrams=True,trigrams=False,discard_closed_class=False)
NB.crossValidate(corpus)
smoothed_and_bigram_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.2f}")
print(f"Std. Dev: {NB.getStdDeviation():.2f}")


# see if bigrams significantly improves results on smoothed NB only
# (smoothed_preds must already hold the predictions of the smoothing-only model,
# assigned in an earlier cell, before this cell is run)
p_value=signTest.getSignificance(smoothed_preds,smoothed_and_bigram_preds)
# spelled "significance" to match the variable used by the other significance checks
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing and bigrams are {significance} with respect to smoothing only")


# TODO Q5.1

In [None]:
# TODO Q6 and 6.1
# Not implemented yet: this cell currently only prints its section banner.
print("--- classifying reviews using SVM 10-fold cross-eval ---")

In [None]:
# TODO Q7
# Not implemented yet: the three section banners are emitted one per line by a single call.
print(
    "--- adding in POS information to corpus ---",
    "--- training svm on word+pos features ----",
    "--- training svm discarding closed-class words ---",
    sep="\n",
)

In [None]:
# question 8.0
# Only the section banner is printed here; no embedding code is present in this cell.
# NOTE(review): presumably SVMDoc2Vec (imported from Extensions) is meant to be used here -- confirm.
print("--- using document embeddings ---")