In [1]:
!pip3 install gensim



In [1]:
from Corpora import MovieReviewCorpus
from Lexicon import SentimentLexicon
from Statistics import SignTest
from Classifiers import NaiveBayesText, SVMText
from Extensions import SVMDoc2Vec

In [2]:
# Load the movie-review corpus without stemming and without POS tags.
corpus = MovieReviewCorpus(stemming=False, pos=False)

# One SignTest instance is shared by every significance test in the notebook.
signTest = SignTest()

print("--- classifying reviews using sentiment lexicon  ---")

# Load the sentiment lexicon.
lexicon = SentimentLexicon()

# Reviews contain on average ~7.13 more positive than negative words.
# The threshold (roughly that bias) makes it harder to classify a review as positive.
threshold = 8

# NOTE: from katie to self -- play w/ changing the threshold value!

# question 0.1
# First pass: magnitude=False (reported below as "token-only").
lexicon.classify(corpus.reviews, threshold, magnitude=False)
# NOTE(review): this keeps a reference to lexicon.predictions, not a copy; it
# assumes the next classify() call rebinds the attribute rather than mutating
# the same list in place -- confirm in Lexicon.
token_preds = lexicon.predictions
token_accuracy = lexicon.getAccuracy()
print(f"token-only results: {token_accuracy:.2f}")

# Second pass: same reviews and threshold, magnitude=True.
lexicon.classify(corpus.reviews, threshold, magnitude=True)
magnitude_preds = lexicon.predictions
magnitude_accuracy = lexicon.getAccuracy()
print(f"magnitude results: {magnitude_accuracy:.2f}")

# question 0.2
# Sign test between the two prediction sets, judged at the 0.05 level.
p_value = signTest.getSignificance(token_preds, magnitude_preds)
if p_value < 0.05:
    significance = "significant"
else:
    significance = "not significant"
print(f"magnitude lexicon results are {significance} with respect to token-only")

num train: 1800, num test: 200
tot num reviews: 2000
--- classifying reviews using sentiment lexicon  ---
token-only results: 0.68
magnitude results: 0.69
magnitude lexicon results are not significant with respect to token-only


In [3]:
# Sanity check: sizes of the train and test splits, displayed as a tuple.
tuple(len(split) for split in (corpus.train, corpus.test))

(1800, 200)

In [5]:
# question 1.0
print("--- classifying reviews using Naive Bayes on held-out test set ---")
# Baseline configuration: every optional flag of the classifier is switched off.
NB = NaiveBayesText(
    smoothing=False,
    bigrams=False,
    trigrams=False,
    discard_closed_class=False,
)
# Train on the training split, then predict on the held-out test split.
NB.train(corpus.train)
NB.test(corpus.test)
# Keep these predictions for the significance test against the smoothed model.
non_smoothed_preds = NB.predictions
unsmoothed_accuracy = NB.getAccuracy()
print(f"Accuracy without smoothing: {unsmoothed_accuracy:.2f}")

--- classifying reviews using Naive Bayes on held-out test set ---
VOCAB FOR ALAN!!  60261
smoothing?  False  POS:  719037  NEG:  646842
{'POS': -3411.8404205609168, 'NEG': -3483.9242124898633}
{'POS': -3904.4050612516685, 'NEG': -3933.3041185215397}
{'POS': -5426.519556480591, 'NEG': -5416.484788757298}
{'POS': -1756.2657019089006, 'NEG': -1704.2820986463998}
{'POS': -6134.861559654649, 'NEG': -6100.001666783998}
{'POS': -1706.3584458514013, 'NEG': -1628.4310033821298}
{'POS': -4053.005790207271, 'NEG': -4027.7831086014266}
{'POS': -5637.70533362308, 'NEG': -5608.626191360286}
{'POS': -5224.421753868531, 'NEG': -4874.846820071161}
{'POS': -7735.000668824807, 'NEG': -7899.664025281486}
{'POS': -2706.05311862347, 'NEG': -2634.0381788795735}
{'POS': -1986.5811683610252, 'NEG': -1984.3682359086813}
{'POS': -7473.914957020433, 'NEG': -7526.203427480133}
{'POS': -3209.75893993538, 'NEG': -3200.9560147973}
{'POS': -2714.283829636839, 'NEG': -2679.5783104135558}
{'POS': -5286.052054221414, 'N

{'POS': -12362.294477907013, 'NEG': -12232.928297314165}
{'POS': -5785.990907232294, 'NEG': -5650.771981382784}
{'POS': -5340.585210506206, 'NEG': -5204.757811439963}
{'POS': -9016.14900005623, 'NEG': -8875.05731961111}
{'POS': -4000.598465435308, 'NEG': -3936.368715468309}
{'POS': -4829.522731379198, 'NEG': -4813.057835869202}
{'POS': -6155.378273479154, 'NEG': -6116.159925419023}
{'POS': -2213.3006997778316, 'NEG': -2176.796442909646}
{'POS': -2149.3160135807434, 'NEG': -2117.890333000917}
{'POS': -1230.2971467875518, 'NEG': -1254.2132374681403}
{'POS': -2990.1609347376343, 'NEG': -2875.4535285359266}
{'POS': -3613.489412493413, 'NEG': -3622.8975774505266}
{'POS': -3797.632955381795, 'NEG': -3882.455078018095}
{'POS': -6773.076939353265, 'NEG': -6780.77513772336}
{'POS': -4513.816333450407, 'NEG': -4451.147090065464}
{'POS': -4364.865982566186, 'NEG': -4316.823725634674}
{'POS': -5301.890992752154, 'NEG': -5291.4021456955215}
{'POS': -6118.396782065806, 'NEG': -6422.327676729551}
{'P

In [6]:
# question 2.0
# Same train/test procedure as the unsmoothed run, with smoothing=True.
NB = NaiveBayesText(
    smoothing=True,
    bigrams=False,
    trigrams=False,
    discard_closed_class=False,
)
NB.train(corpus.train)
NB.test(corpus.test)
smoothed_preds = NB.predictions
# Vocabulary size of the non-stemmed model, saved for use later.
num_non_stemmed_features = len(NB.vocabulary)
smoothed_accuracy = NB.getAccuracy()
print(f"Accuracy using smoothing: {smoothed_accuracy:.2f}")


# question 2.1
# Sign test: unsmoothed vs. smoothed predictions, judged at the 0.05 level.
p_value = signTest.getSignificance(non_smoothed_preds, smoothed_preds)
if p_value < 0.05:
    significance = "significant"
else:
    significance = "not significant"
print(f"results using smoothing are {significance} with respect to no smoothing")

VOCAB FOR ALAN!!  60261
total num words:  60261
smoothing?  True  POS:  779298  NEG:  707103
{'POS': -3652.1021298196665, 'NEG': -3666.895793057485}
{'POS': -4001.2934843460293, 'NEG': -3994.8650384997673}
{'POS': -5634.705280161685, 'NEG': -5690.654850139852}
{'POS': -1823.1631735294695, 'NEG': -1813.4476264095983}
{'POS': -6236.114382324964, 'NEG': -6295.446553713797}
{'POS': -1727.7705622327449, 'NEG': -1759.2559518872795}
{'POS': -4111.428592642744, 'NEG': -4129.170158241781}
{'POS': -5729.6924786764785, 'NEG': -5784.3475701317175}
{'POS': -5309.397534947531, 'NEG': -5347.711829360356}
{'POS': -8203.83735516483, 'NEG': -8146.566888951349}
{'POS': -2749.611380023502, 'NEG': -2776.03821903083}
{'POS': -2037.4891639542616, 'NEG': -2036.812649630652}
{'POS': -7689.574740438191, 'NEG': -7700.460777768894}
{'POS': -3349.5494869589315, 'NEG': -3341.950147954563}
{'POS': -2803.093838033538, 'NEG': -2811.522505060955}
{'POS': -5370.1701562942535, 'NEG': -5435.267479554061}
{'POS': -5591.866

{'POS': -5740.686344334086, 'NEG': -5729.497361685138}
{'POS': -4842.092771224706, 'NEG': -4798.04061179928}
{'POS': -12738.52366144652, 'NEG': -12665.274549107407}
{'POS': -5969.046274073938, 'NEG': -5987.888734018205}
{'POS': -5407.999355578819, 'NEG': -5428.717581964089}
{'POS': -9274.135590983693, 'NEG': -9302.326457378786}
{'POS': -4074.751824013911, 'NEG': -4035.6397290253994}
{'POS': -4950.062055662355, 'NEG': -4938.493325597955}
{'POS': -6347.7766456250365, 'NEG': -6249.370953213189}
{'POS': -2279.8664580648956, 'NEG': -2259.889403260814}
{'POS': -2221.25441262377, 'NEG': -2190.746625873381}
{'POS': -1306.053284375604, 'NEG': -1265.381473181606}
{'POS': -3023.881568677025, 'NEG': -3011.568445057615}
{'POS': -3694.4068849542887, 'NEG': -3665.1481238060524}
{'POS': -4010.636126053601, 'NEG': -3987.062266956429}
{'POS': -6963.327198716129, 'NEG': -6911.637204172785}
{'POS': -4589.317455766216, 'NEG': -4547.035080254239}
{'POS': -4486.187327094776, 'NEG': -4430.959970009889}
{'POS'

In [13]:
# Scratch check: remainder of a sample review index when divided by 10.
review_idx = 12
remainder = review_idx % 10
remainder

2

In [7]:
# question 3.0
print("--- classifying reviews using 10-fold cross-evaluation ---")
# Re-uses the NB object instantiated in an earlier cell (the smoothed model when
# the cells are run in order).
# NOTE(review): the saved output of this cell is "ZeroDivisionError: float
# division by zero" with nothing printed after the banner, so the error is
# presumably raised inside NB.crossValidate (or NB.getAccuracy) in Classifiers.
# Fix it there; it is deliberately not caught here so the failure stays visible.
NB.crossValidate(corpus)
# From here on, smoothed_preds holds the cross-validated predictions.
smoothed_preds = NB.predictions
print(f"Accuracy: {NB.getAccuracy():.3f}")
# Formatted to three decimals, matching the other cross-validation report cells.
print(f"Std. Dev: {NB.getStdDeviation():.3f}")

--- classifying reviews using 10-fold cross-evaluation ---


ZeroDivisionError: float division by zero

In [None]:
# question 4.0
print("--- stemming corpus ---")
# Rebuild the corpus with stemming enabled (Porter, per the original note); POS stays off.
stemmed_corpus = MovieReviewCorpus(stemming=True, pos=False)
print("--- cross-validating NB using stemming ---")
# Re-uses the NB object left over from the previous cells.
NB.crossValidate(stemmed_corpus)
stemmed_preds = NB.predictions
print(f"Accuracy: {NB.getAccuracy():.3f}")
print(f"Std. Dev: {NB.getStdDeviation():.3f}")

# TODO Q4.1
# Sign test: stemmed vs. smoothed (non-stemmed) predictions, judged at the 0.05 level.
# smoothed_preds must be the cross-validated predictions from question 3.0; if
# that cell failed, the name still holds the held-out predictions from question 2.0.
# NOTE(review): the other cells pass the baseline predictions first; here the
# stemmed predictions come first -- confirm the sign test is order-independent.
p_value = signTest.getSignificance(stemmed_preds, smoothed_preds)  # note compared against version w/ smoothing!
if p_value < 0.05:
    significance = "significant"
else:
    significance = "not significant"
print(f"results using stemming are {significance} with respect to no stemming")

# TODO Q4.2
print("--- determining the number of features before/after stemming ---")

In [None]:
# question Q5.0
# Cross-validate a fresh Naive Bayes model with smoothing=True and bigrams=True.
print("--- cross-validating naive bayes using smoothing and bigrams ---")
NB = NaiveBayesText(smoothing=True, bigrams=True, trigrams=False, discard_closed_class=False)
NB.crossValidate(corpus)
smoothed_and_bigram_preds = NB.predictions
print(f"Accuracy: {NB.getAccuracy():.2f}")
print(f"Std. Dev: {NB.getStdDeviation():.2f}")


# See if bigrams significantly improve results over smoothed NB only.
# smoothed_preds must be the cross-validated predictions from question 3.0; if
# that cell failed, the name still holds the held-out predictions from question 2.0.
p_value = signTest.getSignificance(smoothed_preds, smoothed_and_bigram_preds)
# Variable name spelled `significance`, as in every other cell (was `signifance`).
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing and bigrams are {significance} with respect to smoothing only")


# TODO Q5.1

In [None]:
# TODO Q6 and 6.1
# Placeholder: this cell only prints the section banner; no SVM is trained or
# evaluated yet.
# NOTE(review): SVMText is imported at the top of the notebook, presumably for
# this step -- confirm.
print("--- classifying reviews using SVM 10-fold cross-eval ---")

In [None]:
# TODO Q7
# Placeholder: only the three section banners are printed, one per line;
# none of the announced steps is implemented yet.
print(
    "--- adding in POS information to corpus ---",
    "--- training svm on word+pos features ----",
    "--- training svm discarding closed-class words ---",
    sep="\n",
)

In [None]:
# question 8.0
# Placeholder: this cell only prints the section banner; nothing is computed yet.
# NOTE(review): SVMDoc2Vec is imported at the top of the notebook, presumably
# for this step -- confirm.
print("--- using document embeddings ---")