In [108]:
import pandas as pd
import string
import nltk
import numpy
nltk.download()
from nltk.corpus import stopwords
import io 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet as wn
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from numpy import linalg
import matplotlib
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


# Part A

In [71]:
amazonRawText = open("amazon_cells_labelled.txt").read()
imdbRawText = open("imdb_labelled.txt").read()
yelpRawText = open("yelp_labelled.txt").read()
wnl = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

Labels are even, 500 positive reviews and 500 negative reviews per source

# Part B

In [11]:
def preprocessingText(inputLine,stopWords):
    
    noStops = ""
    finalLine = inputLine.lower()
    finalLine = finalLine.rstrip()
    finalLine = finalLine.translate(None, string.punctuation)
    for word in finalLine.split(" "):
        if word != "" and all(ord(s) < 128 for s in word):
            try:
                part_of_speech = wn.synsets(word)[0].pos()
            except:
                part_of_speech = 'n'
            lemmedWord = wnl.lemmatize(word,pos=part_of_speech)
            lemmedWord = stemmer.stem(lemmedWord)
            if lemmedWord not in stopWords:
                noStops = noStops + word + " "
    finalLine = noStops.translate(None, string.punctuation)
    finalLine = finalLine.rstrip()
    
    return finalLine

In [12]:
def parseRawText(inputFile,stopWords):
    parsedLines = []
    parsedMatrix = []
    
    for line in inputFile.split("\n"):
        parsedLines.append(line)
        
    for row in parsedLines:
        if row != "":
            review = preprocessingText(row[:-1],stopWords)
            rating = int(row[-1])
            parsedMatrix.append([review,rating])
    
    return parsedMatrix
    

In [13]:
stopWords = []
keepWords = ['do','doing','but','until','with','against','above','below',
             'up','down','in','out','should', "should've",'on','off','not',
             'over','under',"wouldn't","won't","won","weren't","were","was",
             "wasn't","isn't","is","have","haven't","has","hasn't","had",
             "hadn't","does","doesn't","did","didn't","couldn't", 'are', 
             'be','more','most','no','nor',"don't", 'been', 'being','having',
             "shouldn't",'mightn', "mightn't", 'mustn', "mustn't", 'needn', 
             "needn't", 'shan', "shan't", 'shouldn', 'wasn', 'weren', 'wouldn',
             'aren', "aren't", 'couldn', 'didn','can', 'doesn', 'hadn', 'hasn', 
             'haven', 'ain', 'isn']

for word in stopwords.words('english'):
    word = str(word).translate(None, string.punctuation)
    try:
        part_of_speech = wn.synsets(word)[0].pos()
    except:
        part_of_speech = 'n'
    word = wnl.lemmatize(word,pos=part_of_speech)
    word = stemmer.stem(word)
    if word not in keepWords:
        stopWords.append(word) 
    

    
amazonLines = parseRawText(amazonRawText,stopWords)
imdbLines = parseRawText(imdbRawText,stopWords)
yelpLines = parseRawText(yelpRawText,stopWords)

print "Done!"

Done!


# Part C

Instead of creating a single train and test dataset, I split my datasets and testing per source

In [15]:
amazonGood = []
imdbGood = []
yelpGood = []

amazonBad = []
imdbBad = []
yelpBad = []

for good in amazonLines:
    if int(good[1]) == 1:
        amazonGood.append(good)
    else:
        amazonBad.append(good)
        
for good in imdbLines:
    if int(good[1]) == 1:
        imdbGood.append(good)
    else:
        imdbBad.append(good)
        
for good in yelpLines:
    if int(good[1]) == 1:
        yelpGood.append(good)
    else:
        yelpBad.append(good)  

In [16]:
amazonGoodTrain = amazonGood[:400]
amazonGoodTest = amazonGood[400:]
amazonBadTrain = amazonBad[:400]
amazonBadTest = amazonBad[400:]

imdbGoodTrain = imdbGood[:400]
imdbGoodTest = imdbGood[400:]
imdbBadTrain = imdbBad[:400]
imdbBadTest = imdbBad[400:]

yelpGoodTrain = yelpGood[:400]
yelpGoodTest = yelpGood[400:]
yelpBadTrain = yelpBad[:400]
yelpBadTest = yelpBad[400:]


In [17]:
amazonWordList = []
imdbWordList = []
yelpWordList = []
wordList = []

#Amazon
for line in amazonGoodTrain:
    for word in line[0].split(" "):
        if word not in amazonWordList:
            amazonWordList.append(word)
            
for line in amazonBadTrain:
    for word in line[0].split(" "):
        if word not in amazonWordList:
            amazonWordList.append(word)
            
#Yelp            
for line in yelpGoodTrain:
    for word in line[0].split(" "):
        if word not in yelpWordList:
            yelpWordList.append(word)
            
for line in yelpBadTrain:
    for word in line[0].split(" "):
        if word not in yelpWordList:
            yelpWordList.append(word)
            
#IMDB            
for line in imdbGoodTrain:
    for word in line[0].split(" "):
        if word not in imdbWordList:
            imdbWordList.append(word)
            
for line in imdbBadTrain:
    for word in line[0].split(" "):
        if word not in imdbWordList:
            imdbWordList.append(word)
            
            
for word in amazonWordList:
    wordList.append(word)
for word in yelpWordList:
    if word not in wordList:
        wordList.append(word)
for word in imdbWordList:
    if word not in wordList:
        wordList.append(word)

In [20]:
amazonNGramWordList = []
imdbNGramWordList = []
yelpNGramWordList = []
nGramWordList = []
span = 2

#Amazon
for line in amazonGoodTrain:
    words = line[0].split(" ")
    z = [" ".join(words[i:i+span]) for i in range(0, len(words)-1)]
    for nGram in z:
        if nGram not in amazonNGramWordList:
            amazonNGramWordList.append(nGram)
            
for line in amazonBadTrain:
    words = line[0].split(" ")
    z = [" ".join(words[i:i+span]) for i in range(0, len(words)-1)]
    for nGram in z:
        if nGram not in amazonNGramWordList:
            amazonNGramWordList.append(nGram)
            
            
            
#IMDB            
for line in imdbGoodTrain:
    words = line[0].split(" ")
    z = [" ".join(words[i:i+span]) for i in range(0, len(words)-1)]
    for nGram in z:
        if nGram not in imdbNGramWordList:
            imdbNGramWordList.append(nGram)
            
for line in imdbBadTrain:
    words = line[0].split(" ")
    z = [" ".join(words[i:i+span]) for i in range(0, len(words)-1)]
    for nGram in z:
        if nGram not in imdbNGramWordList:
            imdbNGramWordList.append(nGram)
            
            

#Yelp
for line in yelpGoodTrain:
    words = line[0].split(" ")
    z = [" ".join(words[i:i+span]) for i in range(0, len(words)-1)]
    for nGram in z:
        if nGram not in yelpNGramWordList:
            yelpNGramWordList.append(nGram)
            
for line in yelpBadTrain:
    words = line[0].split(" ")
    z = [" ".join(words[i:i+span]) for i in range(0, len(words)-1)]
    for nGram in z:
        if nGram not in yelpNGramWordList:
            yelpNGramWordList.append(nGram)
            
            
            
for word in amazonNGramWordList:
    nGramWordList.append(word)
for word in yelpNGramWordList:
    if word not in wordList:
        nGramWordList.append(word)
for word in imdbNGramWordList:
    if word not in wordList:
        nGramWordList.append(word)

# Part D

In [22]:
def featureVectors(inputMatrix,chosenWordList):
    outputMatrix = []
    for line in inputMatrix:
        tempLine = [0]*len(chosenWordList)
        for word in line[0].split(" "):
            if word in chosenWordList:
                index = chosenWordList.index(word)
                tempLine[index]+=1
        outputMatrix.append(tempLine)
        
    return outputMatrix

In [23]:
def nGramsFeatureVectors(inputMatrix,chosenWordList):
    outputMatrix = []
    for line in inputMatrix:
        tempLine = [0]*len(chosenWordList)
        words = line[0].split(" ")
        nGrams = [" ".join(words[i:i+span]) for i in range(0, len(words)-1)]
        for word in nGrams:
            if word in chosenWordList:
                index = chosenWordList.index(word)
                tempLine[index]+=1
        outputMatrix.append(tempLine)
        
    return outputMatrix

In [24]:
amazonFeatureGoodTrain = featureVectors(amazonGoodTrain,amazonWordList)
amazonFeatureGoodTest = featureVectors(amazonGoodTest,amazonWordList)
amazonFeatureBadTrain = featureVectors(amazonBadTrain,amazonWordList)
amazonFeatureBadTest = featureVectors(amazonBadTest,amazonWordList)

imdbFeatureGoodTrain = featureVectors(imdbGoodTrain,imdbWordList)
imdbFeatureGoodTest = featureVectors(imdbGoodTest,imdbWordList)
imdbFeatureBadTrain = featureVectors(imdbBadTrain,imdbWordList)
imdbFeatureBadTest = featureVectors(imdbBadTest,imdbWordList)

yelpFeatureGoodTrain = featureVectors(yelpGoodTrain,yelpWordList)
yelpFeatureGoodTest = featureVectors(yelpGoodTest,yelpWordList)
yelpFeatureBadTrain = featureVectors(yelpBadTrain,yelpWordList)
yelpFeatureBadTest = featureVectors(yelpBadTest,yelpWordList)

print "Done!"


Done!


In [25]:
print amazonFeatureGoodTrain[0]
print yelpFeatureBadTrain[99]

[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# Part E

In [26]:
def frobNorm(inputMatrix):
    normVal = []
    normArray = []
    inputMatrix = numpy.asarray(inputMatrix)
    transposeMatrix = inputMatrix.T
    for line in transposeMatrix:
        normVal.append(numpy.linalg.norm(line))
    
    for line in inputMatrix:
        normArray.append(numpy.multiply(line,normVal))
        
    return normArray

# Part F

In [137]:
def reviewLogReg(wordList,goodTrain,badTrain,goodTest,badTest,featureGoodTrain,featureBadTrain,featureGoodTest,featureBadTest):
    trainLabels = []
    for line in goodTrain:
        trainLabels.append(line[1])
    for line in badTrain:
        trainLabels.append(line[1])

    trainData = numpy.concatenate((frobNorm(featureGoodTrain),frobNorm(featureBadTrain)))


    testLabels = []
    for line in goodTest:
        testLabels.append(line[1])
    for line in badTest:
        testLabels.append(line[1])

    testData = numpy.concatenate((frobNorm(featureGoodTest),featureBadTest))


    logReg = LogisticRegression()
    y_pred1 = logReg.fit(trainData, trainLabels)
    #training binary values, training encoded labels
    logRegScore = y_pred1.score(testData, testLabels)
    #testing binarvalues, testing encoded labels
    coefficients = logReg.coef_
    coef_dict = {}
    for i,coef in enumerate(coefficients[0]):
        coef_dict[i] = coef
    sortedList = sorted(coef_dict.items(), key=lambda kv:kv[1], reverse=True)
    for tuple in sortedList[:10]:
        print wordList[tuple[0]]
    
    
    
    GNB = GaussianNB()
    y_pred2 = GNB.fit(trainData,trainLabels)
    GNBScore = y_pred2.score(testData,testLabels)
    
    
    
    print "LogReg:"
    print confusion_matrix(y_pred1.predict(testData),testLabels)
    print "GNB:"
    print confusion_matrix(y_pred2.predict(testData),testLabels)
    
    
    return logRegScore,GNBScore

In [138]:
print reviewLogReg(amazonWordList,amazonGoodTrain,amazonBadTrain,amazonGoodTest,amazonBadTest,amazonFeatureGoodTrain,amazonFeatureBadTrain,amazonFeatureGoodTest,amazonFeatureBadTest)

love
without
best
nice
easy
everything
order
comfortable
price
excellent
LogReg:
[[95 29]
 [ 5 71]]
GNB:
[[58 21]
 [42 79]]
(0.83, 0.685)


In [139]:
print reviewLogReg(yelpWordList,yelpGoodTrain,yelpBadTrain,yelpGoodTest,yelpBadTest,yelpFeatureGoodTrain,yelpFeatureBadTrain,yelpFeatureGoodTest,yelpFeatureBadTest)

fantastic
fun
awesome
check
nice
delicious
loved
happy
tender
first
LogReg:
[[84 21]
 [16 79]]
GNB:
[[53 13]
 [47 87]]
(0.815, 0.7)


In [140]:
print reviewLogReg(imdbWordList,imdbGoodTrain,imdbBadTrain,imdbGoodTest,imdbBadTest,imdbFeatureGoodTrain,imdbFeatureBadTrain,imdbFeatureGoodTest,imdbFeatureBadTest)

liked
right
awesome
watch
funny
beautiful
loved
nice
1010
love
LogReg:
[[89 29]
 [11 71]]
GNB:
[[85 36]
 [15 64]]
(0.8, 0.745)


# Part G

In [155]:
def nGramLogReg(nGramWordList,goodTrain,badTrain,goodTest,badTest,featureGoodTrain,featureBadTrain,featureGoodTest,featureBadTest):
    trainLabels = []
    for line in goodTrain:
        trainLabels.append(line[1])
    for line in badTrain:
        trainLabels.append(line[1])

    trainData = numpy.concatenate((frobNorm(featureGoodTrain),frobNorm(featureBadTrain)))


    testLabels = []
    for line in goodTest:
        testLabels.append(line[1])
    for line in badTest:
        testLabels.append(line[1])

    testData = numpy.concatenate((featureGoodTest,featureBadTest))


    logReg = LogisticRegression()
    y_pred1 = logReg.fit(trainData, trainLabels)
    #training binary values, training encoded labels
    logRegScore = y_pred1.score(testData, testLabels)
    #testing binarvalues, testing encoded labels
    coefficients = logReg.coef_
    coef_dict = {}
    for i,coef in enumerate(coefficients[0]):
        coef_dict[i] = coef
    sortedList = sorted(coef_dict.items(), key=lambda kv:kv[1], reverse=True)
    for tuple in sortedList[:10]:
        print nGramWordList[tuple[0]]
    
    GNB = GaussianNB()
    y_pred2 = GNB.fit(trainData,trainLabels)
    GNBScore = y_pred2.score(testData,testLabels)
    
    print "LogReg:"
    print confusion_matrix(y_pred1.predict(testData),testLabels)
    print "GNB:"
    print confusion_matrix(y_pred2.predict(testData),testLabels)
    
    return logRegScore,GNBScore

In [156]:
amazonNGramFeatureGoodTrain = nGramsFeatureVectors(amazonGoodTrain,amazonNGramWordList)
amazonNGramFeatureGoodTest = nGramsFeatureVectors(amazonGoodTest,amazonNGramWordList)
amazonNGramFeatureBadTrain = nGramsFeatureVectors(amazonBadTrain,amazonNGramWordList)
amazonNGramFeatureBadTest = nGramsFeatureVectors(amazonBadTest,amazonNGramWordList)

imdbNGramFeatureGoodTrain = nGramsFeatureVectors(imdbGoodTrain,imdbNGramWordList)
imdbNGramFeatureGoodTest = nGramsFeatureVectors(imdbGoodTest,imdbNGramWordList)
imdbNGramFeatureBadTrain = nGramsFeatureVectors(imdbBadTrain,imdbNGramWordList)
imdbNGramFeatureBadTest = nGramsFeatureVectors(imdbBadTest,imdbNGramWordList)

yelpNGramFeatureGoodTrain = nGramsFeatureVectors(yelpGoodTrain,yelpNGramWordList)
yelpNGramFeatureGoodTest = nGramsFeatureVectors(yelpGoodTest,yelpNGramWordList)
yelpNGramFeatureBadTrain = nGramsFeatureVectors(yelpBadTrain,yelpNGramWordList)
yelpNGramFeatureBadTest = nGramsFeatureVectors(yelpBadTest,yelpNGramWordList)

print "Done!"


Done!


In [157]:
print nGramLogReg(amazonNGramWordList,amazonGoodTrain,amazonBadTrain,amazonGoodTest,amazonBadTest,amazonNGramFeatureGoodTrain,amazonNGramFeatureBadTrain,amazonNGramFeatureGoodTest,amazonNGramFeatureBadTest)

works great
great phone
easy use
highly recommend
works well
is excellent
great product
would recommend
up well
is great
LogReg:
[[95 58]
 [ 5 42]]
GNB:
[[34  9]
 [66 91]]
(0.685, 0.625)


In [158]:
print nGramLogReg(imdbNGramWordList,imdbGoodTrain,imdbBadTrain,imdbGoodTest,imdbBadTest,imdbNGramFeatureGoodTrain,imdbNGramFeatureBadTrain,imdbNGramFeatureGoodTest,imdbNGramFeatureBadTest)

10 out
is good
is great
ray charles
excellent performance
give one
one best
gave 10
are good
saw film
LogReg:
[[88 70]
 [12 30]]
GNB:
[[91 73]
 [ 9 27]]
(0.59, 0.59)


In [159]:
print nGramLogReg(yelpNGramWordList,yelpGoodTrain,yelpBadTrain,yelpGoodTest,yelpBadTest,yelpNGramFeatureGoodTrain,yelpNGramFeatureBadTrain,yelpNGramFeatureGoodTest,yelpNGramFeatureBadTest)

really good
food delicious
is good
great place
in town
food good
great food
definitely be
friendly staff
check out
LogReg:
[[27 13]
 [73 87]]
GNB:
[[24  9]
 [76 91]]
(0.57, 0.575)


# Part H 

Confusion matrices are avoided simply to save space, please refer above to method used for printing confusion matrices.

In [151]:
def standardizeMatrix(inputMatrix,mean="skip"):
    inputMatrix = numpy.asarray(inputMatrix)
    if mean == "skip":
        mean = inputMatrix.mean()
    centeredMatrix = inputMatrix - mean
    return centeredMatrix,mean

In [152]:
def pcaMatrixMaker(inputTrainMatrix,inputTestMatrix,k):
    centeredTrainMatrix,avg = standardizeMatrix(inputTrainMatrix)
    U,D,Vt = numpy.linalg.svd(centeredTrainMatrix)
    Vt = numpy.asarray(Vt)
    pcaTrainMatrix = numpy.matmul(centeredTrainMatrix,Vt.T[:,:k])
    
    centeredTestMatrix,avg = standardizeMatrix(inputTestMatrix)
    pcaTestMatrix = numpy.matmul(centeredTestMatrix,Vt.T[:,:k])
    
    
    return pcaTrainMatrix, pcaTestMatrix

Normal Bag-of-Words: (LogReg,GNB)

In [160]:
trainConc = numpy.concatenate((amazonFeatureGoodTrain,amazonFeatureBadTrain))
testConc = numpy.concatenate((amazonFeatureGoodTest,amazonFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,10)
score = reviewLogReg(amazonWordList,amazonGoodTrain,amazonBadTrain,amazonGoodTest,amazonBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((amazonFeatureGoodTrain,amazonFeatureBadTrain))
testConc = numpy.concatenate((amazonFeatureGoodTest,amazonFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,50)
score = reviewLogReg(amazonWordList,amazonGoodTrain,amazonBadTrain,amazonGoodTest,amazonBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((amazonFeatureGoodTrain,amazonFeatureBadTrain))
testConc = numpy.concatenate((amazonFeatureGoodTest,amazonFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,100)
score = reviewLogReg(amazonWordList,amazonGoodTrain,amazonBadTrain,amazonGoodTest,amazonBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

are
great
razr
jawbone
case
mic
is
good
excellent
value
LogReg:
[[91 59]
 [ 9 41]]
GNB:
[[97 75]
 [ 3 25]]
(0.66, 0.61)

are
combination
razr
use
three
recommend
great
going
blue
jawbone
LogReg:
[[90 47]
 [10 53]]
GNB:
[[89 49]
 [11 51]]
(0.715, 0.7)

well
actually
seems
car
ac
charger
bulky
are
2mp
mobile
LogReg:
[[92 42]
 [ 8 58]]
GNB:
[[  0   1]
 [100  99]]
(0.75, 0.495)


In [161]:
trainConc = numpy.concatenate((imdbFeatureGoodTrain,imdbFeatureBadTrain))
testConc = numpy.concatenate((imdbFeatureGoodTest,imdbFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,10)
score = reviewLogReg(imdbWordList,imdbGoodTrain,imdbBadTrain,imdbGoodTest,imdbBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((imdbFeatureGoodTrain,imdbFeatureBadTrain))
testConc = numpy.concatenate((imdbFeatureGoodTest,imdbFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,50)
score = reviewLogReg(imdbWordList,imdbGoodTrain,imdbBadTrain,imdbGoodTest,imdbBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((imdbFeatureGoodTrain,imdbFeatureBadTrain))
testConc = numpy.concatenate((imdbFeatureGoodTest,imdbFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,100)
score = reviewLogReg(imdbWordList,imdbGoodTrain,imdbBadTrain,imdbGoodTest,imdbBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

song
keeps
trying
gerardo
scene
best
find
movie
in
is
LogReg:
[[61 51]
 [39 49]]
GNB:
[[57 49]
 [43 51]]
(0.55, 0.54)

effort
face
adorable
case
saw
song
hilarious
messages
showed
keeps
LogReg:
[[73 49]
 [27 51]]
GNB:
[[100 100]
 [  0   0]]
(0.62, 0.5)

words
face
think
effort
film
constructed
case
two
cinema
adorable
LogReg:
[[74 41]
 [26 59]]
GNB:
[[100 100]
 [  0   0]]
(0.665, 0.5)


In [162]:
trainConc = numpy.concatenate((yelpFeatureGoodTrain,yelpFeatureBadTrain))
testConc = numpy.concatenate((yelpFeatureGoodTest,yelpFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,10)
score = reviewLogReg(yelpWordList,yelpGoodTrain,yelpBadTrain,yelpGoodTest,yelpBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((yelpFeatureGoodTrain,yelpFeatureBadTrain))
testConc = numpy.concatenate((yelpFeatureGoodTest,yelpFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,50)
score = reviewLogReg(yelpWordList,yelpGoodTrain,yelpBadTrain,yelpGoodTest,yelpBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((yelpFeatureGoodTrain,yelpFeatureBadTrain))
testConc = numpy.concatenate((yelpFeatureGoodTest,yelpFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,100)
score = reviewLogReg(yelpWordList,yelpGoodTrain,yelpBadTrain,yelpGoodTest,yelpBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

may
loved
late
wow
rick
holiday
place
bank
stopped
off
LogReg:
[[72 42]
 [28 58]]
GNB:
[[78 59]
 [22 41]]
(0.65, 0.595)

cod
highly
food
cute
interior
cape
is
care
velvet
may
LogReg:
[[78 40]
 [22 60]]
GNB:
[[  1   0]
 [ 99 100]]
(0.69, 0.505)

provided
delight
combos
cod
mexican
not
server
highly
right
be
LogReg:
[[85 30]
 [15 70]]
GNB:
[[  0   0]
 [100 100]]
(0.775, 0.5)


N-Grams: (LogReg,GNB)

In [163]:
trainConc = numpy.concatenate((amazonNGramFeatureGoodTrain,amazonNGramFeatureBadTrain))
testConc = numpy.concatenate((amazonNGramFeatureGoodTest,amazonNGramFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,10)
score = nGramLogReg(amazonNGramWordList,amazonGoodTrain,amazonBadTrain,amazonGoodTest,amazonBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((amazonFeatureGoodTrain,amazonFeatureBadTrain))
testConc = numpy.concatenate((amazonFeatureGoodTest,amazonFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,50)
score = nGramLogReg(amazonNGramWordList,amazonGoodTrain,amazonBadTrain,amazonGoodTest,amazonBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((amazonFeatureGoodTrain,amazonFeatureBadTrain))
testConc = numpy.concatenate((amazonFeatureGoodTest,amazonFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,100)
score = nGramLogReg(amazonNGramWordList,amazonGoodTrain,amazonBadTrain,amazonGoodTest,amazonBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

great jawbone
razr owneryou
case excellent
mic is
must have
owneryou must
is great
good case
excellent value
are razr
LogReg:
[[100  85]
 [  0  15]]
GNB:
[[100  96]
 [  0   4]]
(0.575, 0.52)

owneryou must
phone 7
must have
fire absolutely
without charging
blue tooth
mic is
extended battery
far good
is great
LogReg:
[[100  93]
 [  0   7]]
GNB:
[[100  99]
 [  0   1]]
(0.535, 0.505)

charger well
excellent bluetooth
clear with
are sensitive
well ac
car charger
nice headset
owneryou must
case is
phone ive
LogReg:
[[99 87]
 [ 1 13]]
GNB:
[[  0   0]
 [100 100]]
(0.56, 0.5)


In [164]:
trainConc = numpy.concatenate((imdbNGramFeatureGoodTrain,imdbNGramFeatureBadTrain))
testConc = numpy.concatenate((imdbNGramFeatureGoodTest,imdbNGramFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,10)
score = nGramLogReg(imdbNGramWordList,imdbGoodTrain,imdbBadTrain,imdbGoodTest,imdbBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((imdbNGramFeatureGoodTrain,imdbNGramFeatureBadTrain))
testConc = numpy.concatenate((imdbNGramFeatureGoodTest,imdbNGramFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,50)
score = nGramLogReg(imdbNGramWordList,imdbGoodTrain,imdbBadTrain,imdbGoodTest,imdbBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((imdbNGramFeatureGoodTrain,imdbNGramFeatureBadTrain))
testConc = numpy.concatenate((imdbNGramFeatureGoodTest,imdbNGramFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,100)
score = nGramLogReg(imdbNGramWordList,imdbGoodTrain,imdbBadTrain,imdbGoodTest,imdbBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

keeps running
movie gerardo
in movie
best scene
trying find
scene in
gerardo is
is trying
find song
song keeps
LogReg:
[[ 6  7]
 [94 93]]
GNB:
[[97 92]
 [ 3  8]]
(0.495, 0.525)

running head
muppets were
almost right
right on
review is
saw movie
science teacher
is right
were best
trying find
LogReg:
[[22 13]
 [78 87]]
GNB:
[[99 97]
 [ 1  3]]
(0.545, 0.51)

almost right
editing directing
easily most
muppets were
overdue since
think no
review is
constructed in
cinematography acting
delivers everything
LogReg:
[[85 77]
 [15 23]]
GNB:
[[98 98]
 [ 2  2]]
(0.54, 0.5)


In [165]:
trainConc = numpy.concatenate((yelpNGramFeatureGoodTrain,yelpNGramFeatureBadTrain))
testConc = numpy.concatenate((yelpNGramFeatureGoodTest,yelpNGramFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,10)
score = nGramLogReg(yelpNGramWordList,yelpGoodTrain,yelpBadTrain,yelpGoodTest,yelpBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((yelpNGramFeatureGoodTrain,yelpNGramFeatureBadTrain))
testConc = numpy.concatenate((yelpNGramFeatureGoodTest,yelpNGramFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,50)
score = nGramLogReg(yelpNGramWordList,yelpGoodTrain,yelpBadTrain,yelpGoodTest,yelpBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

print

trainConc = numpy.concatenate((yelpNGramFeatureGoodTrain,yelpNGramFeatureBadTrain))
testConc = numpy.concatenate((yelpNGramFeatureGoodTest,yelpNGramFeatureBadTest))
A,B = pcaMatrixMaker(trainConc,testConc,100)
score = nGramLogReg(yelpNGramWordList,yelpGoodTrain,yelpBadTrain,yelpGoodTest,yelpBadTest,A[:400],A[400:],B[:400],B[400:])    
print score

late may
rick steve
holiday off
may bank
wow loved
stopped late
loved place
off rick
bank holiday
steve recommendation
LogReg:
[[16  6]
 [84 94]]
GNB:
[[10  7]
 [90 93]]
(0.55, 0.515)

tacos friendly
also cute
cakeohhh stuff
cape cod
were great
street tacos
late may
velvet cakeohhh
holiday off
great touch
LogReg:
[[17  9]
 [83 91]]
GNB:
[[10  5]
 [90 95]]
(0.54, 0.525)

tacos friendly
place lot
visit hiro
also cute
combos like
were great
first visit
note server
cape cod
velvet cakeohhh
LogReg:
[[19 11]
 [81 89]]
GNB:
[[12  5]
 [88 95]]
(0.54, 0.535)
