In [361]:
## Import all the things. 
import numpy as np
import csv
import pandas as pd
import random
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from nltk.corpus import opinion_lexicon as op
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from nltk import pos_tag
np.set_printoptions(linewidth=400)   # optional: widens column of numpy array display

In [362]:
## get positve and negative words from the lexicon. 
pos_words = set(op.words('positive-words.txt'))
neg_words = set(op.words('negative-words.txt'))

In [363]:
## create lists for response and target. 
X = [] ; y = []; initial_set = []; X_test = []; y_test = []
with open("reviews.txt", 'r', encoding='utf8') as reviews:
    for review in reviews:
        if review != "":
            review = review.strip()
            review = review.lower()
            #review = word_tokenize(review)
            if len(review) > 1:
                initial_set.append(review)
    random.shuffle(initial_set)
num_sentences = len(initial_set)

In [364]:
# write out a split to files:
train_len = int(0.8 * num_sentences)
test_len = num_sentences - train_len

train_list = initial_set[:train_len]
test_list = initial_set[train_len:]
print(len(train_list))
print(len(test_list))

with open('reviews-train.txt', 'w') as r_train:
    for line in train_list:
        line = str(line)
        r_train.write(line)
        r_train.write('\n')
with open('reviews-test.txt', 'w') as r_test:
    for test_line in test_list:
        test_line = str(test_line)
        r_test.write(test_line)
        r_test.write('\n')

1999
500


In [365]:
def return_rating(posCount, negCount):
        stars = 0
        if negCount > (posCount*2):
            stars = 1
        elif posCount > (negCount*2):
            stars = 5
        elif negCount == posCount:
            stars = 3
        elif negCount < posCount:
            stars = 4
        elif posCount < negCount:
            stars = 2
        return stars
            

In [366]:
rev_train_file = open('reviews-train.txt')
lines = rev_train_file.readlines()
for line in lines:
    line = line.strip()
    X.append(word_tokenize(line[:-1]))
    y.append(line[-1])
rev_train_file.close()

In [367]:
rev_test_file = open('reviews-test.txt')
lines = rev_test_file.readlines()
for line in lines:
    line = line.strip()
    X_test.append(word_tokenize(line[:-1]))
    y_test.append(line[-1])
rev_test_file.close()

In [368]:
def pos_neg_count(line):
    posCount = 0; negCount = 0
    for word in line:
        if word in pos_words:
            posCount +=1
        elif word in neg_words:
            negCount+=1
    return posCount, negCount
    

In [375]:
def superlative_count(tokenized):
    try:
        for i in tokenized[:1]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        pass
    count = 0
    for x in tagged:
        if (x[1]) == 'JJS':
            count +=6
        if (x[1]) == 'JJR':
            count +=3
        if (x[1]) == 'JJ':
            count +=1
        if (x[1]) == 'RBS':
            count += 6
        if (x[1]) == 'PRP':
            count += 6
        
    return count

In [379]:
###################################################################################################################
############################################ FEATURE EXTRACTION ###################################################
###################################################################################################################
def np_feature_extraction(X,y):
    X_list = [] ; y_list = []

    punktLine = PunktSentenceTokenizer()
    for x,line in enumerate(X):
        ### GET TARGET #####
        rating = int(y[x])
        print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$${} $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$".format(rating))
        # GET POS AND NEG COUNTS
        posCount, negCount = pos_neg_count(line)
        ## POS TAGGING SUPERLATIVES
        sent = ' '.join(line)
        token = punktLine.tokenize(sent)
        JJS = superlative_count(token)
        ### Apply guessed rating
        rating_result = return_rating(posCount, negCount)
        
        
        ## THIS IS THE NUMPY LINE WITH FEATURES
        line = [len(sent),posCount, negCount, JJS, rating_result]
     
        #print("rating {}  calculated rating {}".format(rating,rating_result))
        ### APPEND LINES #####
        X_list.append(line)
        y_list.append(rating)
        
    ##TURN INTO NUMPY ARRAYS
    X_np = (np.array(X_list))
    y_np = (np.array(y_list))
    
    return X_np,y_np
#########################################################################################################################

In [380]:
X_train_np, y_train_np = np_feature_extraction(X,y)
# for line in X_train_np:
#     print(line)
X_test_np, y_test_np = np_feature_extraction(X_test, y_test)


$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$5 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
[('fast', 'RB'), ('fret', 'JJ'), (',', ','), ('or', 'CC'), ('at', 'IN'), ('least', 'JJS'), ('some', 'DT'), ('kind', 'NN'), ('of', 'IN'), ('string', 'NN'), ('cleaner', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ("'must", 'JJ'), ('have', 'VBP'), ("'", 'VBN'), ('for', 'IN'), ('anyone', 'NN'), ('who', 'WP'), ('plays', 'VBZ'), ('guitar', 'NN'), ('.', '.')]
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$5 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
[('for', 'IN'), ('the', 'DT'), ('price', 'NN'), ('this', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('wonderful', 'JJ'), ('strap', 'NN'), ('.', '.')]
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$5 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
[('these', 'DT'), ('are', 'VBP'), ('condenser', 'NN'), ('mics', 'NNS'), ('so', 'IN'), ('you', 'PRP'), ('will', 'MD'), ('need', 'VB'), ('phantom', 'JJ'), ('power', 'NN'), ('to', 'TO'), ('make', 'VB'), ('them', 'PRP'), ('work', 'VB'), ('.', '.')]
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$2 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
[('great', 'JJ'

In [381]:
for mclass in ('multinomial', 'ovr'):
    lr = LogisticRegression(solver='lbfgs', max_iter=3000, random_state=0, multi_class=mclass).fit(X_train_np, y_train_np)
    yhat = lr.predict(X_test_np)
     
    # the 3 lines below show how to invoke various output    
    print("\n",mclass,"Accuracy",accuracy_score(y_test_np, yhat))
    print("\n",mclass,"Classification Report\n",classification_report(y_test_np, yhat),sep="")
    print("\n",mclass,"Classification Report\n",confusion_matrix(y_test_np, yhat),sep="")


 multinomial Accuracy 0.69

multinomialClassification Report
              precision    recall  f1-score   support

           1       0.33      0.10      0.15        10
           2       0.00      0.00      0.00        13
           3       0.00      0.00      0.00        33
           4       0.00      0.00      0.00        96
           5       0.70      0.99      0.82       348

   micro avg       0.69      0.69      0.69       500
   macro avg       0.21      0.22      0.19       500
weighted avg       0.49      0.69      0.57       500


multinomialClassification Report
[[  1   0   0   0   9]
 [  0   0   0   0  13]
 [  0   0   0   0  33]
 [  1   0   0   0  95]
 [  1   0   1   2 344]]

 ovr Accuracy 0.688

ovrClassification Report
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00        13
           3       0.00      0.00      0.00        33
           4       0.00      0.00  

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
