In [174]:
## Import all the things. 
import numpy as np
import csv
import pandas as pd
import random
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from nltk.corpus import opinion_lexicon as op
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from nltk import pos_tag
from collections import defaultdict
np.set_printoptions(linewidth=400)   # optional: widens column of numpy array display

In [175]:
## get positve and negative words from the lexicon. 
pos_words = set(op.words('positive-words.txt'))
neg_words = set(op.words('negative-words.txt'))

In [176]:
## create initial list to work on 
X = [] ; y = []; initial_set = []; X_test = []; y_test = []
with open("reviews.txt", 'r', encoding='utf8') as reviews:
    for review in reviews:
        if review != "":
            review = review.strip()
            review = review.lower()
            #review = word_tokenize(review)
            if len(review) > 1:
                initial_set.append(review)
num_sentences = len(initial_set)

In [177]:
########## Since the data predominated with 5's I developed a cumbersome method to makes sure the data trained on was 
############# representative of the different classes. I made sure the poorest represented class was fully represented.
my_dict = defaultdict(list)

### Append each review to a rating key
for item in (initial_set):
    my_dict[int(item[-1])].append(item)

### Find the shortest list of reviews, in this case it is 1    
leng = 1000000
for key,value in my_dict.items():
    v = -1
    if len(value) < leng:
        leng = len(value)
        v = key

#### write out each rating to the training file making sure each is represented        
count = 0
with open("reviews-train.txt", 'w') as r_train:
    for idx in range(0,leng):
        for key,value in dates_dict.items():
             r_train.write(value[idx])
             r_train.write('\n')   
             count+=1

#### write the rest out to a file.                 
test_set = initial_set[count:]
with open("reviews-test.txt", 'w') as r_test:
    for line in test_set:
        r_test.write(line)
        r_test.write('\n')

In [178]:
########## this was my attempt to develop a rating system based on number of positives versus number of negatives.

def return_rating(posCount, negCount):
        stars = 0
        if negCount > (posCount*2):
            stars = 1
        elif posCount > (negCount*2):
            stars = 5
        elif negCount == posCount:
            stars = 3
        elif negCount < posCount:
            stars = 4
        elif posCount < negCount:
            stars = 2
        return stars
            

In [179]:
######## open each file, read into memory, strip excess white space, and break into target and features. 
rev_train_file = open('reviews-train.txt')
lines = rev_train_file.readlines()
for line in lines:
    line = line.strip()
    X.append(word_tokenize(line[:-1]))
    y.append(line[-1])
 
rev_train_file.close()

In [180]:
######## open each file, read into memory, strip excess white space, and break into target and features. 
rev_test_file = open('reviews-test.txt')
lines = rev_test_file.readlines()
for line in lines:
    line = line.strip()
    X_test.append(word_tokenize(line[:-1]))
    y_test.append(line[-1])
rev_test_file.close()

In [181]:
### This function just returned the number of positive and negative words from the opinion lexicon

def pos_neg_count(line):
    posCount = 0; negCount = 0
    for word in line:
        if word in pos_words:
            posCount +=1
        elif word in neg_words:
            negCount+=1
    return posCount, negCount
    

In [182]:
def superlative_count(tokenized):
    
    for i in tokenized[:1]:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
    count = 0
    for x in tagged:
        if (x[1]) == 'JJS':
            count +=6
        if (x[1]) == 'JJR':
            count +=3
        if (x[1]) == 'JJ':
            count +=1
        if (x[1]) == 'RBS':
            count += 6
        if (x[1]) == 'PRP':
            count += 6
        
    return count

In [183]:
###################################################################################################################
############################################ FEATURE EXTRACTION ###################################################
###################################################################################################################
def np_feature_extraction(X,y):
    X_list = [] ; y_list = []

    punktLine = PunktSentenceTokenizer()
    for x,line in enumerate(X):
        ### GET TARGET #####
        rating = int(y[x])
     
        # GET POS AND NEG COUNTS
        posCount, negCount = pos_neg_count(line)
        ## POS TAGGING SUPERLATIVES
        sent = ' '.join(line)
        token = punktLine.tokenize(sent)
        superlative = superlative_count(token)
        ### Apply guessed rating
        rating_result = return_rating(posCount, negCount)
        
        ## THIS IS THE NUMPY LINE WITH FEATURES
        line = [posCount, negCount, superlative, rating_result]
     
        ### APPEND LINES #####
        X_list.append(line)
        y_list.append(rating)
        
    ##TURN INTO NUMPY ARRAYS
    X_np = (np.array(X_list))
    y_np = (np.array(y_list))
    
    return X_np,y_np
#########################################################################################################################

In [184]:
X_train_np, y_train_np = np_feature_extraction(X,y)
X_test_np, y_test_np = np_feature_extraction(X_test, y_test)


In [185]:
for mclass in ('multinomial', 'ovr'):
    lr = LogisticRegression(solver='lbfgs', max_iter=30000, random_state=0, multi_class=mclass).fit(X_train_np, y_train_np)
    yhat = lr.predict(X_test_np)
     
    # the 3 lines below show how to invoke various output    
    print("\n",mclass,"Accuracy",accuracy_score(y_test_np, yhat))
    print("\n",mclass,"Classification Report\n",classification_report(y_test_np, yhat),sep="")
    print("\n",mclass,"Classification Report\n",confusion_matrix(y_test_np, yhat),sep="")


 multinomial Accuracy 0.49444197421076036

multinomialClassification Report
              precision    recall  f1-score   support

           1       0.09      0.53      0.15        43
           2       0.04      0.10      0.06        61
           3       0.10      0.19      0.13       161
           4       0.29      0.12      0.17       464
           5       0.74      0.66      0.69      1520

    accuracy                           0.49      2249
   macro avg       0.25      0.32      0.24      2249
weighted avg       0.57      0.49      0.52      2249


multinomialClassification Report
[[ 23   3   5   5   7]
 [ 21   6   9   5  20]
 [ 34  15  30  14  68]
 [ 49  36  59  57 263]
 [135  76 200 113 996]]

 ovr Accuracy 0.49755446865273456

ovrClassification Report
              precision    recall  f1-score   support

           1       0.08      0.56      0.14        43
           2       0.00      0.00      0.00        61
           3       0.10      0.20      0.13       161
      