In [207]:
import numpy as np
import csv
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from nltk.corpus import opinion_lexicon as op
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag

np.set_printoptions(linewidth=400)   # optional: allow up to 400 characters per printed line so wide numpy arrays are not wrapped

In [208]:
# Build the positive and negative word sets from the NLTK opinion lexicon files.
pos_words, neg_words = (
    set(op.words(lexicon_file))
    for lexicon_file in ('positive-words.txt', 'negative-words.txt')
)

In [209]:
X = []
y = []
feature_cols = ["word_count", "pos"]  # NOTE(review): not referenced by any visible cell; kept in case a later cell uses it


# Each non-blank line of reviews.txt is lower-cased and tokenized. The final
# token is parsed as the integer rating (appended to y) and the tokens before
# it are kept as the review text (appended to X).
with open("reviews.txt", 'r', encoding='utf8') as reviews:
    for review in reviews:
        # Iterating a text file yields each line with its trailing newline, so
        # a blank line arrives as "\n" rather than "". Strip first and only
        # then test for emptiness; otherwise a blank line tokenizes to [] and
        # indexing the last token raises IndexError.
        review = review.strip()
        if not review:
            continue
        tokens = word_tokenize(review.lower())
        X.append(tokens[:-1])
        # int() raises ValueError if the last token is not a rating; that is
        # left to fail loudly so malformed lines are noticed.
        y.append(int(tokens[-1]))

In [210]:
# Write one feature row per review: token count, number of tokens found in the
# positive lexicon, number of tokens found in the negative lexicon, and rating.
with open("reviews.csv", 'w', newline='') as rv:
    thefile = csv.writer(rv)
    thefile.writerow(['len_word', 'pos', 'neg', 'rating'])
    for tokens, rating in zip(X, y):
        n_pos = sum(1 for tok in tokens if tok in pos_words)
        # A token present in both lexicons is counted as positive only.
        n_neg = sum(
            1 for tok in tokens
            if tok not in pos_words and tok in neg_words
        )
        thefile.writerow([len(tokens), n_pos, n_neg, rating])
   
    

In [211]:
# Load the feature table from reviews.csv; the first row supplies the column names.
df = pd.read_csv("reviews.csv")

# Display (rows, columns) as a quick sanity check on the loaded table.
df.shape

(2499, 4)

In [212]:
# Separate the target column ('rating') from the remaining feature columns.
y = df['rating']
X = df.drop(columns=['rating'])

In [213]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [214]:
# Fit one logistic-regression model per multi-class strategy and print its
# accuracy, classification report and confusion matrix on the test split.
# NOTE(review): the `multi_class` argument is deprecated in recent
# scikit-learn releases; confirm against the installed version before upgrading.
for mclass in ('multinomial', 'ovr'):
    lr = LogisticRegression(solver='lbfgs', max_iter=30000, random_state=0, multi_class=mclass).fit(X_train, y_train)
    yhat = lr.predict(X_test)

    print("\n", mclass, "Accuracy", accuracy_score(y_test, yhat))
    # sep="" keeps the multi-line report flush with the left margin, so the
    # space between the strategy name and the heading is written explicitly.
    print("\n", mclass, " Classification Report\n", classification_report(y_test, yhat), sep="")
    # Labelled as a confusion matrix (not a second classification report);
    # rows are true labels and columns are predicted labels.
    print("\n", mclass, " Confusion Matrix\n", confusion_matrix(y_test, yhat), sep="")


 multinomial Accuracy 0.684

multinomialClassification Report
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00        42
           4       0.00      0.00      0.00       102
           5       0.69      0.99      0.81       344

    accuracy                           0.68       500
   macro avg       0.14      0.20      0.16       500
weighted avg       0.47      0.68      0.56       500


multinomialClassification Report
[[  0   0   0   0   2]
 [  0   0   0   0  10]
 [  0   0   0   0  42]
 [  1   1   0   0 100]
 [  2   0   0   0 342]]

 ovr Accuracy 0.688

ovrClassification Report
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00        42
           4       0.00      0.00 


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.

