In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.stem.porter import PorterStemmer

In [2]:
reviews = []
for line in open('all_balanced.review_e', 'r'):
    reviews.append(line.strip())
#reviews[0]

In [3]:
text=[]
rate_text=[]
for review in reviews:
    for match in re.finditer(r'(#label#:\d)', review):
        text.append(review[0:match.span()[0]])
        rate_text.append(match.groups()[0])
#print(text[0])
#print(rate_text)

In [4]:
ratings = [int(int(rate[8:9])>3) for rate in rate_text]
#ratings

In [5]:
def remove_count(text):
    i=0
    for match in re.finditer(r'(:\d)',text):
        text = text[0:match.span()[0]-i] + text[match.span()[1]-i:len(text)]
        i += 2
    return(text)

In [7]:
clean_text=[]
for line in text:
    clean_text.append(remove_count(line))
#clean_text[0]

In [8]:
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\<)|(\>)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)|(\_)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

In [9]:
reviews_clean = preprocess_reviews(clean_text)
#reviews_clean[0]

In [10]:
porter=PorterStemmer()

reviews_clean_stemmed = []
for reviews in reviews_clean:
    stems = [porter.stem(word)+' ' for word in reviews.split()]
    reviews_clean_stemmed.append("".join(stems))
#reviews_clean_stemmed[0]

In [11]:
cv = CountVectorizer(binary=True)
cv.fit(reviews_clean_stemmed)
X = cv.transform(reviews_clean_stemmed)
#print(X.toarray())
#print(cv.get_feature_names())
print(X.shape)

(5901, 19542)


In [12]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(reviews_clean_stemmed)
print(X.toarray())
print(X.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(5901, 19542)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, ratings, train_size = 0.75, random_state=1)



In [14]:
X_train2, X_val, y_train2, y_val = train_test_split(
    X_train, y_train, train_size = 0.75, random_state=1)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train2, y_train2)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.8130081300813008
Accuracy for C=0.05: 0.8220415537488708
Accuracy for C=0.25: 0.8355916892502259
Accuracy for C=0.5: 0.8455284552845529
Accuracy for C=1: 0.8536585365853658


In [15]:
final_model = LogisticRegression(C=.25)
final_model.fit(X_train, y_train)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, final_model.predict(X_test)))

Final Accuracy: 0.8367208672086721


In [16]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), final_model.coef_[0]
    )
}

In [17]:
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:15]:
    print (best_positive)

('great', 3.6650672713273735)
('easi', 2.1264550530605373)
('good', 1.7290371930967103)
('excel', 1.6513705833978531)
('price', 1.6149153708910073)
('littl', 1.5471430463587588)
('well', 1.4453883839420179)
('love', 1.3221256877828178)
('best', 1.26580441840688)
('and', 1.2488612129365388)
('perfect', 1.1706005784230284)
('veri', 1.1537493368006366)
('happi', 1.022035869794069)
('highli', 1.007894147905282)
('need', 0.951830019813722)


In [18]:
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:15]:
    print (best_negative)

('not', -2.81110633756977)
('return', -2.2258778145872355)
('poor', -1.7299336542655193)
('tri', -1.5323506054346638)
('wast', -1.412525693332159)
('after', -1.3877357936863717)
('disappoint', -1.2880860212297287)
('bad', -1.2468944069542889)
('support', -1.2053012642834906)
('buy', -1.1197920555283813)
('back', -1.0974955402841133)
('doesnt', -0.9250799669608176)
('stop', -0.8933394511216963)
('call', -0.8865215187321196)
('they', -0.8835064378998665)


In [19]:
test_reviews = []
for line in open('all_balanced.review_k', 'r'):
    test_reviews.append(line.strip())
test_reviews[0]

In [20]:
text=[]
rate_text=[]
for review in test_reviews:
    for match in re.finditer(r'(#label#:\d)', review):
        text.append(review[0:match.span()[0]])
        rate_text.append(match.groups()[0])

In [21]:
test_ratings = [int(int(rate[8:9])>3) for rate in rate_text]

In [22]:
clean_text=[]
for line in text:
    clean_text.append(remove_count(line))

In [23]:
reviews_clean = preprocess_reviews(clean_text)

In [24]:
reviews_clean_stemmed = []
for reviews in reviews_clean:
    stems = [porter.stem(word)+' ' for word in reviews.split()]
    reviews_clean_stemmed.append("".join(stems))

In [25]:
X_testf = tfidf.transform(reviews_clean_stemmed)
#print(X.toarray())
#print(cv.get_feature_names())
print(X_testf.shape)

(5149, 19542)


In [26]:
print ("Complete Accuracy: %s" 
       % accuracy_score(test_ratings, final_model.predict(X_testf)))

Complete Accuracy: 0.8292872402408235
