In [2]:
from bs4 import BeautifulSoup

In [1]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [3]:
wn_lemmatizer = WordNetLemmatizer()
port_stemmer = PorterStemmer()

In [4]:
with open('../electronics/positive.review') as fid:
    positive_review_html = fid.read()
positive_reviews = BeautifulSoup(positive_review_html).findAll('review_text')

In [5]:
len(positive_reviews)

1000

In [6]:
with open('../electronics/negative.review') as fid:
    negative_review_html = fid.read()
negative_reviews = BeautifulSoup(negative_review_html).findAll('review_text')

In [7]:
len(negative_reviews)

1000

In [8]:
import nltk

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
all_texts = [review.text.strip().lower() for review in positive_reviews]
all_texts += [review.text.strip().lower() for review in negative_reviews]
labels = ['positive' for _ in positive_reviews]
labels += ['negative' for _ in negative_reviews]

In [12]:
for i, text in enumerate(all_texts):
    words = nltk.tokenize.word_tokenize(text)
    text = ' '.join(map(lambda w: port_stemmer.stem(wn_lemmatizer.lemmatize(w)), words))
    all_texts[i] = text

In [13]:
all_texts[0]

'i purchas thi unit due to frequent blackout in my area and 2 power suppli go bad . it will run my cabl modem , router , pc , and lcd monitor for 5 minut . thi is more than enough time to save work and shut down . equal import , i know that my electron are receiv clean power . i feel that thi invest is minor compar to the loss of valuabl data or the failur of equip due to a power spike or an irregular power suppli . a alway , amazon had it to me in < 2 busi day'

In [21]:
with open('../stopwords.txt') as fid:
    stop_words = set(map(lambda w: port_stemmer.stem(w.strip()), fid.readlines()))

In [24]:
cnt_vectorizer = CountVectorizer(
    stop_words=stop_words
)
X = cnt_vectorizer.fit_transform(all_texts)

In [25]:
X.shape

(2000, 8010)

In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

In [28]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [44]:
model = LogisticRegression(C=1, max_iter=200)

In [45]:
model.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
model.score(X_test, y_test)

0.775

In [47]:
model.score(X_train, y_train)

0.9971428571428571

In [48]:
from sklearn.metrics import classification_report

In [49]:
y_hat = model.predict(X)

In [50]:
print(classification_report(y, y_hat))

precision    recall  f1-score   support

           0       0.94      0.92      0.93      1000
           1       0.92      0.94      0.93      1000

    accuracy                           0.93      2000
   macro avg       0.93      0.93      0.93      2000
weighted avg       0.93      0.93      0.93      2000



In [51]:
cnt_vectorizer.get_feature_names()[:10]

['00',
 '000',
 '002',
 '007radardetectors',
 '00ghz',
 '01',
 '010',
 '0183',
 '04',
 '05']

## Extra
- https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html