In [1]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
from transformers import pipeline
import torch

from sklearn.metrics import roc_auc_score, f1_score

# Fetch the NLTK data packages this notebook needs.
# "stopwords" backs the stopwords.words('english') lookup used in find_features;
# "punkt_tab" is presumably the tokenizer data word_tokenize relies on — TODO confirm.
nltk.download("punkt_tab")
nltk.download("stopwords")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rostyslavbalytskiy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rostyslavbalytskiy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the negative-polarity file; each line is treated as one review.
fn = 'rt-polarity.neg'
# errors='ignore' silently drops any bytes that are not valid UTF-8.
with open(fn, "r", encoding='utf-8', errors='ignore') as f:
    content = f.read()
texts_neg = content.splitlines()

# Report the corpus size and show the first few reviews as a sanity check.
print('len of texts_neg = {:,}'.format(len(texts_neg)))
for review in texts_neg[:5]:
    print('\n', review)

len of texts_neg = 5,331

 simplistic , silly and tedious . 

 it's so laddish and juvenile , only teenage boys could possibly find it funny . 

 exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 

 [garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . 

 a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . 


In [3]:
# Load the positive-polarity file; each line is treated as one review.
fn = 'rt-polarity.pos'
# errors='ignore' silently drops any bytes that are not valid UTF-8.
with open(fn, "r", encoding='utf-8', errors='ignore') as f:
    content = f.read()
texts_pos = content.splitlines()

# Report the corpus size and show the first few reviews as a sanity check.
print('len of texts_pos = {:,}'.format(len(texts_pos)))
for review in texts_pos[:5]:
    print('\n', review)

len of texts_pos = 5,331

 the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

 the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . 

 effective but too-tepid biopic

 if you sometimes like to go to the movies to have fun , wasabi is a good place to start . 

 emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . 


In [4]:
# Each class is split separately with the same settings: 20% held out for testing
# and a fixed random_state so the split is repeatable.
split_options = {"test_size": 0.2, "random_state": 42}

X_pos_train, X_pos_test = train_test_split(texts_pos, **split_options)
X_neg_train, X_neg_test = train_test_split(texts_neg, **split_options)

In [5]:
def preprocess_text(text):
    """Normalize a review string before tokenization.

    Steps, in order:
      1. remove HTML-style tags (``<...>``),
      2. remove URLs (``http://`` or ``https://`` up to the next whitespace),
      3. remove every character that is not an ASCII letter, digit or whitespace,
      4. lowercase the result.

    Args:
        text: the raw review string.

    Returns:
        The cleaned, lowercased string. Whitespace is not collapsed, so removing
        a token can leave consecutive spaces behind.
    """
    import re  # local import: the notebook's import cell does not import re

    text = re.sub('<.*?>', '', text)
    # Match only the URL itself. The previous pattern 'https://.*' was greedy to the
    # end of the line (deleting all text after a link) and ignored plain http:// links.
    text = re.sub(r'https?://\S+', '', text)
    # Punctuation is deleted rather than replaced, so "it's" becomes "its".
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower()

In [6]:
# Clean every split with preprocess_text. The names are rebound, so from this cell
# onward the X_* lists hold the cleaned strings, not the original lines.
X_pos_train = list(map(preprocess_text, X_pos_train))
X_pos_test = list(map(preprocess_text, X_pos_test))
X_neg_train = list(map(preprocess_text, X_neg_train))
X_neg_test = list(map(preprocess_text, X_neg_test))

In [7]:
# Built once: calling stopwords.words('english') inside the per-token filter rebuilt
# the stopword list for every token and scanned it linearly. A frozenset gives the
# same membership answers with a constant-time lookup.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def find_features(review):
    """Turn a review string into a bag-of-words feature dict.

    The review is tokenized with word_tokenize, tokens whose lowercase form is an
    English stopword are dropped, and each remaining token becomes a key mapped
    to True.

    Args:
        review: the review text.

    Returns:
        dict mapping each kept token to True (repeated tokens collapse to one key).
    """
    return {
        word: True
        for word in word_tokenize(review)
        if word.lower() not in ENGLISH_STOPWORDS
    }

In [8]:
def _labelled(reviews, label):
    """Pair the feature dict of every review with the given class label."""
    return [(find_features(review), label) for review in reviews]


# Positive reviews are labelled '1' and negative reviews '0'; positives come first.
X_y_train = _labelled(X_pos_train, '1') + _labelled(X_neg_train, '0')
X_y_test = _labelled(X_pos_test, '1') + _labelled(X_neg_test, '0')

# Gold labels of the test set, in the same order as X_y_test.
y_test_true = [pair[1] for pair in X_y_test]

In [9]:
from sklearn.metrics import classification_report


# Train NLTK's Naive Bayes classifier on the labelled feature dicts, then predict a
# label for each test feature dict (the gold label in each pair is ignored here).
nb_classifier = nltk.NaiveBayesClassifier.train(X_y_train)
test_featuresets = [pair[0] for pair in X_y_test]
y_pred_nb = list(map(nb_classifier.classify, test_featuresets))

print("\n[Naive Bayes Results]")
print(classification_report(y_test_true, y_pred_nb, digits=4))




[Naive Bayes Results]
              precision    recall  f1-score   support

           0     0.7768    0.7601    0.7684      1067
           1     0.7651    0.7816    0.7733      1067

    accuracy                         0.7709      2134
   macro avg     0.7710    0.7709    0.7708      2134
weighted avg     0.7710    0.7709    0.7708      2134



In [10]:
# Wrap scikit-learn's LogisticRegression so it can be trained on NLTK-style
# (feature dict, label) pairs.
lr_classifier = SklearnClassifier(LogisticRegression(max_iter=1000))
lr_classifier.train(X_y_train)

# Predict the whole test set with one classify_many call instead of one classify
# call per review; per the NLTK docs this vectorizes the batch and predicts once,
# rather than repeating that work for every single review.
y_pred_lr = lr_classifier.classify_many([feats for feats, _ in X_y_test])

print("\n[Logistic Regression Results]")
print(classification_report(y_test_true, y_pred_lr, digits=4))



[Logistic Regression Results]
              precision    recall  f1-score   support

           0     0.7539    0.7666    0.7602      1067
           1     0.7626    0.7498    0.7561      1067

    accuracy                         0.7582      2134
   macro avg     0.7583    0.7582    0.7582      2134
weighted avg     0.7583    0.7582    0.7582      2134



In [11]:
# Zero-shot baseline: no training, the model picks between the two candidate labels.
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# NOTE(review): X_pos_test / X_neg_test were rebound to the preprocess_text output in an
# earlier cell, so despite the `raw_` prefix these are the cleaned (lowercased,
# punctuation-stripped) strings — confirm this is intended.
raw_X_test = X_pos_test + X_neg_test
raw_y_true = ['1'] * len(X_pos_test) + ['0'] * len(X_neg_test)

candidate_labels = ["positive", "negative"]

print("\n[Zero-Shot Classification Running...]")
# Each review is cut to its first 512 characters before being classified. The first
# entry of the returned 'labels' list is taken as the prediction (presumably the
# top-scoring label — confirm against the pipeline docs); 'positive' maps to '1',
# anything else to '0'.
y_pred_zs = [
    '1' if zero_shot_classifier(review[:512], candidate_labels)['labels'][0] == 'positive' else '0'
    for review in raw_X_test
]

print("\n[Zero-Shot Classification Results]")
print(classification_report(raw_y_true, y_pred_zs, digits=4))

Device set to use mps:0



[Zero-Shot Classification Running...]

[Zero-Shot Classification Results]
              precision    recall  f1-score   support

           0     0.7761    0.9063    0.8361      1067
           1     0.8874    0.7385    0.8061      1067

    accuracy                         0.8224      2134
   macro avg     0.8317    0.8224    0.8211      2134
weighted avg     0.8317    0.8224    0.8211      2134

