<a href="https://colab.research.google.com/github/davidemichelon11/NLU/blob/main/NLU_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import numpy as np
from nltk.corpus import movie_reviews
from nltk.corpus import subjectivity

nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('subjectivity')


**BASELINE SUBJECTIVITY**



In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import svm

In [9]:
def doc2string(doc):
  return " ".join([w for sent in doc for w in sent])

def sent2string(sent):
  return " ".join([w for w in sent])

In [10]:
vectorizer = CountVectorizer()
classifier_NB = MultinomialNB()

subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')]

corpus = [sent2string(d[0]).lower() for d in subj_docs] + [sent2string(d[0]).lower() for d in obj_docs]
vectors = vectorizer.fit_transform(corpus)

labels = np.array(['subj'] * len(subj_docs) + ['obj'] * len(obj_docs))
scores = cross_validate(classifier_NB, vectors, labels, cv=StratifiedKFold(n_splits=10) , scoring=['f1_micro'])
average = sum(scores['test_f1_micro'])/len(scores['test_f1_micro'])
print(round(average, 3))

0.921


In [11]:
# NB and SVM for subj
classifier_NB2_subj = MultinomialNB()
classifier_SVM_subj = svm.SVC()

corpus = [sent2string(d[0]).lower() for d in subj_docs] + [sent2string(d[0]).lower() for d in obj_docs]
labels = np.array(['subj'] * len(subj_docs) + ['obj'] * len(obj_docs))
train_samples, test_samples, train_labels, test_labels = train_test_split(corpus, labels, test_size=0.3)
vectors = vectorizer.fit_transform(train_samples + test_samples)

classifier_NB2_subj.fit(vectors[:len(train_samples)], train_labels)
labels_pred_NB2 = classifier_NB2_subj.predict(vectors[len(train_labels):])
print(classification_report(test_labels, labels_pred_NB2, digits=3))

classifier_SVM_subj.fit(vectors[:len(train_samples)], train_labels)
labels_pred_SVM = classifier_SVM_subj.predict(vectors[len(train_labels):])
print(classification_report(test_labels, labels_pred_SVM, digits=3))

              precision    recall  f1-score   support

         obj      0.934     0.916     0.925      1494
        subj      0.918     0.936     0.927      1506

    accuracy                          0.926      3000
   macro avg      0.926     0.926     0.926      3000
weighted avg      0.926     0.926     0.926      3000

              precision    recall  f1-score   support

         obj      0.889     0.884     0.887      1494
        subj      0.886     0.890     0.888      1506

    accuracy                          0.887      3000
   macro avg      0.887     0.887     0.887      3000
weighted avg      0.887     0.887     0.887      3000



**BASELINE via SVM - SA**

In [12]:
nltk.download('movie_reviews')
mr = movie_reviews
neg = mr.paras(categories='neg')
pos = mr.paras(categories='pos')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [13]:
vectorizer2 = CountVectorizer()
classifier_sa = svm.SVC()

corpus = [doc2string(p) for p in pos] + [doc2string(n) for n in neg]
labels = np.array([0] * len(pos) + [1] * len(neg))
train_samples, test_samples, train_labels, test_labels = train_test_split(corpus, labels, test_size=0.1)

vectors = vectorizer2.fit_transform(train_samples + test_samples)
classifier_sa.fit(vectors[:len(train_samples)], train_labels)
labels_pred = classifier_sa.predict(vectors[len(train_labels):])

print(classification_report(test_labels, labels_pred, digits=3))

              precision    recall  f1-score   support

           0      0.824     0.680     0.745       103
           1      0.713     0.845     0.774        97

    accuracy                          0.760       200
   macro avg      0.768     0.762     0.759       200
weighted avg      0.770     0.760     0.759       200



In [14]:
# For each review, remove obj sentences and compute the SVM
vectorizer3 = CountVectorizer()
classifier_sa2 = svm.SVC()

def get_new_rev(original):
  new_list = []
  for rev in original:
    new_rev = []
    for s in rev:
      vector = vectorizer.transform([sent2string(s)]).toarray()
      if classifier_NB2_subj.predict(vector) == ['subj']: 
        new_rev.append(s)
    new_list.append(new_rev)
  return new_list
              
new_pos = get_new_rev(pos)
new_neg = get_new_rev(neg)

corpus_ = [doc2string(p) for p in new_pos] + [doc2string(n) for n in new_neg]
labels_ = np.array([0] * len(new_pos) + [1] * len(new_neg))
train_samples_, test_samples_, train_labels_, test_labels_ = train_test_split(corpus_, labels_, test_size=0.1)

vectors_ = vectorizer3.fit_transform(train_samples_ + test_samples_)
classifier_sa2.fit(vectors_[:len(train_samples_)], train_labels_)
labels_pred_ = classifier_sa2.predict(vectors_[len(train_labels_):])

print(classification_report(test_labels_, labels_pred_, digits=3))

              precision    recall  f1-score   support

           0      0.792     0.792     0.792       101
           1      0.788     0.788     0.788        99

    accuracy                          0.790       200
   macro avg      0.790     0.790     0.790       200
weighted avg      0.790     0.790     0.790       200



**VADER Baseline**

In [15]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer, VaderConstants

In [16]:
# Analyse complete review
analyzer = SentimentIntensityAnalyzer()
labels_vader = np.array([0] * len(neg) + [1] * len(pos))
prediction_val = [analyzer.polarity_scores(doc2string(v)) for v in (pos + neg)]
prediction_labels = [0 if p['pos'] > p['neg'] else 1 for p in prediction_val]

print(classification_report(labels_vader, prediction_labels, digits=3))

              precision    recall  f1-score   support

           0      0.583     0.842     0.689      1000
           1      0.715     0.397     0.511      1000

    accuracy                          0.620      2000
   macro avg      0.649     0.619     0.600      2000
weighted avg      0.649     0.620     0.600      2000



In [17]:
# Analyse each sentence of review, sum sentences contribution as 1
prediction_labels = []

for rev in (pos+neg):
  pos_ = 0
  neg_ = 0
  for sent in rev:
    p = analyzer.polarity_scores(" ".join([w for w in sent]))
    if p['pos'] > p['neg']: pos_ += 1
    else: neg_ += 1
  prediction_labels.append(0 if pos_ > neg_ else 1)
print(classification_report(labels_vader, prediction_labels, digits=3))

prediction_labels = []

for rev in (pos+neg):
  pos_ = 0
  neg_ = 0
  for sent in rev:
    p = analyzer.polarity_scores(" ".join([w for w in sent]))
    if p['pos'] > p['neg']: pos_ += p['pos']
    else: neg_ += p['neg']
  prediction_labels.append(0 if pos_ > neg_ else 1)
print(classification_report(labels, prediction_labels, digits=3))

              precision    recall  f1-score   support

           0      0.698     0.500     0.583      1000
           1      0.611     0.784     0.687      1000

    accuracy                          0.642      2000
   macro avg      0.654     0.642     0.635      2000
weighted avg      0.654     0.642     0.635      2000

              precision    recall  f1-score   support

           0      0.602     0.843     0.702      1000
           1      0.738     0.442     0.553      1000

    accuracy                          0.642      2000
   macro avg      0.670     0.642     0.628      2000
weighted avg      0.670     0.642     0.628      2000



If time, another baseline on ['neu']
Aggiungere _NEG anche baseline

In [18]:
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()

In [19]:
import unicodedata
import re

def unicodeToAscii(s):
  return ''.join(
    c for c in unicodedata.normalize('NFD', s)
    if unicodedata.category(c) != 'Mn'
  )

def normalizeString(s):
  s = unicodeToAscii(s.lower().strip())
  s = re.sub(r"([.!?])", r" \1", s)
  s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
  return s

In [20]:
from collections import Counter

def create_tens(rev, seq_len, word_embedding, doc):

  rev = normalizeString(rev).split()
  if len(rev) > seq_len: rev = rev[:seq_len]
  # rev2Tens = torch.zeros(seq_len, word_embedding)
  sent = []
  for i, w in enumerate(rev):
    vector = nlp.vocab[w].vector
    sent.append(vector)

  z = list(np.zeros(word_embedding, dtype=np.float32))
  if len(sent) > seq_len:
    return sent[:seq_len]
  else:
    diff = seq_len - len(sent)
    zs = [z for each in range(diff)]
    return zs + sent

In [21]:
class ObjDataset (data.Dataset):
  def __init__(self, rev, labels):
    self.rev = rev
    self.labels = labels

  def __len__(self):
    return len(self.rev)

  def __getitem__(self, idx: int):
    return torch.tensor(self.rev[idx]), torch.tensor(self.labels[idx])

In [None]:
import spacy
spacy.cli.download('en_core_web_lg')
nlp = spacy.load('en_core_web_lg')
nlp.max_length = 1300000

text = ""
corpus = [sent2string(d[0]).lower() for d in subj_docs] + [sent2string(d[0]).lower() for d in obj_docs]
for sent in corpus:
  sent = normalizeString(sent)
  text = text + " "+ sent

doc = nlp(text)

In [23]:
batch_size = 120
seq_len = 100
word_embedding = 300

labels = np.append(np.zeros((len(subj_docs)), dtype=int), np.ones((len(subj_docs)), dtype=int))

train_samples, test_samples, train_labels, test_labels = train_test_split(corpus, labels, test_size=0.3)

train_samples = [create_tens(rev, seq_len, word_embedding, doc) for rev in train_samples]
test_samples = [create_tens(rev, seq_len, word_embedding, doc) for rev in test_samples]

# train samples are tensors of seq_len x word_embedding
train_dataset = ObjDataset(train_samples, train_labels)
test_dataset = ObjDataset(test_samples, test_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,)

In [24]:
class RNN(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(RNN, self).__init__()
    
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    
    self.i2h = nn.RNN(input_size, hidden_size, batch_first=True)
    self.i2o = nn.Linear(hidden_size, output_size)
  
  def forward(self, input, hidden=None):
    if hidden==None:
      hidden = self.init_hidden(input.shape[0])
    output, _ = self.i2h(input, hidden)
    output = self.i2o(output[:, -1])
    return output

  def init_hidden(self,shape=1):
    return torch.zeros(1, shape, self.hidden_size)

In [25]:
# training
def train(rnn, optimizer, train_loader):

  cumulative_accuracy = 0
  samples=0

  for x,y in train_loader:
    x,y = x.to(device),y.to(device)
    outputs = rnn(x)
    loss = criterion(outputs, y.long())
    _, predicted = outputs.max(1)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    samples += x.shape[1]
  
    cumulative_accuracy += predicted.eq(y).sum().item()
  return cumulative_accuracy/samples*100

In [26]:
def evaluate(rnn, test_loader):

  cumulative_accuracy = 0
  samples=0
  for x,y in test_loader:
    x,y = x.to(device),y.to(device)
    outputs = rnn(x)
    loss = criterion(outputs, y.long())
    _, predicted = outputs.max(1)
    samples += x.shape[1]
    cumulative_accuracy += predicted.eq(y).sum().item()
  return cumulative_accuracy/samples*100

In [27]:
epochs = 10
learning_rate = 0.005
n_hidden = 128
n_categories = 2

rnn = RNN(word_embedding, n_hidden, n_categories)
optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)

test_accuracy = evaluate(rnn, test_loader)
print('Before training, test accuracy: {:.2f}'.format(test_accuracy))

for e in range(epochs):
  train_accuracy = train(rnn, optimizer, train_loader)
  print('Epoch: {}/{} Train accuracy: {:.2f}'.format(e+1, epochs, train_accuracy))

test_accuracy = evaluate(rnn, test_loader)
print('After training, test accuracy: {:.2f}'.format(test_accuracy))


  # Remove the CWD from sys.path while we load stuff.


Before training, test accuracy: 59.56
Epoch: 1/10 Train accuracy: 59.58
Epoch: 2/10 Train accuracy: 64.93
Epoch: 3/10 Train accuracy: 70.10
Epoch: 4/10 Train accuracy: 73.95
Epoch: 5/10 Train accuracy: 76.71
Epoch: 6/10 Train accuracy: 79.25
Epoch: 7/10 Train accuracy: 81.64
Epoch: 8/10 Train accuracy: 83.85
Epoch: 9/10 Train accuracy: 86.37
Epoch: 10/10 Train accuracy: 91.73
After training, test accuracy: 98.88
