<a href="https://colab.research.google.com/github/davidemichelon11/NLU/blob/main/NLU_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import nltk
import numpy as np
from nltk.corpus import movie_reviews
from nltk.corpus import subjectivity

nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('subjectivity')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package subjectivity to /root/nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!


True


**BASELINE SUBJECTIVITY**



In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import svm

In [7]:
def doc2string(doc):
  return " ".join([w for sent in doc for w in sent])

def sent2string(sent):
  return " ".join([w for w in sent])

In [8]:
import unicodedata
import re

def unicodeToAscii(s):
  return ''.join(
    c for c in unicodedata.normalize('NFD', s)
    if unicodedata.category(c) != 'Mn'
  )

def normalizeString(s):
  s = unicodeToAscii(s.lower().strip())
  s = re.sub(r"([.!?])", r" \1", s)
  s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
  return s

In [9]:
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')]

corpus_sub = [normalizeString(sent2string(d[0])) for d in subj_docs] + [normalizeString(sent2string(d[0])) for d in obj_docs]
labels_sub = np.array(['subj'] * len(subj_docs) + ['obj'] * len(obj_docs))
train_samples_sub, test_samples_sub, train_labels_sub, test_labels_sub = train_test_split(corpus_sub, labels_sub, test_size=0.3)

In [10]:
vectorizer_sub = CountVectorizer()
classifier_NB = MultinomialNB()

vectors = vectorizer_sub.fit_transform(corpus_sub)
scores = cross_validate(classifier_NB, vectors, labels_sub, cv=StratifiedKFold(n_splits=10) , scoring=['f1_micro'])
average = sum(scores['test_f1_micro'])/len(scores['test_f1_micro'])
print(round(average, 3))

0.92


In [11]:
# NB and SVM for subj
classifier_NB2_subj = MultinomialNB()
classifier_SVM_subj = svm.SVC()

vectors = vectorizer_sub.fit_transform(train_samples_sub + test_samples_sub)

classifier_NB2_subj.fit(vectors[:len(train_samples_sub)], train_labels_sub)
labels_pred_NB2 = classifier_NB2_subj.predict(vectors[len(train_labels_sub):])
print(classification_report(test_labels_sub, labels_pred_NB2, digits=3))

#SVM
classifier_SVM_subj.fit(vectors[:len(train_samples_sub)], train_labels_sub)
labels_pred_SVM = classifier_SVM_subj.predict(vectors[len(train_labels_sub):])
print(classification_report(test_labels_sub, labels_pred_SVM, digits=3))

              precision    recall  f1-score   support

         obj      0.928     0.900     0.914      1472
        subj      0.906     0.933     0.919      1528

    accuracy                          0.917      3000
   macro avg      0.917     0.916     0.917      3000
weighted avg      0.917     0.917     0.917      3000

              precision    recall  f1-score   support

         obj      0.883     0.882     0.883      1472
        subj      0.887     0.887     0.887      1528

    accuracy                          0.885      3000
   macro avg      0.885     0.885     0.885      3000
weighted avg      0.885     0.885     0.885      3000



**BASELINE via SVM - SA**

In [12]:
nltk.download('movie_reviews')
mr = movie_reviews
neg = mr.paras(categories='neg')
pos = mr.paras(categories='pos')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [13]:
vectorizer_sa = CountVectorizer()
classifier_sa = svm.SVC()

#each element is the review converted to string
corpus_sa = [normalizeString(doc2string(p)) for p in pos] + [normalizeString(doc2string(n)) for n in neg]

labels_sa = np.array([0] * len(pos) + [1] * len(neg))
train_samples_sa, test_samples_sa, train_labels_sa, test_labels_sa = train_test_split(corpus_sa, labels_sa, test_size=0.1)

In [14]:
#SVM with objective sentences
vectors = vectorizer_sa.fit_transform(train_samples_sa + test_samples_sa)
classifier_sa.fit(vectors[:len(train_samples_sa)], train_labels_sa)
labels_pred = classifier_sa.predict(vectors[len(train_labels_sa):])

print(classification_report(test_labels_sa, labels_pred, digits=3))

              precision    recall  f1-score   support

           0      0.821     0.639     0.719       108
           1      0.664     0.837     0.740        92

    accuracy                          0.730       200
   macro avg      0.743     0.738     0.730       200
weighted avg      0.749     0.730     0.729       200



In [15]:
def remove_obj_sentences(reviews, vectorizer_sub):
  new_list = []
  for rev in reviews:
    new_rev = []
    for s in rev:
      vector = vectorizer_sub.transform([sent2string(s)]).toarray()
      if classifier_NB2_subj.predict(vector) == ['subj']: 
        new_rev.append(s)
    new_list.append(new_rev)
  return new_list

In [16]:
# SVM without obj sentences --> circa 80-82% accuracy
# For each review, remove obj sentences and compute the SVM
vectorizer_sa_subj = CountVectorizer()
classifier_SVM_subj = svm.SVC()
     
new_pos = remove_obj_sentences(pos, vectorizer_sub)
new_neg = remove_obj_sentences(neg, vectorizer_sub)

corpus_sa_subj = [normalizeString(doc2string(p)) for p in new_pos] + [normalizeString(doc2string(n)) for n in new_neg]
labels_sa_subj = np.array([0] * len(new_pos) + [1] * len(new_neg))
train_samples_, test_samples_, train_labels_, test_labels_ = train_test_split(corpus_sa_subj, labels_sa_subj, test_size=0.2)

vectors_ = vectorizer_sa_subj.fit_transform(train_samples_ + test_samples_)
classifier_SVM_subj.fit(vectors_[:len(train_samples_)], train_labels_)
labels_pred_ = classifier_SVM_subj.predict(vectors_[len(train_labels_):])

print(classification_report(test_labels_, labels_pred_, digits=3))

              precision    recall  f1-score   support

           0      0.808     0.746     0.776       197
           1      0.771     0.828     0.798       203

    accuracy                          0.787       400
   macro avg      0.789     0.787     0.787       400
weighted avg      0.789     0.787     0.787       400



**VADER Baseline**

In [17]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer, VaderConstants

In [18]:
# Analyse complete review
analyzer = SentimentIntensityAnalyzer()
labels_vader = np.array([0] * len(neg) + [1] * len(pos))
prediction_val = [analyzer.polarity_scores(doc2string(v)) for v in (pos + neg)]
prediction_labels = [0 if p['pos'] > p['neg'] else 1 for p in prediction_val]

print(classification_report(labels_sa, prediction_labels, digits=3))

              precision    recall  f1-score   support

           0      0.583     0.842     0.689      1000
           1      0.715     0.397     0.511      1000

    accuracy                          0.620      2000
   macro avg      0.649     0.619     0.600      2000
weighted avg      0.649     0.620     0.600      2000



In [19]:
# Analyse each sentence of review, sum sentences contribution as 1
prediction_labels = []

for rev in (pos+neg):
  pos_ = 0
  neg_ = 0
  for sent in rev:
    p = analyzer.polarity_scores(" ".join([w for w in sent]))
    if p['pos'] > p['neg']: pos_ += 1
    else: neg_ += 1
  prediction_labels.append(0 if pos_ > neg_ else 1)
print(classification_report(labels_sa, prediction_labels, digits=3))

prediction_labels = []

for rev in (pos+neg):
  pos_ = 0
  neg_ = 0
  for sent in rev:
    p = analyzer.polarity_scores(" ".join([w for w in sent]))
    if p['pos'] > p['neg']: pos_ += p['pos']
    else: neg_ += p['neg']
  prediction_labels.append(0 if pos_ > neg_ else 1)
print(classification_report(labels_sa, prediction_labels, digits=3))

              precision    recall  f1-score   support

           0      0.698     0.500     0.583      1000
           1      0.611     0.784     0.687      1000

    accuracy                          0.642      2000
   macro avg      0.654     0.642     0.635      2000
weighted avg      0.654     0.642     0.635      2000

              precision    recall  f1-score   support

           0      0.602     0.843     0.702      1000
           1      0.738     0.442     0.553      1000

    accuracy                          0.642      2000
   macro avg      0.670     0.642     0.628      2000
weighted avg      0.670     0.642     0.628      2000



**Objectivity detection using a simple RNN and LSTM**

In [20]:
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()

In [21]:
batch_size = 120
seq_len = 50
word_embedding = 300
epochs = 10
learning_rate = 0.01
n_hidden = 128
n_categories = 2

In [22]:
# before tried to create a vocabulary and 1-hot encoder but dimension of 20k and very big
def create_tens(rev, seq_len, word_embedding):
  rev = normalizeString(rev).split()
  if len(rev) > seq_len: rev = rev[:seq_len]

  sent = []
  for i, w in enumerate(rev):
    vector = nlp.vocab[w].vector
    sent.append(vector)

  z = list(np.zeros(word_embedding, dtype=np.float32))
  if len(sent) > seq_len:
    return sent[:seq_len]
  else:
    diff = seq_len - len(sent)
    zs = [z for each in range(diff)]
    return zs + sent

In [23]:
class Db(data.Dataset):
  def __init__(self, rev, labels):
    self.rev = rev
    self.labels = labels

  def __len__(self):
    return len(self.rev)

  def __getitem__(self, idx: int):
    return torch.tensor(self.rev[idx]), torch.tensor(self.labels[idx])

In [24]:
import spacy
spacy.cli.download('en_core_web_lg')
nlp = spacy.load('en_core_web_lg')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [25]:
labels = np.append(np.zeros((len(subj_docs)), dtype=int), np.ones((len(subj_docs)), dtype=int))
corpus = [sent2string(d[0]).lower() for d in subj_docs] + [sent2string(d[0]).lower() for d in obj_docs]

train_samples, test_samples, train_labels, test_labels = train_test_split(corpus, labels, test_size=0.3)

train_samples = [create_tens(rev, seq_len, word_embedding) for rev in train_samples]
test_samples = [create_tens(rev, seq_len, word_embedding) for rev in test_samples]

# train samples are tensors of seq_len x word_embedding
train_dataset = Db(train_samples, train_labels)
test_dataset = Db(test_samples, test_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

In [26]:
# Thanks to lab of DL
class RNN(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(RNN, self).__init__()
    
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    
    self.i2h = nn.RNN(input_size, hidden_size, batch_first=True)
    self.i2o = nn.Linear(hidden_size, output_size)
  
  def forward(self, input, hidden=None):
    if hidden==None:
      hidden = self.init_hidden(input.shape[0])
    output, _ = self.i2h(input, hidden)
    output = self.i2o(output[:, -1])
    return output

  def init_hidden(self,shape=1):
    return torch.zeros(1, shape, self.hidden_size)

In [27]:
# training
def train(rnn, optimizer, train_loader, e):

  cumulative_accuracy = 0
  samples=0
  pbar = tqdm(train_loader)
  for x,y in pbar:
    x,y = x.to(device),y.to(device)
    outputs = rnn(x)
    loss = criterion(outputs, y.long())
    _, predicted = outputs.max(1)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    samples += x.shape[0]
    cumulative_accuracy += predicted.eq(y).sum().item()
    pbar.set_description('Epoch {}/{}, Train accuracy: {:.2f}'.format(e+1,epochs, cumulative_accuracy/samples*100))
  return cumulative_accuracy/samples*100

In [28]:
def evaluate(rnn, test_loader):

  cumulative_accuracy = 0
  samples=0
  pbar = tqdm(test_loader)
  for x,y in pbar:
    x,y = x.to(device),y.to(device)
    outputs = rnn(x)
    loss = criterion(outputs, y.long())
    _, predicted = outputs.max(1)
    samples += x.shape[0]
    cumulative_accuracy += predicted.eq(y).sum().item()
    pbar.set_description('Evaluate accuracy: {:.2f}'.format(cumulative_accuracy/samples*100))
  return cumulative_accuracy/samples*100

In [None]:
batch_size = 120
seq_len = 50
word_embedding = 300
epochs = 10
learning_rate = 0.01
n_hidden = 128
n_categories = 2
#accuracy 87.56

rnn = RNN(word_embedding, n_hidden, n_categories)
optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)

evaluate(rnn, test_loader)

for e in range(epochs):
  train(rnn, optimizer, train_loader, e)

evaluate(rnn, test_loader)


In [51]:
class LSTM(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(LSTM, self).__init__()
    
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.i2h = nn.LSTM(input_size, hidden_size, batch_first=True)
    self.dropout = nn.Dropout(0.3)
    self.i2o = nn.Linear(hidden_size, output_size)
      
  def forward(self, input, hidden=None, cell=None):
    if hidden==None:
      hidden = self.init_hidden(input.shape[0])
    if cell==None:
      cell = self.init_hidden(input.shape[0])

    output, (_,_)= self.i2h(input, (hidden,cell))
    output = self.dropout(output)
    output = self.i2o(output)
    output = output[:, -1]
    return output

  def init_hidden(self,shape=1):
    return torch.zeros(1, shape, self.hidden_size)
    
  def init_cell(self,shape=1):
    return torch.zeros(1, shape, self.hidden_size)

In [54]:
lstm = LSTM(word_embedding, n_hidden, n_categories)
optimizer = torch.optim.Adam(lstm.parameters(), lr = 3e-4)

evaluate(lstm, test_loader)

for e in range(epochs):
  train(lstm, optimizer, train_loader, e)
  evaluate(lstm, test_loader)

evaluate(lstm, test_loader)

Evaluate accuracy: 47.07: 100%|██████████| 25/25 [00:10<00:00,  2.43it/s]
Epoch 1/10, Train accuracy: 72.70: 100%|██████████| 59/59 [00:23<00:00,  2.49it/s]
Evaluate accuracy: 88.20: 100%|██████████| 25/25 [00:08<00:00,  3.03it/s]
Epoch 2/10, Train accuracy: 89.46: 100%|██████████| 59/59 [00:23<00:00,  2.50it/s]
Evaluate accuracy: 90.83: 100%|██████████| 25/25 [00:08<00:00,  3.06it/s]
Epoch 3/10, Train accuracy: 91.04: 100%|██████████| 59/59 [00:23<00:00,  2.50it/s]
Evaluate accuracy: 91.57: 100%|██████████| 25/25 [00:08<00:00,  3.06it/s]
Epoch 4/10, Train accuracy: 92.06: 100%|██████████| 59/59 [00:23<00:00,  2.50it/s]
Evaluate accuracy: 91.60: 100%|██████████| 25/25 [00:08<00:00,  3.07it/s]
Epoch 5/10, Train accuracy: 92.51: 100%|██████████| 59/59 [00:23<00:00,  2.50it/s]
Evaluate accuracy: 91.10: 100%|██████████| 25/25 [00:08<00:00,  3.05it/s]
Epoch 6/10, Train accuracy: 93.06: 100%|██████████| 59/59 [00:23<00:00,  2.49it/s]
Evaluate accuracy: 91.87: 100%|██████████| 25/25 [00:11<00

92.16666666666666

**Sentiment Analysis with RNN and LSTM**

In [None]:
import nltk
from nltk.corpus import movie_reviews

nltk.download('movie_reviews')

mr = movie_reviews
neg = mr.paras(categories='neg')
pos = mr.paras(categories='pos')

In [None]:
# corpus_sa_subj contains only subjectivity

print(pos[0])
print(corpus_sa[0])

train_samples_sa, test_samples_sa, train_labels_sa, test_labels_sa = train_test_split(corpus_sa_subj, labels_sa_subj, test_size=0.2)

train_samples_sa = [create_tens(x, 500, word_embedding) for x in train_samples_sa]
test_samples_sa = [create_tens(x, 500, word_embedding) for x in test_samples_sa]

train_dataset_sa = Db(train_samples_sa, train_labels_sa)
test_dataset_sa = Db(test_samples_sa, test_labels_sa)

train_loader_sa = torch.utils.data.DataLoader(train_dataset_sa, batch_size=batch_size, shuffle=True)
test_loader_sa = torch.utils.data.DataLoader(test_dataset_sa, batch_size=batch_size, shuffle=True)

In [None]:
lstm_sa = LSTM(word_embedding, n_hidden, n_categories)
optimizer = torch.optim.Adam(lstm_sa.parameters(), lr = 3e-4)
# evaluate(rnn, test_loader_sa)

for e in range(epochs):
  train(lstm_sa, optimizer_sa, train_loader_sa, e)
  evaluate(lstm_sa, test_loader_sa) 

evaluate(lstm_sa, test_loader_sa)

In [None]:
from nltk.corpus import subjectivity
subjectivity.categories()

subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')]

print(len(subj_docs))
print(len(obj_docs))

tot_len = 0
for s in (subj_docs + obj_docs):
  #print(s[0])
  tot_len += len(s[0])

print(tot_len/len(subj_docs + obj_docs))


In [None]:
from nltk.corpus import movie_reviews

mr = movie_reviews
neg = mr.paras(categories='neg')
pos = mr.paras(categories='pos')

tot_len = 0
tkn = 0

sentences = 0
for rev in (pos + neg):
  for sent in rev:
    sentences += 1
     
    for token in sent:
      tkn += 1

print(sentences)
print('average of sentences: {}'.format(sentences/ len(pos+neg)))
print(tkn/len(pos + neg))