<a href="https://colab.research.google.com/github/davidemichelon11/NLU/blob/main/NLU_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import numpy
from nltk.corpus import movie_reviews
from nltk.corpus import subjectivity

nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('subjectivity')


**BASELINE SUBJECTIVITY**



In [42]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import svm

In [43]:
def doc2string(doc):
  return " ".join([w for sent in doc for w in sent])

def sent2string(sent):
  return " ".join([w for w in sent])

In [44]:
vectorizer = CountVectorizer()
classifier_NB = MultinomialNB()

subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')]

corpus = [sent2string(d[0]).lower() for d in subj_docs] + [sent2string(d[0]).lower() for d in obj_docs]
vectors = vectorizer.fit_transform(corpus)

labels = numpy.array(['subj'] * len(subj_docs) + ['obj'] * len(obj_docs))
scores = cross_validate(classifier_NB, vectors, labels, cv=StratifiedKFold(n_splits=10) , scoring=['f1_micro'])
average = sum(scores['test_f1_micro'])/len(scores['test_f1_micro'])
print(round(average, 3))

0.921


In [45]:
# NB and SVM for subj
classifier_NB2_subj = MultinomialNB()
classifier_SVM_subj = svm.SVC()

corpus = [sent2string(d[0]).lower() for d in subj_docs] + [sent2string(d[0]).lower() for d in obj_docs]
labels = numpy.array(['subj'] * len(subj_docs) + ['obj'] * len(obj_docs))
train_samples, test_samples, train_labels, test_labels = train_test_split(corpus, labels, test_size=0.3)
vectors = vectorizer.fit_transform(train_samples + test_samples)

classifier_NB2_subj.fit(vectors[:len(train_samples)], train_labels)
labels_pred_NB2 = classifier_NB2_subj.predict(vectors[len(train_labels):])
print(classification_report(test_labels, labels_pred_NB2, digits=3))

classifier_SVM_subj.fit(vectors[:len(train_samples)], train_labels)
labels_pred_SVM = classifier_SVM_subj.predict(vectors[len(train_labels):])
print(classification_report(test_labels, labels_pred_SVM, digits=3))

              precision    recall  f1-score   support

         obj      0.929     0.902     0.915      1509
        subj      0.904     0.930     0.917      1491

    accuracy                          0.916      3000
   macro avg      0.916     0.916     0.916      3000
weighted avg      0.916     0.916     0.916      3000

              precision    recall  f1-score   support

         obj      0.887     0.875     0.881      1509
        subj      0.876     0.887     0.881      1491

    accuracy                          0.881      3000
   macro avg      0.881     0.881     0.881      3000
weighted avg      0.881     0.881     0.881      3000



**BASELINE via SVM - SA**

In [46]:
nltk.download('movie_reviews')
mr = movie_reviews
neg = mr.paras(categories='neg')
pos = mr.paras(categories='pos')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [47]:
vectorizer2 = CountVectorizer()
classifier_sa = svm.SVC()

corpus = [doc2string(p) for p in pos] + [doc2string(n) for n in neg]
labels = numpy.array([0] * len(pos) + [1] * len(neg))
train_samples, test_samples, train_labels, test_labels = train_test_split(corpus, labels, test_size=0.1)

vectors = vectorizer2.fit_transform(train_samples + test_samples)
classifier_sa.fit(vectors[:len(train_samples)], train_labels)
labels_pred = classifier_sa.predict(vectors[len(train_labels):])

print(classification_report(test_labels, labels_pred, digits=3))

              precision    recall  f1-score   support

           0      0.844     0.710     0.772       107
           1      0.718     0.849     0.778        93

    accuracy                          0.775       200
   macro avg      0.781     0.780     0.775       200
weighted avg      0.786     0.775     0.775       200



In [48]:
# For each review, remove obj sentences and compute the SVM
vectorizer3 = CountVectorizer()
classifier_sa2 = svm.SVC()

def get_new_rev(original):
  new_list = []
  for rev in original:
    new_rev = []
    for s in rev:
      vector = vectorizer.transform([sent2string(s)]).toarray()
      if classifier_NB2_subj.predict(vector) == ['subj']: 
        new_rev.append(s)
    new_list.append(new_rev)
  return new_list
              
new_pos = get_new_rev(pos)
new_neg = get_new_rev(neg)

corpus_ = [doc2string(p) for p in new_pos] + [doc2string(n) for n in new_neg]
labels_ = numpy.array([0] * len(new_pos) + [1] * len(new_neg))
train_samples_, test_samples_, train_labels_, test_labels_ = train_test_split(corpus_, labels_, test_size=0.1)

vectors_ = vectorizer3.fit_transform(train_samples_ + test_samples_)
classifier_sa2.fit(vectors_[:len(train_samples_)], train_labels_)
labels_pred_ = classifier_sa2.predict(vectors_[len(train_labels_):])

print(classification_report(test_labels_, labels_pred_, digits=3))
counter = 0
for i in range(len(test_labels_)):
  if (test_labels_[i] - labels_pred_[i]) == 0: counter += 1
print('accuracy: ', counter/ len(test_labels_))


              precision    recall  f1-score   support

           0      0.869     0.795     0.830       117
           1      0.742     0.831     0.784        83

    accuracy                          0.810       200
   macro avg      0.806     0.813     0.807       200
weighted avg      0.816     0.810     0.811       200

accuracy:  0.81


**VADER Baseline**

In [49]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer, VaderConstants

In [50]:
# Analyse complete review
analyzer = SentimentIntensityAnalyzer()
labels_vader = numpy.array([0] * len(neg) + [1] * len(pos))
prediction_val = [analyzer.polarity_scores(doc2string(v)) for v in (pos + neg)]
prediction_labels = [0 if p['pos'] > p['neg'] else 1 for p in prediction_val]

print(classification_report(labels_vader, prediction_labels, digits=3))

              precision    recall  f1-score   support

           0      0.583     0.842     0.689      1000
           1      0.715     0.397     0.511      1000

    accuracy                          0.620      2000
   macro avg      0.649     0.619     0.600      2000
weighted avg      0.649     0.620     0.600      2000



In [51]:
# Analyse each sentence of review, sum sentences contribution as 1
prediction_labels = []

for rev in (pos+neg):
  pos_ = 0
  neg_ = 0
  for sent in rev:
    p = analyzer.polarity_scores(" ".join([w for w in sent]))
    if p['pos'] > p['neg']: pos_ += 1
    else: neg_ += 1
  prediction_labels.append(0 if pos_ > neg_ else 1)
print(classification_report(labels_vader, prediction_labels, digits=3))

# Analyse each sentence of review, sum sentences contribution as 1
prediction_labels = []

for rev in (pos+neg):
  pos_ = 0
  neg_ = 0
  for sent in rev:
    p = analyzer.polarity_scores(" ".join([w for w in sent]))
    if p['pos'] > p['neg']: pos_ += p['pos']
    else: neg_ += p['neg']
  prediction_labels.append(0 if pos_ > neg_ else 1)
print(classification_report(labels, prediction_labels, digits=3))

              precision    recall  f1-score   support

           0      0.698     0.500     0.583      1000
           1      0.611     0.784     0.687      1000

    accuracy                          0.642      2000
   macro avg      0.654     0.642     0.635      2000
weighted avg      0.654     0.642     0.635      2000

              precision    recall  f1-score   support

           0      0.602     0.843     0.702      1000
           1      0.738     0.442     0.553      1000

    accuracy                          0.642      2000
   macro avg      0.670     0.642     0.628      2000
weighted avg      0.670     0.642     0.628      2000

