# Final Assignment

## Sentiment analysis

In [2]:
def vader_output_to_label(vader_output):
    """
    Turn a VADER score dict into a coarse sentiment label.

    Only the 'compound' entry is inspected, e.g. for
    {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215}
    the label is decided by the sign of 0.4215:
    a) compound > 0  -> 'positive'
    b) compound == 0 -> 'neutral'
    c) compound < 0  -> 'negative'

    :param dict vader_output: output dict from vader (must contain 'compound')

    :rtype: str
    :return: 'negative' | 'neutral' | 'positive'
    (falls through and returns None if 'compound' compares as neither,
    e.g. NaN)
    """
    score = vader_output['compound']

    # Guard-clause style: each comparison returns as soon as it matches.
    if score == 0.0:
        return 'neutral'
    if score > 0.0:
        return 'positive'
    if score < 0:
        return 'negative'
    
# Sanity checks: one example per label; only the sign of 'compound' matters.
assert vader_output_to_label(dict(neg=0.0, neu=0.0, pos=1.0, compound=-0.01)) == 'negative'
assert vader_output_to_label(dict(neg=0.0, neu=0.0, pos=1.0, compound=0.0)) == 'neutral'
assert vader_output_to_label(dict(neg=0.0, neu=0.0, pos=1.0, compound=0.01)) == 'positive'

In [29]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
# spaCy pipeline shared by run_vader, which reads tokens, lemmas and
# part-of-speech tags from the documents it produces.
nlp = spacy.load('en_core_web_sm')

# Single VADER analyzer instance reused by run_vader for every scoring call.
vader_model = SentimentIntensityAnalyzer()

def run_vader(textual_unit,
              parts_of_speech_to_consider=None, lemmatize=False,
              verbose=0):
    """
    Run VADER on a textual unit after tokenising it with spaCy.

    The text is parsed with the module-level ``nlp`` pipeline. The selected
    tokens (or their lemmas) are joined with single spaces and the resulting
    string is scored with the module-level ``vader_model``.

    :param str textual_unit: a textual unit, e.g., sentence, sentences (one string)
    :param set parts_of_speech_to_consider:
    -None or empty set: all parts of speech are provided
    -non-empty set: only tokens whose ``token.pos_`` is in the set are provided
    :param bool lemmatize: If True, provide lemmas to VADER instead of words
    :param int verbose: if set to 1 or higher, information is printed
    about input and output

    :rtype: dict
    :return: vader output dict
    """
    doc = nlp(textual_unit)

    input_to_vader = []

    # Iterate the document's tokens directly: the sentence grouping was never
    # used for anything except (incorrectly) the verbose print below.
    for token in doc:
        # A falsy filter (None or empty set) means "keep every token".
        if parts_of_speech_to_consider and token.pos_ not in parts_of_speech_to_consider:
            continue

        to_add = token.text
        if lemmatize:
            lemma = token.lemma_
            # '-PRON-' is a placeholder lemma (emitted for pronouns by
            # spaCy 2.x models); keep the surface form in that case.
            if lemma != '-PRON-':
                to_add = lemma

        input_to_vader.append(to_add)

    scores = vader_model.polarity_scores(' '.join(input_to_vader))

    if verbose >= 1:
        print()
        # Print the full input rather than the leftover loop variable `sent`,
        # which held only the last sentence and was unbound when the text
        # produced no sentences.
        print('INPUT SENTENCE', textual_unit)
        print('INPUT TO VADER', input_to_vader)
        print('VADER OUTPUT', scores)

    return scores

In [30]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [31]:
# Load the tab-separated test set from the notebook's working directory.
test_dataset = pd.read_csv("./sentiment-topic-test.tsv", sep='\t')

In [32]:
# Parallel lists: gold[i] is the reference label and vader_output[i] the
# VADER-derived label for the i-th row of test_dataset.
gold = []
vader_output = []

# Bag of Words, min_df=2
# (The original line ended in a stray dict fragment, `, 'name': ...`, which
# is a SyntaxError and kept the whole cell from running. The vectorizer is
# not used by the VADER evaluation below.)
vectorizer = CountVectorizer(min_df=2,
                             tokenizer=nltk.word_tokenize,
                             stop_words=stopwords.words('english'))

# itertuples() yields (index, col_1, col_2, ...), so row[2] is the second
# column and row[3] the third.
# NOTE(review): assumes those columns hold the sentence text and the gold
# sentiment label respectively - confirm against the TSV header.
for row in test_dataset.itertuples():
    gold_label = row[3]
    sentence = row[2]
    gold.append(str(gold_label))
    scores = run_vader(str(sentence), lemmatize=False)
    vader_output.append(vader_output_to_label(scores))

In [27]:
# Text report of per-class precision/recall/F1 for the VADER labels
# (second argument) measured against the gold labels (first argument).
report = classification_report(gold,vader_output)

In [33]:
# Show the gold labels, the VADER labels and the report, one per line.
print(gold, vader_output, report, sep='\n')

['negative', 'neutral', 'positive', 'positive', 'negative', 'neutral', 'negative', 'negative', 'neutral', 'positive']
['positive', 'positive', 'positive', 'neutral', 'positive', 'neutral', 'positive', 'neutral', 'negative', 'negative']
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         4
     neutral       0.33      0.33      0.33         3
    positive       0.20      0.33      0.25         3

    accuracy                           0.20        10
   macro avg       0.18      0.22      0.19        10
weighted avg       0.16      0.20      0.17        10

