# Final Assignment

## Sentiment analysis

In [2]:
def vader_output_to_label(vader_output):
    """
    Turn a VADER score dict into a coarse sentiment label.

    Only the 'compound' entry of the dict is inspected, e.g. for
    {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215}
    the sign of the compound score decides the label:
    a) positive float -> 'positive'
    b) 0.0 -> 'neutral'
    c) negative float -> 'negative'

    :param dict vader_output: output dict from vader (must contain 'compound')

    :rtype: str
    :return: 'negative' | 'neutral' | 'positive'
    """
    score = vader_output['compound']

    # Guard clauses, one per documented case.
    if score < 0:
        return 'negative'
    if score == 0.0:
        return 'neutral'
    if score > 0.0:
        return 'positive'

    # Reached only when none of the comparisons holds (e.g. NaN):
    # no label is produced in that case.
    return None
    
# Sanity check: one example per sign of the compound score
# (zero, slightly positive, slightly negative).
assert [
    vader_output_to_label({'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': score})
    for score in (0.0, 0.01, -0.01)
] == ['neutral', 'positive', 'negative']

In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
# Shared NLP resources: run_vader reads both of these module-level names.
nlp = spacy.load('en_core_web_sm')
vader_model = SentimentIntensityAnalyzer()

# Empty lists; nothing in the visible cells fills them
# (presumably populated by a later evaluation cell -- TODO confirm).
tweets, all_vader_output, gold = [], [], []

# settings (to change for different experiments)
# NOTE(review): presumably passed to run_vader as `lemmatize` and
# `parts_of_speech_to_consider` -- verify against the cells that call it.
to_lemmatize = True
pos = set()

def run_vader(textual_unit,
              parts_of_speech_to_consider=None, lemmatize=False,
              verbose=0):
    """
    Run VADER on a textual unit that is first processed with spaCy.

    The text is parsed with the module-level ``nlp`` pipeline. The tokens of
    every sentence in ``doc.sents`` are optionally filtered by part of speech
    and optionally replaced by their lemma; the resulting strings are joined
    with single spaces and scored by the module-level ``vader_model``.

    :param str textual_unit: a textual unit, e.g., sentence, sentences (one string)
    :param set parts_of_speech_to_consider:
    -None or empty set: all parts of speech are provided
    -non-empty set: only tokens whose ``pos_`` is in the set are provided
    :param bool lemmatize: If True, provide lemmas to VADER instead of words
    (a token whose lemma is the '-PRON-' placeholder keeps its original text)
    :param int verbose: if set to 1 or higher, information is printed
    about input and output

    :rtype: dict
    :return: vader output dict
    """
    doc = nlp(textual_unit)

    input_to_vader = []

    for sent in doc.sents:
        for token in sent:
            # A None/empty filter means "keep every part of speech".
            if (parts_of_speech_to_consider
                    and token.pos_ not in parts_of_speech_to_consider):
                continue

            to_add = token.text

            # '-PRON-' is a placeholder lemma, not a real word, so the
            # surface form is kept for such tokens.
            if lemmatize and token.lemma_ != '-PRON-':
                to_add = token.lemma_

            input_to_vader.append(to_add)

    scores = vader_model.polarity_scores(' '.join(input_to_vader))

    if verbose >= 1:
        print()
        # Print the complete input instead of the sentence loop variable:
        # that variable only holds the last sentence and is undefined
        # when the document contains no sentences at all.
        print('INPUT SENTENCE', textual_unit)
        print('INPUT TO VADER', input_to_vader)
        print('VADER OUTPUT', scores)

    return scores

In [4]:
import pandas as pd

In [5]:
# Load the tab-separated test set from the notebook's working directory.
test_dataset = pd.read_csv(
    "./sentiment-topic-test.tsv",
    sep='\t',
)

In [6]:
# Plain-text dump of the whole frame; wide columns are wrapped and long
# text cells are truncated with "..." in the recorded output.
print(test_dataset)

   sentence id                                               text sentiment  \
0            0  I wouldn't be caught dead watching the NFL if ...  negative   
1            1  Chris O'Donnell stated that while filming for ...   neutral   
2            2  The whole game was a rollercoaster ride, but L...  positive   
3            3  Zendaya slayed in Dune 2, as she does in all h...  positive   
4            4  While my favorite player was playing this matc...  negative   
5            5  My uncle's brother's neighbor's cat's veterina...   neutral   
6            6  He said that The Great Gatsby is the best nove...  negative   
7            7  I could not look away from this train wrck of ...  negative   
8            8  The film Everything Everywhere All At Once fol...   neutral   
9            9  I just finished reading pride and prejudice wh...  positive   

    topic  
0  sports  
1   movie  
2  sports  
3   movie  
4  sports  
5    book  
6    book  
7   movie  
8   movie  
9    book 