In [15]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the 'vader_lexicon' NLTK package, which the VADER sentiment
# analyzer imported above relies on.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kalleleander/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
def analyze_sentiment_vader(text):
    """Return the VADER 'compound' polarity score for ``text``.

    A fresh ``SentimentIntensityAnalyzer`` is constructed on every call; only
    the ``'compound'`` entry of its ``polarity_scores`` result is returned.
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    return scores['compound']

In [13]:
# Location of the GloVe embeddings file (here the 100-dimensional
# 'glove.6B.100d.txt'). NOTE(review): this is an absolute, machine-specific
# path; it must be adjusted before the notebook can run elsewhere.
file_path_glove = '/Users/kalleleander/Documents/DTU/Bachelor/Semester 2/Signals & Data/Assignments/Exam assignment/files_light/glove.6B.100d.txt'

def load_glove_embeddings(file_path, embedding_dim=100):
    """Load whitespace-separated word vectors from a GloVe-style text file.

    Each non-empty line is expected to hold a token followed by its vector
    components.

    Parameters
    ----------
    file_path : str or path-like
        Text file to read (opened as UTF-8).
    embedding_dim : int, default 100
        Expected number of vector components per line. The line is split from
        the right at most ``embedding_dim`` times, so a token that itself
        contains whitespace is kept intact instead of being mistaken for
        vector components. Lines with fewer components than ``embedding_dim``
        are still accepted: the first field is the token and all remaining
        fields form the vector.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D ``numpy.ndarray`` of dtype float32.
        If a token occurs more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be converted to float32.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split from the right so everything left of the last
            # `embedding_dim` fields is treated as the token.
            values = line.rsplit(None, embedding_dim)
            if len(values) < 2:
                # Blank line or a token without any vector: nothing to store.
                continue
            word = values[0]
            embeddings[word] = np.array(values[1:], dtype='float32')
    return embeddings

In [10]:
# Location of the VADER lexicon text file read by load_vader_lexicon().
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
file_path_vader = '/Users/kalleleander/nltk_data/sentiment/vader_lexicon/vader_lexicon.txt'

def load_vader_lexicon(file_path=None):
    """Read a tab-separated sentiment lexicon into a dict.

    Every line with at least two tab-separated fields contributes one entry:
    the first field is the word or phrase, the second is parsed as its float
    sentiment score. Any further fields are ignored, and lines with fewer
    than two fields (including blank lines) are skipped.

    Parameters
    ----------
    file_path : str or path-like, optional
        Lexicon file to read (opened as UTF-8). When omitted, the module-level
        ``file_path_vader`` is used, which matches the original no-argument
        behaviour.

    Returns
    -------
    dict
        Maps word/phrase (str) to sentiment score (float). If a word occurs
        more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If the second field of a line cannot be parsed as a float.
    """
    if file_path is None:
        # Resolved at call time so the default follows the notebook's global.
        file_path = file_path_vader

    lexicon = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                word = parts[0]  # Word or phrase
                lexicon[word] = float(parts[1])  # Sentiment score
    return lexicon

In [11]:
# Cosine similarity function
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Computed as ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector
    has zero norm the ratio is undefined (0/0 would give NaN and a NumPy
    warning), so 0.0 is returned instead.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

# Sentiment scoring with TF-IDF weighting and embeddings
def compute_sentiment_with_tfidf(sentence, lexicon, embeddings, tfidf_vectorizer):
    """Score a sentence by averaging TF-IDF-weighted lexicon sentiment values.

    The sentence is lower-cased and split on whitespace. For each word:

    * if the word is in ``lexicon``, its lexicon score is used;
    * otherwise, if the word is in ``embeddings``, the score of the lexicon
      word whose embedding has the highest cosine similarity to it is used
      (0 if no lexicon word has an embedding or none has similarity above -1);
    * otherwise the word is ignored.

    Each score is multiplied by the word's TF-IDF weight, falling back to a
    weight of 1 when the vectorizer produced no feature for that exact word.

    Parameters
    ----------
    sentence : str
        Text to score.
    lexicon : dict
        Maps words to numeric sentiment scores.
    embeddings : dict
        Maps words to embedding vectors.
    tfidf_vectorizer : object
        Must provide ``fit_transform`` and ``get_feature_names_out``. It is
        re-fitted on ``[sentence]`` on every call, so any previous fit is
        replaced and the weights reflect this single sentence only.

    Returns
    -------
    float or int
        Mean of the weighted scores, or 0 when no word could be scored
        (including an empty sentence). The result is not clipped: its range
        depends on the magnitude of the lexicon scores and weights.
    """
    words = sentence.lower().split()  # Tokenize the sentence

    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([sentence])
    except ValueError as exc:
        # The vectorizer signals "empty vocabulary" when the sentence has no
        # usable tokens; treat that as "no TF-IDF weights" instead of crashing
        # so the default weight / zero fallback below can apply.
        if "empty vocabulary" not in str(exc):
            raise
        word_tfidf = {}
    else:
        feature_names = tfidf_vectorizer.get_feature_names_out()
        word_tfidf = {word: tfidf_matrix[0, idx] for idx, word in enumerate(feature_names)}

    # (embedding, score) pairs for lexicon words that have an embedding.
    # Built lazily, once per call, instead of once per out-of-lexicon word.
    lexicon_candidates = None
    sentiment_scores = []

    for word in words:
        tfidf_weight = word_tfidf.get(word, 1)  # Default to 1 if not in TF-IDF features

        if word in lexicon:  # Direct match in lexicon
            sentiment_scores.append(lexicon[word] * tfidf_weight)
        elif word in embeddings:  # No match in lexicon, fall back to nearest lexicon word
            if lexicon_candidates is None:
                lexicon_candidates = [
                    (embeddings[lex_word], lex_score)
                    for lex_word, lex_score in lexicon.items()
                    if lex_word in embeddings
                ]

            word_vector = embeddings[word]
            best_similarity = -1
            sentiment_score = 0

            # Strict '>' keeps the first lexicon entry among equally similar ones.
            for lex_vector, lex_score in lexicon_candidates:
                similarity = cosine_similarity(word_vector, lex_vector)
                if similarity > best_similarity:
                    best_similarity = similarity
                    sentiment_score = lex_score

            sentiment_scores.append(sentiment_score * tfidf_weight)

    # Aggregate by averaging; 0 when nothing could be scored
    return np.mean(sentiment_scores) if sentiment_scores else 0

In [16]:
# Example usage: load the GloVe embeddings and the VADER lexicon, then score
# one sample sentence. The names bound here are notebook-level globals.
if __name__ == "__main__":
    # Load resources (both loaders read from the paths defined in earlier cells)
    glove_path = file_path_glove  # Adjust path to GloVe embeddings file
    embeddings = load_glove_embeddings(glove_path)
    lexicon = load_vader_lexicon()

    # Create TF-IDF vectorizer (default settings; it is fitted inside
    # compute_sentiment_with_tfidf on the single input sentence)
    tfidf_vectorizer = TfidfVectorizer()

    # Input sentence
    sentence = "good excellent horrible amazing bad terrible neutral"
    
    # Compute sentiment and print the resulting score
    sentiment = compute_sentiment_with_tfidf(sentence, lexicon, embeddings, tfidf_vectorizer)
    print("Sentiment Score:", sentiment)

Sentiment Score: 0.08639187954496616
