In [56]:
import os 
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# regular expression import
import re

# uni-code library
import unicodedata

# natural language toolkit library/modules
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string

def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()

    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]

    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))

    # Split words in string.
    words = word_tokenize(string)
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

def clean(text):
    '''
    This function combines the above steps and added extra stop words to clean text
    '''
    return remove_stopwords(lemmatize(basic_clean(text)), extra_words = ['dont', 'cant', 'im', 'ive', 'ill', 'te', 'youre', 'wan', 'na', 'wa', ''])




In [57]:
def TFIDFFromCSV(csvName):
    df = pd.read_csv(csvName)
    df.dropna(inplace=True)
    texts = df['Lyrics']

        
    cleaned_texts = [clean(text) for text in texts]

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_texts)

    # Step 3: Get feature names (words)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Step 4: Get IDF values
    idf_values = tfidf_vectorizer.idf_

    # Create a dictionary mapping each word to its IDF value
    word_idf = dict(zip(feature_names, idf_values))

    # Step 5: Generate Word Cloud
    # Convert TF-IDF matrix to a dictionary
    word_tfidf = dict(zip(feature_names, tfidf_matrix.mean(axis=0).tolist()[0]))
    

    stop_words = set(stopwords.words('english'))

    lyrics_trimmed = []
    for key, value in word_tfidf.items():
        if value <= 0.001:
            lyrics_trimmed.append(key)

    trimmed_string = set(lyrics_trimmed)
    

    filtered_trimmed_lyrics = [word.lower() for word in word_tfidf if word.isalpha() and word.lower() not in stop_words and word.lower() not in trimmed_string]

    # Load the sentiment analysis pipeline
    sentiment_analyzer = SentimentIntensityAnalyzer()
    return sentiment_analyzer.polarity_scores(' '.join(filtered_trimmed_lyrics))

In [61]:
def TFIDFFromSong(lyrics):
    
    texts = lyrics.split()

    cleaned_texts = [clean(text) for text in texts]
    
    if not any(len(s) > 0 for s in cleaned_texts):
        return

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_texts)

    # Step 3: Get feature names (words)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Step 4: Get IDF values
    idf_values = tfidf_vectorizer.idf_

    # Create a dictionary mapping each word to its IDF value
    word_idf = dict(zip(feature_names, idf_values))

    # Step 5: Generate Word Cloud
    # Convert TF-IDF matrix to a dictionary
    word_tfidf = dict(zip(feature_names, tfidf_matrix.mean(axis=0).tolist()[0]))
    

    stop_words = set(stopwords.words('english'))

    lyrics_trimmed = []
    for key, value in word_tfidf.items():
        if value <= 0.001:
            lyrics_trimmed.append(key)

    trimmed_string = set(lyrics_trimmed)
    

    filtered_trimmed_lyrics = [word.lower() for word in word_tfidf if word.isalpha() and word.lower() not in stop_words and word.lower() not in trimmed_string]

    # Load the sentiment analysis pipeline
    sentiment_analyzer = SentimentIntensityAnalyzer()
    return sentiment_analyzer.polarity_scores(' '.join(filtered_trimmed_lyrics))

In [63]:
def TFIDFFromCSVIndividualSongs(CSVName):
    df = pd.read_csv(CSVName, index_col=0)
    df.dropna(inplace=True)
    df["Sentiment"] = df["Lyrics"].apply(TFIDFFromSong)
    df.to_csv(CSVName)

In [65]:
allCSVNames = ['Danny Brown Lyrics.csv', 'Interpol Lyrics.csv', 'Charli XCX Lyrics.csv', 'The Strokes Lyrics.csv', 'Mac Demarco Lyrics.csv', 'Radiohead Lyrics.csv', 'Jack Johnson Lyrics.csv', 'Sufjan Stevens Lyrics.csv', 'Lorde Lyrics.csv', 'The Decemberists Lyrics.csv', 'The National Lyrics.csv', 'Cage The Elephant Lyrics.csv', 'Neutral Milk Hotel Album Lyrics.csv', 'Mac Miller Lyrics.csv', 'Lana Del Rae Lyrics.csv', 'Silver Jews Lyrics.csv', 'Kacey Musgraves Lyrics.csv', 'Haim Lyrics.csv', 'Pavement Lyrics.csv', 'Lake Street Dive Lyrics.csv', 'The 1975 Lyrics.csv', 'Vampire Weekend Lyrics.csv', 'The Lumineers Lyrics.csv', 'Car Seat Headrest Lyrics.csv', 'Alex G Lyrics.csv', 'Carly Rae Jepson Lyrics.csv', 'Alvvays Lyrics.csv', 'Kendrick Lamar Lyrics.csv', 'Brockhampton Lyrics.csv', 'Chastity Belt Lyrics.csv', 'Big Theif Lyrics.csv', 'Adrianne Lenker Lyrics.csv', 'Frank Ocean Lyrics.csv', 'The Smiths Lyrics.csv', 'Death Grips Lyrics.csv', '100 Gecs Lyrics.csv', 'Anderson .Paak Lyrics.csv', 'Vampire Weekend Lyrics.csv', 'New Order Lyrics.csv']

for i in allCSVNames:
    TFIDFFromCSVIndividualSongs(i)
    print(i)

Danny Brown Lyrics.csv
Interpol Lyrics.csv
Charli XCX Lyrics.csv
The Strokes Lyrics.csv
Mac Demarco Lyrics.csv
Radiohead Lyrics.csv
Jack Johnson Lyrics.csv
Sufjan Stevens Lyrics.csv
Lorde Lyrics.csv
The Decemberists Lyrics.csv
The National Lyrics.csv
Cage The Elephant Lyrics.csv
Neutral Milk Hotel Album Lyrics.csv
Mac Miller Lyrics.csv
Lana Del Rae Lyrics.csv
Silver Jews Lyrics.csv
Kacey Musgraves Lyrics.csv
Haim Lyrics.csv
Pavement Lyrics.csv
Lake Street Dive Lyrics.csv
The 1975 Lyrics.csv
Vampire Weekend Lyrics.csv
The Lumineers Lyrics.csv
Car Seat Headrest Lyrics.csv
Alex G Lyrics.csv
Carly Rae Jepson Lyrics.csv
Alvvays Lyrics.csv
Kendrick Lamar Lyrics.csv
Brockhampton Lyrics.csv
Chastity Belt Lyrics.csv
Big Theif Lyrics.csv
Adrianne Lenker Lyrics.csv
Frank Ocean Lyrics.csv
The Smiths Lyrics.csv
Death Grips Lyrics.csv
100 Gecs Lyrics.csv
Anderson .Paak Lyrics.csv
Vampire Weekend Lyrics.csv
New Order Lyrics.csv


In [78]:
import matplotlib.pyplot as plt
from datetime import datetime
import ast



def plot_sentiment_over_time(csv, artist_name):
    df = pd.read_csv(csv, index_col=0)
    df.dropna(inplace=True)

    df['Release Date Object'] = df["Release Date"].apply(lambda x: datetime(year=ast.literal_eval(x)['year'], month=ast.literal_eval(x)['month'], day=ast.literal_eval(x)['day']))
    df["Sentiment"] = df['Sentiment'].apply(lambda x: ast.literal_eval(x))

    df.sort_values(by="Release Date Object")
    
    positive_sentiments = []
    negative_sentiments = []

    for index, row in df.iterrows():
        sentiment = row['Sentiment']
        positive_sentiments.append(sentiment['pos'])
        negative_sentiments.append(sentiment['neg'])

    # Plot the sentiment over time
    plt.figure(figsize=(10, 6))
    plt.plot(positive_sentiments, label='Positive Sentiment', color='blue')
    plt.plot(negative_sentiments, label='Negative Sentiment', color='red')
    plt.xlabel('Release Date')
    plt.ylabel('Sentiment Score')
    plt.title(f'Sentiment Of {artist_name} Lyrics Over Time')
    plt.legend()

    # Annotate the graph with first and last song names and release dates
    first_song_name = df.index[0]
    last_song_name = df.index[-1]
    first_release_date = df['Release Date Object'].min().strftime("%m/%d/%Y")  # Assuming your index contains release dates
    last_release_date = df['Release Date Object'].max().strftime("%m/%d/%Y")

    plt.xticks([0, len(df) - 1], [f'{first_song_name}\n{first_release_date}', f'{last_song_name}\n{last_release_date}'])

    plt.ylim(0.0, 0.8)
    plt.tight_layout()
    plt.savefig(f'{artist_name}_Sentiment_Graph_2.0.png')
    plt.close()

In [79]:
for i in allCSVNames:
    tempname = ' '.join(i.split()[:-1])
    plot_sentiment_over_time(i, tempname)