In [47]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [86]:
def regex_sub(string):
    """Remove bracketed tags and collapse non-letter runs to single spaces.

    Parameters
    ----------
    string : str
        Raw lyric text, possibly containing section tags such as
        "[Intro]" or "[Verse 1: Name]".

    Returns
    -------
    str
        The text with every "[...]" span deleted (shortest match, not
        crossing newlines) and every run of characters outside A-Z / a-z
        replaced by one space. Leading/trailing spaces are not stripped.
    """
    # Raw strings: "\[" in a plain literal is an invalid escape sequence that
    # newer Python versions warn about. The pattern itself is unchanged.
    tagless = re.sub(r"\[.*?\]", "", string)
    return re.sub(r"[^A-Za-z]+", " ", tagless)

def tokenize_and_stops(string):
    """Tokenize text with NLTK and drop English stop words.

    The stop-word comparison is done on the lower-cased token, but the
    tokens that survive are returned with their original casing.

    Parameters
    ----------
    string : str
        Text to split into word tokens.

    Returns
    -------
    list of str
        Tokens whose lower-cased form is not in NLTK's English stop-word list.
    """
    english_stops = set(stopwords.words('english'))
    return [
        token
        for token in word_tokenize(string)
        if token.lower() not in english_stops
    ]

def stem(string_list):
    """Porter-stem every word and join the stems into one string.

    Parameters
    ----------
    string_list : iterable of str
        Words to stem, in order.

    Returns
    -------
    str
        The stemmed words separated by single spaces (empty string when
        no words are given).
    """
    porter = PorterStemmer()
    return ' '.join(porter.stem(token) for token in string_list)

def text_processing(string):
    """Run the full lyric clean-up pipeline on one value.

    Steps, in order: ``regex_sub`` (strip tags / non-letters),
    ``tokenize_and_stops`` (tokenize, remove stop words), ``stem``
    (stem and re-join).

    Parameters
    ----------
    string : object
        The lyric text. Anything that is not a ``str`` (e.g. ``None`` or a
        NaN float for a missing lyric) is treated as missing.

    Returns
    -------
    str or None
        The processed lyric string, or ``None`` when the input is not a string.
    """
    # isinstance instead of an exact type comparison, so str subclasses
    # (e.g. numpy.str_) are processed rather than silently dropped.
    if not isinstance(string, str):
        return None

    text_only = regex_sub(string)
    filtered_lyrics = tokenize_and_stops(text_only)
    return stem(filtered_lyrics)

In [20]:
# Load the song table and drop its first column by position
# (presumably an index column saved by an earlier export; confirm).
data = pd.read_csv("song_info_lyrics.csv").iloc[:, 1:]
data.head()

Unnamed: 0,year,song,artist,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,lyrics
0,2013.0,Thrift Shop,Macklemore & Ryan Lewis Featuring Wanz,67.0,0.781,0.526,6.0,-6.985,0.0,0.293,0.0619,0.0,0.0457,0.662,94.992,235613.0,4.0,"[Intro]Hey, Macklemore, can we go thrift shopp..."
1,2013.0,Blurred Lines,Robin Thicke Featuring T.I. + Pharrell,34.0,0.854,0.6,7.0,-4.755,1.0,0.0411,0.0035,8e-06,0.0758,0.835,119.996,263827.0,4.0,[Intro: Pharrell]Everybody get upEverybody get...
2,2013.0,Radioactive,Imagine Dragons,54.0,0.461,0.78,9.0,-3.809,1.0,0.0618,0.102,7.2e-05,0.667,0.23,136.26,186813.0,4.0,[Verse 1: Dan Reynolds]I'm wakin' up to ash an...
3,2013.0,Harlem Shake,Baauer,57.0,0.452,0.794,0.0,-5.151,1.0,0.0483,0.0111,0.00182,0.416,0.282,137.825,196664.0,4.0,"[Pre-Chorus]Con los terroristas, -tas, -tas, -..."
4,2013.0,Can't Hold Us,Macklemore & Ryan Lewis Featuring Ray Dalton,63.0,0.633,0.927,2.0,-4.468,1.0,0.0839,0.0267,0.0,0.0986,0.88,146.097,258432.0,4.0,"[Intro: Macklemore]Hey, hey, heyGood to see yo..."


In [87]:
# Processing all lyrics to prepare for sentiment analysis.
# One entry per row of data.lyrics, in row order; entries that are not
# strings come back from text_processing as None.
processed_lyrics = [text_processing(lyric) for lyric in data.lyrics]

In [83]:
# Performing sentiment analysis: one VADER compound score per processed
# lyric, with None kept as a placeholder wherever the lyric is not a string.
# NOTE(review): the text scored here has already been stemmed and had stop
# words and punctuation removed; VADER is normally applied to raw text, so
# this preprocessing likely weakens its scores. Confirm this is intended.
sent_analyzer = SentimentIntensityAnalyzer()

sentiment_scores = [
    sent_analyzer.polarity_scores(lyric)['compound'] if type(lyric) == str else None
    for lyric in processed_lyrics
]

In [92]:
# Turn each compound score into a coarse label:
#   score >= 0.05  -> 'pos'
#   score <= -0.05 -> 'neg'
#   otherwise      -> 'neu'
# Missing scores (None) stay None so the list lines up with sentiment_scores.
sentiments = []
for score in sentiment_scores:
    # Identity test: `== None` goes through __eq__, which is not reliable
    # for every type; `is None` is the idiomatic check.
    if score is None:
        sentiments.append(None)
    elif score >= 0.05:
        sentiments.append('pos')
    elif score <= -0.05:
        sentiments.append('neg')
    else:
        sentiments.append('neu')

In [95]:
# Attach the per-song compound score and its label as two new columns.
# Both lists must have one entry per row of data.
data = data.assign(sent_score=sentiment_scores, sentiment=sentiments)

In [97]:
# Write the enriched table to disk. index=True is the pandas default and is
# spelled out here because it stores the row index as an extra, unnamed
# first column in the CSV.
data.to_csv("full_data.csv", index=True)