In [1]:
%matplotlib inline

# Importing the necessary packages for analyzing our data.

import pandas as pd
import string
import seaborn as sns
import matplotlib.pyplot as plt
import collections
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

In [11]:
# Load dataset (a csv file containing information about all the songs) into variable 'lyrics'.

lyrics = pd.read_csv("taylor_swift_lyrics_2006-2020_all.txt")

# Inspect the first 5 rows to see what the data looks like.

lyrics.head()

Unnamed: 0,album_name,track_title,track_n,lyric,line
0,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1
1,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2
2,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3
3,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4
4,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5


In [12]:
# Print the information pertaining to the dataset (ie. number of rows/columns, column names, data type).
# 'Year of release' is noticeably absent from data set.

lyrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8358 entries, 0 to 8357
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   album_name   8358 non-null   object
 1   track_title  8358 non-null   object
 2   track_n      8358 non-null   int64 
 3   lyric        8358 non-null   object
 4   line         8358 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 326.6+ KB


In [13]:
# Print a list of all the albums in this data set.

print(lyrics.album_name.unique())

['Taylor Swift' 'Fearless (Taylor’s Version)' 'Speak Now (Deluxe)'
 'Red (Deluxe Edition)' '1989 (Deluxe)' 'reputation' 'Lover'
 'folklore (deluxe version)' 'evermore (deluxe version)']


In [14]:
# Defining a function to map the name of the album to the year it was released.

def album_release(row):  
    if row['album_name'] == 'Taylor Swift':
        return '2006'
    elif row['album_name'] == 'Fearless (Taylor’s Version)':
        return '2008'
    elif row['album_name'] == 'Speak Now (Deluxe)':
        return '2010'
    elif row['album_name'] == 'Red (Deluxe Edition)':
        return '2012'
    elif row['album_name'] == '1989 (Deluxe)':
        return '2014'
    elif row['album_name'] == 'reputation':
        return '2017'
    elif row['album_name'] == 'Lover':
        return '2019'
    elif row['album_name'] == 'evermore (deluxe version)':
        return '2020'
    elif row['album_name'] == 'folklore (deluxe version)':
        return '2021'
    elif 'midnights' in row['album_name']:
        return '2022'
    
    return 'No Date'


# Applying the album release function to every row of the data set.

lyrics['album_year'] = lyrics.apply(lambda row: album_release(row), axis=1)

# Inspect the first five rows of the data set again to make sure the corresponding years
# were successfully added.

lyrics.head()

Unnamed: 0,album_name,track_title,track_n,lyric,line,album_year
0,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1,2006
1,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2,2006
2,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3,2006
3,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4,2006
4,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5,2006


In [15]:
# Convert lyrics to lowercase so that all occurances of the same word are considered the same, regardless of captilization.
# Save the result in a new data column called 'clean_lyric'.

lyrics['clean_lyric'] = lyrics['lyric'].str.lower()

# Remove punctuation as well.

lyrics['clean_lyric'] = lyrics['clean_lyric'].str.replace('[^\w\s]','')

# Print the first five lines again to confirm correct changes.

lyrics.head()

Unnamed: 0,album_name,track_title,track_n,lyric,line,album_year,clean_lyric
0,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1,2006,he said the way my blue eyes shined
1,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2,2006,put those georgia stars to shame that night
2,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3,2006,"i said, ""that's a lie"""
3,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4,2006,just a boy in a chevy truck
4,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5,2006,that had a tendency of gettin' stuck


In [16]:
# Create a list of "stop" words to remove.  These don't hold any value for our analysis.

stop = ['the', 'a', 'this', 'that', 'to', 'is', 'am', 'was', 'were', 'be', 'being', 'been']


# Make a list of words with `.split()`.
# Remove all the words in our list.
# Join the words back together into a string.

lyrics['clean_lyric'] = lyrics['clean_lyric'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [17]:
# Create a new column to reflect if the lyrics contain midnight

lyrics['midnight'] = lyrics['clean_lyric'].str.contains('midnight')

# Print the number of times the word "midnight" occurred in the data set.

sum(lyrics['midnight'])

9

In [18]:
# Making lists of words that are categorically similar, to see if they are referenced in the songs.

night = ['night', 'midnight', 'dawn', 'dusk', 'evening', 'late', 'dark', '1am', '2am', '3am', '4am']
day = ['day', 'morning', 'light', 'sun', 'dawn', 'noon', 'golden', 'bright']
time = ['today', 'tomorrow', 'yesterday']

# Regex string for each list to help with the searching.
night_regex = "|".join(night)
day_regex = "|".join(day)
time_regex = "|".join(time)

# Creating a new column for each of the categories of words above.
lyrics['night'] = lyrics['clean_lyric'].str.contains(night_regex)
lyrics['day'] = lyrics['clean_lyric'].str.contains(day_regex)
lyrics['time'] = lyrics['clean_lyric'].str.contains(time_regex)

# Count the number of times a word from any of those three categories appears in the lyrics.
night_count = sum(lyrics['night'])
day_count = sum(lyrics['day'])
time_count = sum(lyrics['time'])

# Print the counts.

print("night words: ", night_count)
print("day words: ", day_count)
print("time words: ", time_count)

# Inspect the first 5 rows again.
lyrics.head()

night words:  240
day words:  363
time words:  35


Unnamed: 0,album_name,track_title,track_n,lyric,line,album_year,clean_lyric,midnight,night,day,time
0,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1,2006,he said way my blue eyes shined,False,False,False,False
1,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2,2006,put those georgia stars shame night,False,True,False,False
2,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3,2006,"i said, ""that's lie""",False,False,False,False
3,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4,2006,just boy in chevy truck,False,False,False,False
4,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5,2006,had tendency of gettin' stuck,False,False,False,False
