# Exploration spark: "What's the most cliche song ever written? How do we even measure cliche-ness?"

In [1]:
import pandas as pd
import numpy as np

In [2]:
np.__version__

'1.19.5'

# Pull in data (prepped in other notebook)

In [3]:
df = pd.read_csv('./data/Songs with metadata and lyrics.csv')

In [4]:
df.head()

Unnamed: 0,SongID,Performer,Song,spotify_genre,spotify_track_preview_url,spotify_track_explicit,spotify_track_album,WeekID,Week Position,Peak Position,Lyrics
0,...Baby One More TimeBritney Spears,Britney Spears,...Baby One More Time,"['dance pop', 'pop', 'post-teen pop']",https://p.scdn.co/mp3-preview/da2134a161f1cb34...,False,...Baby One More Time (Digital Deluxe Version),5/29/1999,31.0,1.0,"[Intro]\nOh baby, baby\nOh baby, baby\n\n[Vers..."
1,...Ready For It?Taylor Swift,Taylor Swift,...Ready For It?,"['pop', 'post-teen pop']",,False,"{'album_type': 'album', 'artists': [{'external...",12/2/2017,18.0,4.0,[Verse 1]\nKnew he was a killer first time tha...
2,'03 Bonnie & ClydeJay-Z Featuring Beyonce Knowles,Jay-Z Featuring Beyonce Knowles,'03 Bonnie & Clyde,"['east coast hip hop', 'hip hop', 'pop rap', '...",,True,The Blueprint 2 The Gift & The Curse,12/28/2002,4.0,4.0,"[Intro: Jay Z]\nUhh, uhh, uhh\nYou ready, B?\n..."
3,'65 Love AffairPaul Davis,Paul Davis,'65 Love Affair,"['album rock', 'bubblegum pop', 'country rock'...",https://p.scdn.co/mp3-preview/a701445830ecacfb...,False,Radio Hits Of the '80s,6/12/1982,18.0,6.0,'65 love affair\nPaul Davis\n\nI was a car hop...
4,'TilThe Angels,The Angels,'Til,"['blues rock', 'garage rock', 'modern blues ro...",https://p.scdn.co/mp3-preview/bf264f14124ddf08...,False,Caprice! The Story Behind Gerry Granahan's Cap...,1/20/1962,38.0,14.0,"[Intro]\nYeah, yeah, yeah\nRap diablo\nYeah, y..."


In [5]:
df[df.Song == 'Our Song'].Lyrics.iloc[0]

'[Verse 1]\nI was riding shotgun with my hair undone\nIn the front seat of his car\nHe\'s got a one-hand feel on the steering wheel\nThe other on my heart\nI look around, turn the radio down\nHe says, "Baby, is something wrong?"\nI say, "Nothing, I was just thinking\nHow we don\'t have a song"\nAnd he says\n\n[Chorus]\nOur song is the slamming screen door\nSneakin\' out late, tapping on your window\nWhen we\'re on the phone and you talk real slow\n\'Cause it\'s late and your mama don\'t know\nOur song is the way you laugh\nThe first date: "Man, I didn\'t kiss her, and I should have"\nAnd when I got home \'fore I said amen\nAsking God if he could play it again\n\n[Verse 2]\nI was walking up the front porch steps\nAfter everything that day\nHad gone all wrong and been trampled on\nAnd lost and thrown away\nGot to the hallway, well on my way\nTo my lovin\' bed\nI almost didn\'t notice all the roses\nAnd the note that said\n\n[Chorus]\nOur song is the slamming screen door\nSneakin\' out la

# Simple definition: "cliche" is when write lines very similar to lines published in previous years.

## CLICHE->SONGS: choose a cliche (eg "I love you", "I've never felt this way before", "you're my everything"), then try to identify which songs contain it.

## SONGS->analyze->CLICHES: identify cliches from the songs

## Simplest: *word-for-word* identical

In [6]:
# using title as proxy for a single line here (lyrics too big to view)
df[df.Song.str.contains('i love you', case=False)] 

Unnamed: 0,SongID,Performer,Song,spotify_genre,spotify_track_preview_url,spotify_track_explicit,spotify_track_album,WeekID,Week Position,Peak Position,Lyrics
460,Baby I Love YouAretha Franklin,Aretha Franklin,Baby I Love You,"['classic soul', 'jazz blues', 'memphis soul',...",https://p.scdn.co/mp3-preview/39a582336820d9ae...,False,Aretha Arrives,9/16/1967,5.0,4.0,If you want my lovin'\nIf you really do\nDon't...
461,"Baby I Love Your Way (From ""Reality Bites"")Big...",Big Mountain,"Baby I Love Your Way (From ""Reality Bites"")",['reggae'],,,,8/27/1994,34.0,6.0,PROLOGUE\n\nTwo tires fly. Two wail.\n\nA bamb...
479,"Baby, I Love YouAndy Kim",Andy Kim,"Baby, I Love You","['brill building pop', 'bubblegum pop']",,False,Baby I Love You,9/6/1969,25.0,9.0,Have I ever told you\nHow good it feels to hol...
480,"Baby, I Love Your Way/Freebird MedleyWill To P...",Will To Power,"Baby, I Love Your Way/Freebird Medley",['freestyle'],,,,2/18/1989,91.0,1.0,Shadows grow so long before my eyes\nAnd they'...
481,"Baby, I Love Your WayPeter Frampton",Peter Frampton,"Baby, I Love Your Way","['album rock', 'art rock', 'blues rock', 'brit...",,False,Frampton Comes Alive! (Deluxe Edition),9/18/1976,30.0,12.0,[Verse 1]\nShadows grow so long before my eyes...
595,Because I Love You (The Postman Song)Stevie B,Stevie B,Because I Love You (The Postman Song),['freestyle'],https://p.scdn.co/mp3-preview/5894932928df6362...,False,Valentine's Day Love… A Special Kind,2/16/1991,49.0,1.0,I got your letter from the postman just the ot...
846,But You Know I Love YouThe First Edition,The First Edition,But You Know I Love You,[],https://p.scdn.co/mp3-preview/4aabe0d5899dbf62...,False,Country & Western Legend,3/8/1969,19.0,19.0,When the morning sun\nStreaks across my room\n...
2256,Hate That I Love YouRihanna Featuring Ne-Yo,Rihanna Featuring Ne-Yo,Hate That I Love You,"['barbadian pop', 'dance pop', 'pop', 'post-te...",,False,Good Girl Gone Bad: Reloaded,1/12/2008,15.0,7.0,"[Intro: Rihanna & Ne-Yo]\nYeah, yeah\nYeah, ye..."
2279,He Don't Love You (Like I Love You)Tony Orland...,Tony Orlando & Dawn,He Don't Love You (Like I Love You),"['bubblegum pop', 'classic uk pop', 'country r...",https://p.scdn.co/mp3-preview/719f19b28e71a491...,False,He Don't Love You,6/14/1975,54.0,1.0,"He don't love you, like I love you\nIf he did,..."
2360,"Hello, I Love YouThe Doors",The Doors,"Hello, I Love You","['album rock', 'classic rock', 'piano rock', '...",https://p.scdn.co/mp3-preview/3d31a54a62e7489a...,False,Waiting for the Sun,8/3/1968,1.0,1.0,"[Chorus]\nHello, I love you\nWon't you tell me..."


## But how do you find similar lines that aren't *word-for-word* matches?

ie I want something like
> "Look at this cluster: "I love you", "you are my love", and "you are the love of my life" are close together. But "you are my spring dew" is far away."

### One approach: create a row for every *line*, then use word2vec & dimensio reduction to plot a 2D "phrase cloud"
- maybe limmit to first 10 lines of a song to quickly remove duplicates)
- maybe just start with most recent year

# DATA PREP: "explode" data into "one row per line"

# NOTE: I'm limiting data to1976-1980

In [7]:
tiny_df = df[
    df.WeekID.str.endswith('1980') |
    df.WeekID.str.endswith('1979') |
    df.WeekID.str.endswith('1978') |
    df.WeekID.str.endswith('1977') |
    df.WeekID.str.endswith('1976')
]
tiny_df

Unnamed: 0,SongID,Performer,Song,spotify_genre,spotify_track_preview_url,spotify_track_explicit,spotify_track_album,WeekID,Week Position,Peak Position,Lyrics
9,(Don't Fear) The ReaperBlue Oyster Cult,Blue Oyster Cult,(Don't Fear) The Reaper,"['album rock', 'art rock', 'blues rock', 'clas...",https://p.scdn.co/mp3-preview/596db216a6621b8c...,False,Agents Of Fortune,11/20/1976,17.0,12.0,[Intro]\n\n[Verse 1: Buck Dharma]\nAll our tim...
10,(Every Time I Turn Around) Back In Love AgainLTD,LTD,(Every Time I Turn Around) Back In Love Again,['reggaeton'],,,,2/11/1978,62.0,4.0,"Every time I move, I lose\nWhen I look I'm in\..."
38,(Love Is) Thicker Than WaterAndy Gibb,Andy Gibb,(Love Is) Thicker Than Water,"['adult standards', 'bubblegum pop', 'country ...",,False,Flowing Rivers,5/13/1978,47.0,1.0,[Chorus:]\nLove is higher than a mountain\nLov...
41,(Our Love) Don't Throw It All AwayAndy Gibb,Andy Gibb,(Our Love) Don't Throw It All Away,"['adult standards', 'bubblegum pop', 'country ...",,False,Shadow Dancing,2/3/1979,53.0,9.0,Maybe I don't wanna know the reason why\nBut l...
43,"(Shake, Shake, Shake) Shake Your BootyKC And T...",KC And The Sunshine Band,"(Shake, Shake, Shake) Shake Your Booty","['disco', 'funk', 'motown', 'soft rock']",https://p.scdn.co/mp3-preview/fe0fa4239fab4ad7...,False,KC & The Sunshine Band - Part 3...And More,11/20/1976,44.0,1.0,"[Intro]\nAh everybody, get on the floor, and l..."
...,...,...,...,...,...,...,...,...,...,...,...
7368,You're The Only Woman (You & I)Ambrosia,Ambrosia,You're The Only Woman (You & I),"['adult standards', 'album rock', 'art rock', ...",https://p.scdn.co/mp3-preview/c8ffa80fb4fe60de...,False,One Eighty,9/20/1980,13.0,13.0,"Oh baby\nSay, now, you\nYou talk about The thi..."
7383,Young BloodBad Company,Bad Company,Young Blood,"['album rock', 'blues rock', 'classic rock', '...",https://p.scdn.co/mp3-preview/32e39a60def288ae...,False,Run With The Pack (Deluxe),6/12/1976,64.0,20.0,I saw her standing on the corner\nA yellow rib...
7387,Young Hearts Run FreeCandi Staton,Candi Staton,Young Hearts Run Free,"['classic soul', 'disco', 'motown', 'quiet sto...",https://p.scdn.co/mp3-preview/b0b96bb7e0e4f151...,False,Young Hearts Run Free (US Internet Release),8/21/1976,20.0,20.0,What's the sense in sharing this one and only ...
7400,"Your LoveMarilyn McCoo & Billy Davis, Jr.","Marilyn McCoo & Billy Davis, Jr.",Your Love,[],https://p.scdn.co/mp3-preview/4fb26d7f63aee83b...,False,I Hope We Get To Love In Time,5/28/1977,65.0,15.0,YOUR LOVE\nMarilyn McCoo & Billy Davis\nEach d...


In [8]:
# tiny_df['Parsed Lyrics'] = tiny_df.Lyrics.apply(lambda l: l.split('\n')[:5])
tiny_df['Parsed Lyrics'] = tiny_df.Lyrics.apply(lambda l: l.split('\n'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
tiny_df_parsed = tiny_df.explode('Parsed Lyrics').drop_duplicates(
    subset=['Parsed Lyrics', 'SongID']
).drop(
    columns=['Lyrics']
).rename(columns={'Parsed Lyrics': 'lyric_line'})


### Filter & clean

In [10]:
ENDS_WITH_YEAR_IN_PAREN = '\(\d{4}?\)$'  # eg "... (2018)"
BRACKETED = '^\[.*?\]$'  # eg "[...]"
SONG_DASH_ARTIST = '[A-Z].+\s\-\s[A-Z].+'  # EG "Ariana Grande and Victoria MonÃ©t - MONOPOLY"

with_filters = tiny_df_parsed[
    ~tiny_df_parsed['lyric_line'].str.contains(
        f'{BRACKETED}|{ENDS_WITH_YEAR_IN_PAREN}|{SONG_DASH_ARTIST}', 
        regex= True, case=False, na=False
    )
]

In [11]:
with_more_filters = with_filters[~with_filters.lyric_line.str.contains('" by ')]
with_more_filters['lyric_line'] = with_more_filters.lyric_line.str.replace('EmbedShare URLCopyEmbedCopy', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# NOTE: I'm dropping any row with *any* MISSING DATA 
Maybe we try something less aggressive later (eg blank genre is ok)

In [12]:
COLUMNS_FOR_VIZ = ['Performer', 'Song', 'spotify_genre', 'lyric_line', 'WeekID']

lyric_lines_ready_for_analysis = (
    with_more_filters[
        COLUMNS_FOR_VIZ
    ]
    .replace('', np.nan)
    .dropna()
    .reset_index(drop=True)
)

lyric_lines_ready_for_analysis['lyric_line'] = lyric_lines_ready_for_analysis['lyric_line'].apply(
    lambda val: val and val.replace('â\x80\x99', "'")
)
lyric_lines_ready_for_analysis

Unnamed: 0,Performer,Song,spotify_genre,lyric_line,WeekID
0,Blue Oyster Cult,(Don't Fear) The Reaper,"['album rock', 'art rock', 'blues rock', 'clas...",All our times have come,11/20/1976
1,Blue Oyster Cult,(Don't Fear) The Reaper,"['album rock', 'art rock', 'blues rock', 'clas...","Here, but now they're gone",11/20/1976
2,Blue Oyster Cult,(Don't Fear) The Reaper,"['album rock', 'art rock', 'blues rock', 'clas...",Seasons don't fear the reaper,11/20/1976
3,Blue Oyster Cult,(Don't Fear) The Reaper,"['album rock', 'art rock', 'blues rock', 'clas...","Nor do the wind, the sun or the rain",11/20/1976
4,Blue Oyster Cult,(Don't Fear) The Reaper,"['album rock', 'art rock', 'blues rock', 'clas...","(We can be like they are) Come on, baby",11/20/1976
...,...,...,...,...,...
42665,James Taylor,Your Smiling Face,"['adult standards', 'folk', 'folk rock', 'mell...",Whenever I see you smile at me,12/24/1977
42666,James Taylor,Your Smiling Face,"['adult standards', 'folk', 'folk rock', 'mell...","No one can tell me that, tell me that I'm doin...",12/24/1977
42667,James Taylor,Your Smiling Face,"['adult standards', 'folk', 'folk rock', 'mell...",Whenever I see your smiling face my way,12/24/1977
42668,James Taylor,Your Smiling Face,"['adult standards', 'folk', 'folk rock', 'mell...","No one can tell me that I'm doing wrong today,...",12/24/1977


# EMBED WORDS USING Universal Encoder

In [13]:
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

In [14]:
model = hub.load('./data/universal-sentence-encoder-large_5/')

2021-11-30 14:55:40.392424: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
def embed(input):
    return model(input)

In [None]:
message_embeddings = embed(lyric_lines_ready_for_analysis['lyric_line'])

# Dimension reduction

## Dimension reduction with TSNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
# tsne_transformed_embeddings = TSNE(
#     n_components=2,
#     learning_rate='auto',
#     init='pca'
# ).fit_transform(message_embeddings)

In [None]:
def concat_x_and_y(x_and_y, df):
    return pd.concat(
    [df, pd.DataFrame(x_and_y, columns=['x', 'y'])],
    axis=1
)

In [None]:
# tsne_lyrics_and_artist = concat_x_and_y(
#     tsne_transformed_embeddings, 
#     lyric_lines_ready_for_analysis
# )

In [None]:
# sns.scatterplot(data=tsne_lyrics_and_artist, x='x', y='y')

# Dimension reduction with UMAP

In [None]:
import umap
from sklearn.preprocessing import StandardScaler

In [None]:
reducer = umap.UMAP()

In [None]:
embedding = reducer.fit_transform(
    StandardScaler().fit_transform(message_embeddings)
)

In [None]:
umap_x_and_y_added = concat_x_and_y(
    embedding, 
    df=lyric_lines_ready_for_analysis
)
umap_x_and_y_added

In [None]:
sns.scatterplot(data=umap_x_and_y_added, x='x', y='y')

## Export to CSV

In [None]:
def normalize_zero_to_one(s):
    return (s - s.min()) / (s.max() - s.min())

def export_normalized_data(data):
    temp = data.copy().dropna()
    temp['x'] = normalize_zero_to_one(temp.x)
    temp['y'] = normalize_zero_to_one(temp.y)
    temp['x'] = temp['x'].round(3)
    temp['y'] = temp['y'].round(3)
    export = temp
    export.to_csv('./data/viz ready export 1976 - 1980.csv', index=False)
    return export

In [None]:
the_export = export_normalized_data(umap_x_and_y_added)
the_export

# List of all: genres, artists

In [None]:
my_set = set()
the_export['Performer'].unique()

In [None]:
from ast import literal_eval

my_set = set()
the_export['spotify_genre'].apply(
    lambda val: val and my_set.update(literal_eval(val))
)
my_set

In [None]:
rap_genres = [genre for genre in list(my_set) if 'rap' in genre]

In [None]:
pop_genres = [genre for genre in list(my_set) if 'pop' in genre]
pop_genres