In [2]:
import getpass
import re
import string
from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import pandas as pd
from colorutils import Color

%load_ext blackcellmagic

In [45]:
data_folder = "data/years/"

# `listdir` returns entries in arbitrary order; sorting keeps the row order of
# `music_data` identical from one run (and one machine) to the next.
file_list = sorted(
    f
    for f in listdir(data_folder)
    if isfile(join(data_folder, f)) and f.endswith(".json")
)

dfs = []  # one data frame per JSON file
for file in file_list:
    # `join` (already used for the filter above) instead of string
    # concatenation, so the path stays valid even if `data_folder` is later
    # written without a trailing slash.
    path = join(data_folder, file)
    data = pd.read_json(path)
    dfs.append(data)

# Stack all per-file frames into a single table with a fresh 0..n-1 index.
music_data = pd.concat(dfs, ignore_index=True)

In [None]:
# One row of `music_data` is counted as one song.
print(f"There are {music_data.shape[0]} songs.")

In [None]:
# Summary statistics of the number of (non-null) lyrics per year.
music_data[["year", "lyrics"]].groupby("year").count().describe()

In [None]:
# Bar chart of the number of (non-null) lyrics per year.
ax = (
    music_data.groupby("year")[["lyrics"]]
    .count()
    .plot.bar(figsize=(12, 12), title="Songs per year")
)
ax.set_ylabel("count")
ax.set_xlabel("year")
plt.show()

In [None]:
# Each entry of the `sentiment` column is a mapping holding the keys
# "neg", "pos", "neu" and "compound".
sentiment_scores = music_data["sentiment"]

neg_mean, pos_mean, neutral_mean = (
    sentiment_scores.map(lambda scores, key=key: scores[key]).mean()
    for key in ("neg", "pos", "neu")
)
print(
    "The means for the negative, positive and neutral songs are respectively: "
    f"{neg_mean:.3}, {pos_mean:.3} and {neutral_mean:.3}."
)

# Distribution of the compound score over all songs.
ax = sentiment_scores.map(lambda scores: scores["compound"]).plot.hist()
ax.set_title("Compound histogram")
ax.set_xlabel("Sentiment")
plt.show()

In [None]:
# Union of every song's tag list gives the distinct tags across the data set.
print(f"There are {len(set().union(*music_data.tags))} tags.")

In [None]:
# `dropna=False` keeps a missing artist counted as one distinct value.
print(f"There are {music_data['artist'].nunique(dropna=False)} artists.")

In [None]:
# Box plot of how many (non-null) lyrics each artist has.
ax = music_data.groupby("artist")[["lyrics"]].count().boxplot()
ax.set_title("Number of songs per artist")
ax.set_ylabel("count")
# Optional log scale, currently disabled:
# plt.yscale('log')
plt.show()

In [40]:
# Hand-picked palette of basic color names, all lower-case.
small_set_colors = {
    "white",
    "black",
    "yellow",
    "green",
    "purple",
    "brown",
    "pink",
    "red",
    "blue",
    "maroon",
    "salmon",
    "coral",
    "orange",
    "gold",
    "silver",
    "lime",
    "turquoise",
    "cyan",
    "navy",
    "indigo",
    "magenta",
    "beige",
    "tan",
    "gray",
}

In [None]:
# One color name per line; surrounding whitespace (including the newline) is dropped.
with open("colors.txt", "r") as file_colors:
    set_colors = {line.strip() for line in file_colors}

In [None]:
# Compare the sizes of the hand-picked palette and the one read from file.
print(
    f"The small set of colors contains {len(small_set_colors)} colors "
    f"and the big one contains {len(set_colors)} colors."
)

In [None]:
# Translation table that deletes every ASCII punctuation character. It is
# built once here instead of once per row as before.
_punctuation_remover = str.maketrans("", "", string.punctuation)


def _colors_mentioned(texts, palette):
    """Return, for each text, the set of palette colors it contains as words.

    Every text is lower-cased, stripped of ``string.punctuation`` and split on
    whitespace; the resulting words are intersected with ``palette``.

    Parameters
    ----------
    texts : pandas.Series of str
        Texts to scan (song titles or lyrics).
    palette : set of str
        Color names to look for.

    Returns
    -------
    pandas.Series of set
        One (possibly empty) set of matched color names per input text.
    """
    return texts.str.lower().map(
        lambda text: set(text.translate(_punctuation_remover).split()).intersection(
            palette
        )
    )


# Same column names and creation order as before; only the duplicated
# pipeline has been factored out.
music_data["small_intersection_colors_title"] = _colors_mentioned(
    music_data["title"], small_set_colors
)
music_data["small_intersection_colors_lyrics"] = _colors_mentioned(
    music_data["lyrics"], small_set_colors
)

music_data["intersection_colors_title"] = _colors_mentioned(
    music_data["title"], set_colors
)
music_data["intersection_colors_lyrics"] = _colors_mentioned(
    music_data["lyrics"], set_colors
)

In [None]:
# Keep only the rows whose set of matched colors is non-empty.
small_colored_songs = music_data[
    music_data["small_intersection_colors_lyrics"].map(len) > 0
]
colored_songs = music_data[music_data["intersection_colors_lyrics"].map(len) > 0]

small_colored_titles = music_data[
    music_data["small_intersection_colors_title"].map(len) > 0
]
colored_titles = music_data[music_data["intersection_colors_title"].map(len) > 0]

In [None]:
# Title-based counts. The message previously said "lyrics" here, a copy-paste
# slip from the second print below.
print(
    f"There are {len(small_colored_titles)} songs with colors in their titles using the small set of colors and\n"
    f" there are {len(colored_titles)} songs with colors in their titles using the big set of colors."
)

# Lyrics-based counts.
print(
    f"There are {len(small_colored_songs)} songs with colors in their lyrics using the small set of colors and\n"
    f" there are {len(colored_songs)} songs with colors in their lyrics using the big set of colors."
)

In [None]:
# One row per (song, color) pair, then the number of rows per color.
# Result: a frame indexed by "color" with a single "count" column.
color_counts = (
    small_colored_songs["small_intersection_colors_lyrics"]
    .map(list)
    .explode()
    .rename("color")
    .to_frame()
    .groupby("color")
    .size()
    .to_frame("count")
)

In [None]:
# Bar chart of how many songs mention each basic color.
ax = color_counts.plot.bar()
ax.set_title("Basic colors distribution")
ax.set_ylabel('count')
plt.show()

In [None]:
selected_small_columns = ['lyrics', 'tags', 'pos', 'year', 'title',
                          'artist', 'small_intersection_colors_lyrics']

selected_big_columns = ['lyrics', 'tags', 'pos', 'year', 'title',
                        'artist', 'intersection_colors_lyrics']

# The color sets are turned into *sorted* lists before exploding. Iteration
# order of a set of strings depends on per-process hash randomization, so
# plain `list(set_)` produced a different row order on every kernel restart.

# Big palette: one row per (song, color); the column keeps its original name.
final_big_df = colored_songs[selected_big_columns].copy()
final_big_df['intersection_colors_lyrics'] = final_big_df['intersection_colors_lyrics'].map(sorted)
final_big_df = final_big_df.explode('intersection_colors_lyrics')

# Small palette: one row per (song, color); the exploded column is renamed to "color".
final_small_df = small_colored_songs[selected_small_columns].copy()
final_small_df['small_intersection_colors_lyrics'] = final_small_df['small_intersection_colors_lyrics'].map(sorted)
final_small_df = final_small_df.explode('small_intersection_colors_lyrics').rename(
    columns={'small_intersection_colors_lyrics': 'color'}
)
final_small_df

In [None]:
import random
def randomize_rgb(rgb, perturbation = 20):
    """Return a CSS-like ``'rgb(...)'`` string with every channel randomly shifted.

    Each value of ``rgb`` is moved by a random integer drawn from
    ``[-perturbation, perturbation]`` and clamped to the 0-255 range.
    The result is ``'rgb'`` followed by the ``str`` of the resulting tuple.
    """
    jittered = tuple(
        max(0, min(channel + random.randint(-perturbation, perturbation), 255))
        for channel in rgb
    )
    return f'rgb{jittered}'

In [None]:
# Scratch check: build a Color from a web color name and display it.
Color(web='blue')

In [None]:
# Scratch check: build a Color from an hsv triple and display it.
Color(hsv=(240, 0.7, 1.0))

In [None]:
# Despite the `hue_` prefix, the three columns hold the three components of
# `Color(...).hsv`, in order.
final_small_df['hue_0'] = final_small_df['color'].map(lambda name: Color(web=name).hsv[0])
final_small_df['hue_1'] = final_small_df['color'].map(lambda name: Color(web=name).hsv[1])
final_small_df['hue_2'] = final_small_df['color'].map(lambda name: Color(web=name).hsv[2])

# Exact RGB value of the color name, plus a randomly perturbed variant as a string.
final_small_df['rgb'] = final_small_df['color'].map(lambda name: Color(web=name).rgb)
final_small_df['random_rgb'] = final_small_df['rgb'].map(randomize_rgb)


In [None]:
# Order rows by the three hsv components so equal colors end up together.
final_small_df = final_small_df.sort_values(by=['hue_0', 'hue_1', 'hue_2'])

In [None]:
# Display the current contents of the frame.
final_small_df

The next two cells ask for your Spotify client ID and client secret. Get them from your [Spotify Dashboard](https://developer.spotify.com/dashboard/); the steps are described [here](https://developer.spotify.com/documentation/general/guides/app-settings/).

In [None]:
# Read the client ID without echoing it. The explicit prompt replaces the
# default "Password: " so it is clear which credential is being requested.
spotify_client_id = getpass.getpass("Spotify client ID: ")

In [None]:
# Read the client secret without echoing it. The explicit prompt replaces the
# default "Password: " so it is clear which credential is being requested.
spotify_client_secret = getpass.getpass("Spotify client secret: ")

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials #To access authorised Spotify data

# Build the Spotify API client from the credentials entered above.
client_credentials_manager = SpotifyClientCredentials(
    client_id=spotify_client_id,
    client_secret=spotify_client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Smoke test: run a free-text search and show the artists of the first hit.
name = "Kanye West, Homecoming"
result = sp.search(name)
result['tracks']['items'][0]['artists']

In [None]:
# Sample values for trying out a query of the form
# artist:'$artist' title:'$title'
artistName, trackName = 'Frankie Laine', 'Tell Me A Story'

In [None]:
def get_spotify_uri(artistName, trackName):
    """Look up a track on Spotify and return the last segment of its URI.

    Sends one search request (``type="track"``, ``limit=1``) through the
    module-level ``sp`` client, using the query
    ``artist:<artistName> track:<trackName>``.

    Returns the text after the last ``':'`` of the first hit's ``uri``.
    If that value cannot be extracted from the response (for instance when
    the result list is empty), the error is printed and ``''`` is returned.
    Errors raised by the search request itself are not caught.
    """
    query = "artist:" + artistName + " track:" + trackName
    response = sp.search(q=query, type="track", limit=1)
    try:
        first_hit = response['tracks']['items'][0]
        return first_hit['uri'].rpartition(':')[2]
    except Exception as e:
        print(f'Error: {e}')
        return ''

In [None]:
# Preview the first rows of the frame.
final_small_df.head()

In [None]:
# One Spotify search per row, in row order (get_spotify_uri issues one request per call).
final_small_df['spotify_uri'] = [
    get_spotify_uri(artist, title)
    for artist, title in zip(final_small_df['artist'], final_small_df['title'])
]

In [68]:
# Replace the in-memory frame with the copy stored on disk. This is the same
# file that the final `to_csv` cell of the notebook writes.
final_small_df = pd.read_csv('small_colors_songs_withSpotify.csv')

In [41]:
def foo(s):
    """Return the part of ``s`` after its last ``':'``.

    A string without ``':'`` is returned unchanged. Anything that is not a
    non-empty string (``None``, NaN, numbers, ``''``) yields ``''``.
    """
    if not isinstance(s, str) or s == '':
        return ''
    return str(s).rpartition(':')[2]

In [42]:
# Keep only the last ':'-separated segment of each URI; missing values become ''.
final_small_df['spotify_uri'] = final_small_df['spotify_uri'].map(foo)

In [43]:
# Display the spotify_uri column.
final_small_df['spotify_uri']

0       1THl7yjezM9SW8x2taVS5w
1       6xNwKNYZcvgV3XTIwsgNio
2       1Qn5jElT7XqjvVeJ8pYM2w
3       39zODpVtRvghMyfNjZ3BVK
4       3UCmuRdeTriWgOZMEJsfqZ
                 ...          
1066                          
1067    2o5mwv7hvas1jdUg14Nihv
1068    7bYZBVrnRfqeaPbhRyEvK3
1069    5yIiXdLRE85OBiQmCaUenq
1070    1dgWTMoHwTUnQhOQ8SR5fV
Name: spotify_uri, Length: 1071, dtype: object

In [47]:
# Preview the first rows of the frame.
final_small_df.head()

Unnamed: 0,lyrics,tags,pos,year,title,artist,color,hue_0,hue_1,hue_2,rgb,random_rgb,spotify_uri
0,"Tell me story, tell me story\nTell me story, r...",['death by heart failure'],23,1953,Tell Me A Story,Frankie Laine,black,0.0,0.0,0.0,"(0, 0, 0)","rgb(18, 13, 0)",1THl7yjezM9SW8x2taVS5w
1,"Well, since my baby left me\nIn the fiery plac...","['american', 'blue-eyed soul', 'blues', 'chris...",1,1956,Heartbreak Hotel,Elvis Presley,black,0.0,0.0,0.0,"(0, 0, 0)","rgb(0, 0, 0)",6xNwKNYZcvgV3XTIwsgNio
2,"Bah, bah, bah, but\nBah, bah, bah, but\nBah, b...","['all-time greatest hits', 'death by airplane']",25,1957,A Teenager's Romance,Ricky Nelson,black,0.0,0.0,0.0,"(0, 0, 0)","rgb(20, 9, 0)",1Qn5jElT7XqjvVeJ8pYM2w
3,You can shake an apple off an apple tree\nShak...,"['american', 'blue-eyed soul', 'blues', 'chris...",9,1960,Stuck On You,Elvis Presley,black,0.0,0.0,0.0,"(0, 0, 0)","rgb(7, 0, 5)",39zODpVtRvghMyfNjZ3BVK
4,There is a rose in Spanish Harlem\nA red rose ...,"['classic pop and rock', 'funk', 'jazz pop', '...",64,1961,Spanish Harlem,Ben E. King,black,0.0,0.0,0.0,"(0, 0, 0)","rgb(0, 0, 0)",3UCmuRdeTriWgOZMEJsfqZ


In [71]:
def surround_color_words(s, color):
    """Wrap every whole-word occurrence of ``color`` in ``s`` in an HTML span.

    Matching is case-insensitive and the original casing of the text is kept.
    An occurrence counts as a word when it is delimited on both sides by a
    character of ``string.punctuation``, a space, a newline, or the start/end
    of the text.

    This replaces a hand-rolled offset computation that had several defects:
    its skipped-occurrence accumulator was never reset (the reset assigned to
    a misspelled variable), so spans drifted onto the wrong characters after a
    non-word occurrence such as "blacksmith"; a color at the very end of the
    text was never wrapped; the character before the match was not checked,
    so "red" inside "bored" was wrapped; and offsets computed on a lower-cased
    copy were used to slice the original string.

    Parameters
    ----------
    s : str
        Text to annotate. It is inserted into the result verbatim; HTML
        special characters are NOT escaped here.
    color : str
        Color word to highlight. Also used verbatim in the CSS class name
        ``"<color>_word"``.

    Returns
    -------
    str
        ``s`` with each match replaced by
        ``<span class="<color>_word colored_word">match</span>``.
        ``s`` is returned unchanged when ``color`` is empty.
    """
    if not color:
        # An empty pattern would match between every pair of characters.
        return s

    # Same delimiter set as before: ASCII punctuation, space and newline.
    delimiters = re.escape(string.punctuation + " \n")
    pattern = re.compile(
        # Preceded by a delimiter or the start of the text ...
        rf"(?:\A|(?<=[{delimiters}]))"
        + re.escape(color)
        # ... and followed by a delimiter or the end of the text.
        + rf"(?=[{delimiters}]|\Z)",
        flags=re.IGNORECASE,
    )

    chunk_to_add_before = f'<span class="{color}_word colored_word">'
    chunk_to_add_after = '</span>'

    # A function replacement keeps the matched text (original casing) and
    # avoids backslash interpretation in the replacement string.
    return pattern.sub(
        lambda match: chunk_to_add_before + match.group(0) + chunk_to_add_after,
        s,
    )

In [72]:
def preprocess_lyrics(lyrics, color):
    """Return ``lyrics`` as one HTML paragraph with ``color`` words highlighted.

    The text is first passed through ``surround_color_words``; every newline
    is then turned into ``<br>`` and the result is wrapped in ``<p>...</p>``.
    """
    highlighted = surround_color_words(lyrics, color)
    return '<p>' + highlighted.replace('\n', '<br>') + '</p>'

In [73]:
# HTML version of each song's lyrics with that row's color word highlighted.
final_small_df['preprocessed_lyrics'] = [
    preprocess_lyrics(lyrics, color)
    for lyrics, color in zip(final_small_df['lyrics'], final_small_df['color'])
]

In [74]:
# Write the frame to disk without the index column. This overwrites the file
# that the earlier `pd.read_csv` cell loads.
final_small_df.to_csv('small_colors_songs_withSpotify.csv', index=False)