In [364]:
import requests, json
from time import sleep
import json


# constant values.
BASE_URL = "https://api.genius.com"
CLIENT_ACCESS_TOKEN = "ENTER YOUR API KEY HERE"
ARTIST_NAME = "Sopico"


# send request and get response in json format.
def _get(path,params=None, headers=None):

    # generate request URL
    requrl = '/'.join([BASE_URL,path])
    token = "Bearer {}".format(CLIENT_ACCESS_TOKEN)
    if headers:
        headers['Authorization'] = token
    else:
        headers = {"Authorization": token}

    response = requests.get(url=requrl, params=params, headers=headers)
    response.raise_for_status()

    return response.json()


def get_artist_songs(artist_id):
    current_page = 1
    next_page = True
    songs = []

    # main loop
    while next_page:

        path = "artists/{}/songs/".format(artist_id)
        params = {'page': current_page}
        data = _get(path=path, params=params)

        page_songs = data['response']['songs']

        if page_songs:
            # add all the songs of current page,
            # and increment current_page value for next loop.
            songs += page_songs
            current_page += 1
        else:
            # if page_songs is empty, quit.
            next_page = False

    # get all the song ids, excluding not-primary-artist songs.
    songs = [song["id"] for song in songs
             if song["primary_artist"]["id"] == artist_id]

    return songs

def get_song_information(song_ids):
    song_list = {}

    # main loop
    for i, song_id in enumerate(song_ids):
        print("id:" + str(song_id) + " start. ->")

        path = "songs/{}".format(song_id)
        data = _get(path=path)["response"]["song"]

        song_list.update({
        i: {
            "title": data["title"],
            "album": data["album"]["name"] if data["album"] else "<single>",
            "release_date": data["release_date"] if data["release_date"] else "unidentified",
            "featured_artists":
                [feat["name"] if data["featured_artists"] else "" for feat in data["featured_artists"]],
            "producer_artists":
                [feat["name"] if data["producer_artists"] else "" for feat in data["producer_artists"]],
            "writer_artists":
                [feat["name"] if data["writer_artists"] else "" for feat in data["writer_artists"]],
            "genius_track_id": song_id,
            "genius_album_id": data["album"]["id"] if data["album"] else "none",
            "image_url" : data["song_art_image_url"]}
        })

        print("-> id:" + str(song_id) + " is finished. \n")
    return song_list




In [366]:
print("Searching " + ARTIST_NAME + "'s artist id. \n")

# find artist id
find_id = _get("search", {'q': ARTIST_NAME})
for hit in find_id["response"]["hits"]:
   if hit["result"]["primary_artist"]["name"] == ARTIST_NAME:
       artist_id = hit["result"]["primary_artist"]["id"]
       break

print("-> " + ARTIST_NAME + "'s id is " + str(artist_id) + "\n")

Searching Charles Aznavour's artist id. 

-> Charles Aznavour's id is 13060



In [367]:
print("-> " + ARTIST_NAME + "'s id is " + str(artist_id) + "\n")

print("getting song ids. \n")

# get all song ids and make a list.
song_ids = get_artist_songs(artist_id)
print(song_ids)

print("getting meta data of each song. \n")

# finally, make a full list of songs with meta data.
full_list_of_songs = get_song_information(song_ids)

print("-> Finished! Dump the data into json data. \n")

with open("./" + ARTIST_NAME + "Songs.json", "w", encoding="utf-8") as f:
    json.dump(full_list_of_songs, f, indent=4, ensure_ascii=False)

print("-> Mission complete! Check it out!")

-> Charles Aznavour's id is 13060

getting song ids. 

[1796536, 838243, 1831674, 838058, 838244, 837382, 1743684, 839896, 1806116, 838259, 1847152, 838147, 839267, 837870, 4504215, 837985, 837506, 837424, 838305, 839478, 837660, 4527699, 4596466, 837921, 839440, 838335, 837573, 837426, 837656, 4504962, 837492, 423738, 839499, 837952, 4503170, 837732, 838420, 839528, 4501281, 837298, 4504533, 4564817, 839566, 837696, 837446, 4534725, 4504302, 837513, 4501220, 4545433, 4133330, 4558633, 837676, 4503274, 837514, 4558623, 1807824, 423735, 839629, 837540, 839403, 4501129, 4562795, 839488, 838353, 837574, 4545438, 837967, 1832610, 1761074, 838010, 4501288, 4504213, 838402, 4504994, 837666, 838440, 839683, 4519286, 4133329, 839707, 837596, 838492, 4401031, 4501146, 838620, 838556, 837691, 838439, 4528528, 838446, 838528, 837469, 838489, 1771788, 838627, 4501291, 838594, 4133331, 838557, 4501221, 838184, 839522, 4562799, 4558622, 838103, 839554, 4501285, 838667, 787617, 837999, 838248, 839612

-> id:839403 is finished. 

id:4501129 start. ->
-> id:4501129 is finished. 

id:4562795 start. ->
-> id:4562795 is finished. 

id:839488 start. ->
-> id:839488 is finished. 

id:838353 start. ->
-> id:838353 is finished. 

id:837574 start. ->
-> id:837574 is finished. 

id:4545438 start. ->
-> id:4545438 is finished. 

id:837967 start. ->
-> id:837967 is finished. 

id:1832610 start. ->
-> id:1832610 is finished. 

id:1761074 start. ->
-> id:1761074 is finished. 

id:838010 start. ->
-> id:838010 is finished. 

id:4501288 start. ->
-> id:4501288 is finished. 

id:4504213 start. ->
-> id:4504213 is finished. 

id:838402 start. ->
-> id:838402 is finished. 

id:4504994 start. ->
-> id:4504994 is finished. 

id:837666 start. ->
-> id:837666 is finished. 

id:838440 start. ->
-> id:838440 is finished. 

id:839683 start. ->
-> id:839683 is finished. 

id:4519286 start. ->
-> id:4519286 is finished. 

id:4133329 start. ->
-> id:4133329 is finished. 

id:839707 start. ->
-> id:839707 is fini

-> id:4503180 is finished. 

id:839063 start. ->
-> id:839063 is finished. 

id:4501298 start. ->
-> id:4501298 is finished. 

id:4501144 start. ->
-> id:4501144 is finished. 

id:839079 start. ->
-> id:839079 is finished. 

id:838388 start. ->
-> id:838388 is finished. 

id:836952 start. ->
-> id:836952 is finished. 

id:839299 start. ->
-> id:839299 is finished. 

id:839081 start. ->
-> id:839081 is finished. 

id:838339 start. ->
-> id:838339 is finished. 

id:4501224 start. ->
-> id:4501224 is finished. 

id:837190 start. ->
-> id:837190 is finished. 

id:839265 start. ->
-> id:839265 is finished. 

id:838864 start. ->
-> id:838864 is finished. 

id:837982 start. ->
-> id:837982 is finished. 

id:839344 start. ->
-> id:839344 is finished. 

id:838028 start. ->
-> id:838028 is finished. 

id:838224 start. ->
-> id:838224 is finished. 

id:838791 start. ->
-> id:838791 is finished. 

id:739067 start. ->
-> id:739067 is finished. 

id:838287 start. ->
-> id:838287 is finished. 

id:83

-> id:4503246 is finished. 

id:839495 start. ->
-> id:839495 is finished. 

id:838859 start. ->
-> id:838859 is finished. 

id:4519296 start. ->
-> id:4519296 is finished. 

id:4504951 start. ->
-> id:4504951 is finished. 

id:837987 start. ->
-> id:837987 is finished. 

id:838158 start. ->
-> id:838158 is finished. 

id:4503200 start. ->
-> id:4503200 is finished. 

id:4503198 start. ->
-> id:4503198 is finished. 

id:4503197 start. ->
-> id:4503197 is finished. 

id:838938 start. ->
-> id:838938 is finished. 

id:4133326 start. ->
-> id:4133326 is finished. 

id:838734 start. ->
-> id:838734 is finished. 

id:839355 start. ->
-> id:839355 is finished. 

id:839074 start. ->
-> id:839074 is finished. 

id:4501303 start. ->
-> id:4501303 is finished. 

id:839455 start. ->
-> id:839455 is finished. 

id:839503 start. ->
-> id:839503 is finished. 

id:4501131 start. ->
-> id:4501131 is finished. 

id:838092 start. ->
-> id:838092 is finished. 

id:4558629 start. ->
-> id:4558629 is finis

-> id:4516734 is finished. 

id:839292 start. ->
-> id:839292 is finished. 

id:4501159 start. ->
-> id:4501159 is finished. 

id:839326 start. ->
-> id:839326 is finished. 

id:839845 start. ->
-> id:839845 is finished. 

id:4501143 start. ->
-> id:4501143 is finished. 

id:4501145 start. ->
-> id:4501145 is finished. 

id:837349 start. ->
-> id:837349 is finished. 

id:838188 start. ->
-> id:838188 is finished. 

id:837672 start. ->
-> id:837672 is finished. 

id:838025 start. ->
-> id:838025 is finished. 

id:4501150 start. ->
-> id:4501150 is finished. 

id:4558626 start. ->
-> id:4558626 is finished. 

id:837719 start. ->
-> id:837719 is finished. 

id:838127 start. ->
-> id:838127 is finished. 

id:4562791 start. ->
-> id:4562791 is finished. 

id:838891 start. ->
-> id:838891 is finished. 

id:4133335 start. ->
-> id:4133335 is finished. 

id:1844141 start. ->
-> id:1844141 is finished. 

id:4504844 start. ->
-> id:4504844 is finished. 

id:4534761 start. ->
-> id:4534761 is fin

In [368]:
# Open the JSON file with UTF-8 encoding
with open(f"{ARTIST_NAME}Songs.json", "r", encoding="utf-8") as f:
    data = json.load(f)


In [369]:
# Initialize a set to store unique album IDs
album_ids = set()
album_name = set()

# Iterate through the keys in the dictionary (0, 1, 2, etc.)
for key in data:
    song = data[key]
    if "genius_album_id" in song:
        album_ids.add(song["genius_album_id"])
    if "album" in song:
        album_name.add(song["album"])
    
# Count the number of unique album IDs
num_unique_albums = len(album_ids)

print(f"There are {num_unique_albums} different album IDs in the JSON file.")

There are 91 different album IDs in the JSON file.


In [370]:
print(album_name)

{'Un Natale un po’ speciale', 'Je m’voyais déjà', 'Entre deux rêves', 'Duos', 'Je n’ai pas vu le temps passer...', 'Toi et moi', 'Les meilleures chansons de Charles Aznavour [Compilation]', 'Il bosco e la riva', 'Aznavour chante Noël', 'Désormais...', 'Le Temps des loups [Compilation]', 'Je voyage', 'His Kind Of Love Songs', 'Plus bleu...', 'Encores', 'Visages de l’amour', 'Toujours ', 'Bravos du music-hall à Charles Aznavour ', 'Yerushalaïm [EP]', 'Charles Aznavour e le sue canzoni', 'Discographie Studio Originale, Vol. 2 : 1951-54  [Compilation]', 'Insolitement vôtre', 'Discographie Studio Originale, Vol. 3 : 1954-56 [Compilation]', 'Charles Aznavour chante... Charles Aznavour, Vol. 3 ', 'Essere', 'Charles Aznavour chante en anglais - Les meilleurs moments', '<single>', 'Charles Aznavour, volume 1', 'J’aime Charles Aznavour, vol. 4', 'Idiote, je t’aime...', 'Aznavour (Je bois)', 'Une première danse / La légende de Stenka Razine', 'Les deux guitares', 'Le cabotin [EP]', 'Charles chant

In [371]:
# Create a dictionary to store albums, their tracks, and track IDs
albums_dict_with_ids = {}

# Iterate through the keys in the dictionary (0, 1, 2, etc.)
for key in data:
    song = data[key]
    album_id = song.get("genius_album_id")
    title = song.get("title")
    track_id = song.get("genius_track_id")

    if album_id is not None:
        # Check if the album ID already exists in the dictionary
        if album_id not in albums_dict_with_ids:
            albums_dict_with_ids[album_id] = {"album_name": song["album"], "tracks": []}

        # Append the song title and track ID to the album's "tracks" list
        albums_dict_with_ids[album_id]["tracks"].append({"title": title, "track_id": track_id})

# Save the album, track, and track ID data to a new JSON file
with open(f"{ARTIST_NAME}SongsWithIDs.json", "w", encoding="utf-8") as output_file:
    json.dump(albums_dict_with_ids, output_file, indent=4, ensure_ascii=False)

print(f"Albums, tracks, and track IDs data saved to {ARTIST_NAME}SongsWithIDs.json.")

Albums, tracks, and track IDs data saved to Charles AznavourSongsWithIDs.json.


In [372]:
# Load the original JSON data
with open(f"{ARTIST_NAME}SongsWithIDs.json", "r", encoding="utf-8") as input_file:
    original_data = json.load(input_file)


min_songs_per_album = 6

# Create a new dictionary to store filtered albums
filtered_albums_dict = {}

# Iterate through albums in the original dictionary
for album_id, album_data in original_data.items():
    num_tracks = len(album_data.get("tracks", []))
    # Check if the album has at least min_songs_per_album tracks
    if num_tracks >= min_songs_per_album:
        filtered_albums_dict[album_id] = album_data

# Save the filtered album data to a new JSON file
with open(f"{ARTIST_NAME}SongsWithIDsFiltered.json", "w", encoding="utf-8") as output_file:
    json.dump(filtered_albums_dict, output_file, indent=4, ensure_ascii=False)

print(f"Albums with at least {min_songs_per_album} songs data saved to {ARTIST_NAME}SongsWithIDsFiltered.json.")


Albums with at least 6 songs data saved to Charles AznavourSongsWithIDsFiltered.json.


In [373]:
from unidecode import unidecode
import re

def transform_artist_name(input_string):
    parts = input_string.split("'")
    if len(parts) > 1:
        parts[1] = parts[1].lower()
    cleaned_string = ''.join(parts)
    return cleaned_string


ARTIST_NAME_U = unidecode(ARTIST_NAME)
ARTIST_NAME_U = transform_artist_name(ARTIST_NAME_U)
print(ARTIST_NAME_U)

Charles Aznavour


In [374]:
from bs4 import BeautifulSoup

# Function to scrape lyrics from Genius
def scrape_lyrics(artist, track):
    track = unidecode(track.lower().replace(' ', '-'))
    track = track.replace("'","")
    track = track.replace("(","")
    track = track.replace(")","")
    track = track.replace(":","-")
    track = track.replace("[","")
    track = track.replace("]","")
    track = track.replace(",","")
    track = track.replace("+","")
    track = track.replace("%","")
    track = track.replace("#","")
    
    search_url = f"https://genius.com/{artist.replace(' ', '-')}-{track}-lyrics"
    try:
        response = requests.get(search_url)
        print("track's lyrics found")
        if response.status_code == 404:
            print(f"Lyrics not found for {track} by {artist}.")
            print(search_url)
            return None
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        lyrics_div = soup.find('div', class_='Lyrics__Container-sc-1ynbvzw-1 kUgSbL')
        if lyrics_div:
            lyrics = lyrics_div.get_text('\n')
            return lyrics.strip()
    except requests.ConnectionError:
        print(f"Connection error for {track} by {artist}. Skipping the track.")
        return None
    return None

# Open the JSON file with UTF-8 encoding
with open(f"{ARTIST_NAME}SongsWithIDsFiltered.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Iterate through the data and add lyrics to each track
for album_id, album_data in data.items():
    tracks = album_data["tracks"]
    for track_info in tracks:
        track_name = track_info["title"]
        lyrics = scrape_lyrics(ARTIST_NAME_U, track_name)  
        if lyrics is not None:
            track_info["lyrics"] = lyrics

# Save the updated data to a new JSON file while keeping it sorted by album
with open(f"{ARTIST_NAME}SongsWithLyricsSorted.json", "w", encoding="utf-8") as output_file:
    json.dump(data, output_file, indent=4, ensure_ascii=False, sort_keys=True)

print(f"Lyrics added and data saved to '{ARTIST_NAME}SongsWithLyricsSorted.json'.")


track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyric

track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
Lyrics not found for donne-donne-moi-ton-coeur by Charles Aznavour.
https://genius.com/Charles-Aznavour-donne-donne-moi-ton-coeur-lyrics
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
Lyrics not found for quand-tu-viens-chez-moi-mon-coeur by Charles Aznavour.
https://genius.com/Charles-Aznavour-quand-tu-viens-chez-moi-mon-coeur-lyrics
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics found
track's lyrics fo

In [375]:
# Load data 
with open(f"{ARTIST_NAME}Songs.json", "r", encoding="utf-8") as songs_file:
    songs_data = json.load(songs_file)

# Load data wiht lyrics
with open(f"{ARTIST_NAME}SongsWithLyricsSorted.json", "r", encoding="utf-8") as lyrics_file:
    lyrics_data = json.load(lyrics_file)

# Create a CSV file for the combined data
with open(f"{ARTIST_NAME}SongsCombined.csv", "w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["Song Name", "Song ID", "Album Name", "Album ID", "Release Date", "Lyrics","url"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    
    # Write the CSV header
    writer.writeheader()

    # Iterate through the data and match song names and album IDs
    for song_id, song_info in songs_data.items():
        song_name = song_info["title"]
        album_name = song_info["album"]
        release_date = song_info["release_date"]
        album_id = None  # Initialize album_id as None
        lyrics = ""  # Initialize lyrics as an empty string
        url = song_info["image_url"]
        
        # Search for the matching album in the lyrics data
        for lyr_album_id, lyr_album_info in lyrics_data.items():
            if lyr_album_info["album_name"] == album_name:
                album_id = lyr_album_id  # Match found, set the album_id
                break
        
        # Check if album_id is not None and song_name is in the track titles
        if album_id is not None:
            album_tracks = lyrics_data[album_id]["tracks"]
            for track in album_tracks:
                if track["title"] == song_name and "lyrics" in track:
                    lyrics = track["lyrics"]
                    break
                
        # Write the data to the CSV file
        writer.writerow({
            "Song Name": song_name,
            "Song ID": song_info["genius_track_id"],
            "Album Name": album_name,
            "Album ID": album_id,
            "Release Date": release_date,
            "Lyrics": lyrics,
            "url":url
        })

print(f"CSV file '{ARTIST_NAME}SongsCombined.csv' created successfully.")


CSV file 'Charles AznavourSongsCombined.csv' created successfully.


In [376]:
import pandas as pd
import csv
df = pd.read_csv(f"{ARTIST_NAME}SongsCombined.csv")

In [377]:
df.head(n=30)

Unnamed: 0,Song Name,Song ID,Album Name,Album ID,Release Date,Lyrics,url
0,À contre-amour,1796536,Aznavour 92,523928,unidentified,"[Refrain]\nDe contretemps en contretemps, on n...",https://images.genius.com/4056b54a3731469b387e...
1,Adieu,838243,Entre deux rêves,187905,unidentified,"Adieu, tout ce qui fut nous\nCe qui fut notre ...",https://images.genius.com/7eb45ed810ad1c5c3ff4...
2,Adiós A La Mamá (La Mamma),1831674,Sus Canciones,266544,unidentified,"Ya están aquí, llegaron ya, a la llamada del a...",https://images.genius.com/1d22dad9d044c521f08f...
3,After Loving You,838058,His Kind Of Love Songs,,unidentified,,https://images.genius.com/357559e1914d7d10765a...
4,Ah !,838244,"Discographie Studio Originale, Vol. 2 : 1951-5...",,unidentified,,https://images.genius.com/442f68b1828bfc5db703...
5,Aime-moi,837382,La Bohème,529321,unidentified,"[Paroles de ""Aime-moi""]\nAime-moi\nComme tu n'...",https://images.genius.com/b1339c6e5e99d0b681aa...
6,Aimer,1743684,Toi et moi,256593,unidentified,Par un frisson léger et presque imperceptible\...,https://images.genius.com/c75a734d19390af1ff5e...
7,Alleluia,839896,Charles Aznavour accompagné par Burt Random et...,512128,unidentified,La jeunesse est turbulente\nInsolente\nMais so...,https://images.genius.com/8df66f8c40d392f8eb75...
8,Allez vaï Marseille,1806116,Autobiographie,513763,unidentified,Marseille mon ami\nDès l'entrée à l'école\nAve...,https://images.genius.com/47d0830f3aa01e27badd...
9,Alors je dérive,838259,Non identifié [EP],,unidentified,,https://images.genius.com/5491fe8ff249d8e35995...


In [378]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620 entries, 0 to 619
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Song Name     620 non-null    object
 1   Song ID       620 non-null    int64 
 2   Album Name    620 non-null    object
 3   Album ID      536 non-null    object
 4   Release Date  620 non-null    object
 5   Lyrics        518 non-null    object
 6   url           620 non-null    object
dtypes: int64(1), object(6)
memory usage: 34.0+ KB


In [379]:
# Filter rows where the "Release Date" is not "unidentified," the "Lyrics" column is not NaN,
# and the "Album ID" is not None
df1 = df[(~df["Lyrics"].isna() & (df["Album ID"] != "none"))]

# Reset the index of the DataFrame
df1.reset_index(drop=True, inplace=True)

# Save the filtered DataFrame back to a CSV file
df1.to_csv(f"{ARTIST_NAME}SongsFiltered.csv", index=False, encoding="utf-8")

print(f"Filtered CSV file '{ARTIST_NAME}SongsFiltered.csv' created successfully.")


Filtered CSV file 'Charles AznavourSongsFiltered.csv' created successfully.


In [380]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Song Name     504 non-null    object
 1   Song ID       504 non-null    int64 
 2   Album Name    504 non-null    object
 3   Album ID      504 non-null    object
 4   Release Date  504 non-null    object
 5   Lyrics        504 non-null    object
 6   url           504 non-null    object
dtypes: int64(1), object(6)
memory usage: 27.7+ KB


In [381]:
df1.drop_duplicates(subset = ['Song Name'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop_duplicates(subset = ['Song Name'], inplace=True)


In [382]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 504 entries, 0 to 503
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Song Name     504 non-null    object
 1   Song ID       504 non-null    int64 
 2   Album Name    504 non-null    object
 3   Album ID      504 non-null    object
 4   Release Date  504 non-null    object
 5   Lyrics        504 non-null    object
 6   url           504 non-null    object
dtypes: int64(1), object(6)
memory usage: 31.5+ KB


In [383]:
import string
def text_cleansing(data):
    '''Removes brackets, replaces new line breaks with spaces, 
    lowercases everything, removes punctuations, extra whitespaces, and break words'''
    data = data.str.replace("[\(\[].*?[\)\]]", '')
    data = data.str.replace("\n", ' ')
    data = data.str.lower()
    data = data.str.replace('[{}]'.format(string.punctuation), '')
    #data = data.str.replace(' +', ' ')
    return data

df1.loc[:,'rem_sp_char'] = text_cleansing(df1.loc[:,'Lyrics'])
df1

  data = data.str.replace("[\(\[].*?[\)\]]", '')
  data = data.str.replace('[{}]'.format(string.punctuation), '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.loc[:,'rem_sp_char'] = text_cleansing(df1.loc[:,'Lyrics'])


Unnamed: 0,Song Name,Song ID,Album Name,Album ID,Release Date,Lyrics,url,rem_sp_char
0,À contre-amour,1796536,Aznavour 92,523928,unidentified,"[Refrain]\nDe contretemps en contretemps, on n...",https://images.genius.com/4056b54a3731469b387e...,de contretemps en contretemps on ne se voit q...
1,Adieu,838243,Entre deux rêves,187905,unidentified,"Adieu, tout ce qui fut nous\nCe qui fut notre ...",https://images.genius.com/7eb45ed810ad1c5c3ff4...,adieu tout ce qui fut nous ce qui fut notre vi...
2,Adiós A La Mamá (La Mamma),1831674,Sus Canciones,266544,unidentified,"Ya están aquí, llegaron ya, a la llamada del a...",https://images.genius.com/1d22dad9d044c521f08f...,ya están aquí llegaron ya a la llamada del amo...
3,Aime-moi,837382,La Bohème,529321,unidentified,"[Paroles de ""Aime-moi""]\nAime-moi\nComme tu n'...",https://images.genius.com/b1339c6e5e99d0b681aa...,aimemoi comme tu nas jamais aimé aimemoi auss...
4,Aimer,1743684,Toi et moi,256593,unidentified,Par un frisson léger et presque imperceptible\...,https://images.genius.com/c75a734d19390af1ff5e...,par un frisson léger et presque imperceptible ...
...,...,...,...,...,...,...,...,...
499,You And Me,839064,You And Me,282061,unidentified,You and me\nTwo hearts that melt and flow into...,https://images.genius.com/a39ecbc99ed3e742f6c0...,you and me two hearts that melt and flow into ...
500,You And Me (Reprise),4595420,Duos,219847,2008-12-08,You and me\nTwo hearts that melt and flow into...,https://images.genius.com/2ecad7974974b770409c...,you and me two hearts that melt and flow into ...
501,Young At Heart,4503066,Duos,219847,2008-12-08,"[Verse 1: Frank Sinatra, Charles Aznavour]\nFa...",https://images.genius.com/2ecad7974974b770409c...,fairy tales can come true it can happen to yo...
502,You’ve Got To Learn (Reprise),837793,Duos,219847,2008-12-08,You've got to learn to show a happy face\nAlth...,https://images.genius.com/2ecad7974974b770409c...,youve got to learn to show a happy face althou...


In [384]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopwords data if not already downloaded
nltk.download('stopwords')

# Your code continues here...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elios\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [385]:


def remove_stopwords(text):    # text is a list/series of string to clean
    clean_text = list()
    nltk.download('punkt')
    nltk.download('stopwords')
    
    words=["ouai","oui","no","nan","non","jsais","ca","ça","jai","cest","jsuis","si","jme","tas","ni","jte","ya","eh","oh","comme","plus","tant","plus","rien","tout","quand","négro","négros","ouais","ouai","trop","là","va","dla","où","san","quon","quil","quelle"
          "qujdois","davoir","skandalize","dun","sen","car","faire","fais","jirai","quelle","faut",
          "ai", "as", "a", "avons", "avez", "ont","suis", "es", "est", "sommes", "êtes", "sont","fais", "fais", "fait", "faisons", "faites", "font",
          "peux", "peut", "pouvons", "pouvez", "peuvent","dois", "dois","cette","tous","doit","jveux","jmets","devons","jfais","yeah","devez", "doivent", "vais", "vas", "va", "allons", "allez", "vont","estce","dit","quelque","jetais"]
    ignore= (stopwords.words('french') + words)
    
    for i in text:
        words = nltk.word_tokenize(i)
        #for i in range(len(words)):
        #    words = [w for w in words if w not in stopwords.words('english')]
        for element in ignore: # given the tokenized list, return a list that doesn't contain any of the elements
            words = list(filter(lambda x: x!= element and len(x) > 1, words))
        lyric = " ".join(words)
        clean_text.append(lyric)
    
    return clean_text
        
df1['LyricsClean'] = remove_stopwords(df1['rem_sp_char'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\elios\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elios\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['LyricsClean'] = remove_stopwords(df1['rem_sp_char'])


In [386]:
!pip install wordcloud



In [387]:
!pip install colorthief



In [388]:
def rgb2hex(r,g,b):
    return "#{:02x}{:02x}{:02x}".format(r,g,b)

In [389]:
from wordcloud import WordCloud, ImageColorGenerator
import numpy as np
from PIL import Image
from collections import Counter
from colorthief import ColorThief
import matplotlib.backends.backend_pdf as pdf_backend

# Function to create a word cloud and frequency graph for a given album
def create_wordcloud_and_frequency_graph(text, album_name, album_cover_url, pdf):
    # Create a word cloud
    # Fetch the album cover image from the URL
    album_cover = Image.open(requests.get(album_cover_url, stream=True).raw).convert('RGB')
    color_thief = ColorThief(requests.get(album_cover_url, stream=True).raw)
    dominant_color = color_thief.get_color(quality=1)
    dominant_color = rgb2hex(dominant_color[0],dominant_color[1],dominant_color[2])
    mask = np.array(album_cover)
    image_colors = ImageColorGenerator(mask)
    wordcloud = WordCloud(width=400, height=400, background_color="white", mask=mask,collocations =False).generate(text)
    
    # Calculate word frequencies
    word_counts = Counter(text.split())
    most_common_words = word_counts.most_common(15)
    words, counts = zip(*most_common_words)
    
    
    # Create a bar chart for word frequencies
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 3, 1)
    plt.imshow(mask, cmap=plt.cm.gray, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Album Cover for {album_name}")
    
    plt.subplot(1, 3, 2)
    plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Word Cloud for {album_name}")
    
    plt.subplot(1, 3, 3)
    plt.barh(words, counts, color=dominant_color)
    plt.xlabel("Frequency")
    plt.title(f"Top 15 Most Frequent Words for {album_name}")
    
    plt.tight_layout()
    pdf.savefig()
    plt.close()

with pdf_backend.PdfPages(f'wordcloud_plots_{ARTIST_NAME_U}.pdf') as pdf:
    for album_name, group in df1.groupby("Album Name"):
        lyrics = " ".join(group["LyricsClean"])
        album_cover_url = group["url"].iloc[0]  # Assuming the URL is the same for all songs in the album
        create_wordcloud_and_frequency_graph(lyrics, album_name, album_cover_url, pdf)
        print("album analysis done")


album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
album analysis done
