In [None]:
import requests, json
from time import sleep
import json


# constant values.
# Root URL of the Genius API; request paths are joined onto it by _get.
BASE_URL = "https://api.genius.com"
# Token sent by _get as the "Bearer" Authorization header.
# Replace the placeholder with your own token before running.
CLIENT_ACCESS_TOKEN = "ENTER YOUR API KEY HERE"
# Artist whose songs are searched for and collected.
ARTIST_NAME = "Sopico"
#ENTER THE NAME OF THE ARTIST HERE, AS IT IS WRITTEN ON THE GENIUS PAGE UNDER THE PROFILE PICTURE

# send request and get response in json format.
def _get(path, params=None, headers=None, timeout=30):
    """Send an authenticated GET request to the Genius API and return its JSON.

    Args:
        path: Endpoint path relative to ``BASE_URL`` (e.g. ``"search"``).
        params: Optional mapping of query-string parameters.
        headers: Optional mapping of extra request headers. It is copied, so
            the caller's mapping is never modified.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        The response body as decoded by ``response.json()``.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # generate request URL
    requrl = '/'.join([BASE_URL, path])

    # Work on a private copy so the Authorization header does not leak into
    # a dict owned by the caller.
    request_headers = dict(headers) if headers else {}
    request_headers['Authorization'] = "Bearer {}".format(CLIENT_ACCESS_TOKEN)

    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url=requrl, params=params,
                            headers=request_headers, timeout=timeout)
    response.raise_for_status()

    return response.json()


def get_artist_songs(artist_id):
    """Collect the ids of the songs whose primary artist is ``artist_id``.

    Requests ``artists/<artist_id>/songs/`` page by page, starting at page 1,
    and stops at the first page whose ``songs`` list is empty.

    Returns:
        A list of song ids, in the order the API returned the songs.
    """
    endpoint = "artists/{}/songs/".format(artist_id)
    collected = []
    page_number = 1

    # keep requesting pages until one comes back empty
    while True:
        payload = _get(path=endpoint, params={'page': page_number})
        batch = payload['response']['songs']
        if not batch:
            break
        collected.extend(batch)
        page_number += 1

    # keep only the songs on which the artist is the primary artist
    own_song_ids = []
    for entry in collected:
        if entry["primary_artist"]["id"] == artist_id:
            own_song_ids.append(entry["id"])
    return own_song_ids

def get_song_information(song_ids):
    """Fetch the metadata of each song and return it keyed by list position.

    For every id in ``song_ids`` the ``songs/<id>`` endpoint is queried and a
    summary dict is stored under the song's index in ``song_ids``.

    Returns:
        A dict mapping ``0, 1, 2, ...`` to a dict with the keys ``title``,
        ``album``, ``release_date``, ``featured_artists``,
        ``producer_artists``, ``writer_artists``, ``genius_track_id``,
        ``genius_album_id`` and ``image_url``.
    """
    def _names(artists):
        # an empty artist list simply yields an empty name list
        return [artist["name"] for artist in artists]

    song_list = {}

    for position, song_id in enumerate(song_ids):
        print("id:" + str(song_id) + " start. ->")

        data = _get(path="songs/{}".format(song_id))["response"]["song"]
        album = data["album"]

        song_list[position] = {
            "title": data["title"],
            # songs without an album get placeholder values
            "album": album["name"] if album else "<single>",
            "release_date": data["release_date"] or "unidentified",
            "featured_artists": _names(data["featured_artists"]),
            "producer_artists": _names(data["producer_artists"]),
            "writer_artists": _names(data["writer_artists"]),
            "genius_track_id": song_id,
            "genius_album_id": album["id"] if album else "none",
            "image_url": data["song_art_image_url"],
        }

        print("-> id:" + str(song_id) + " is finished. \n")

    return song_list




In [None]:
print("Searching " + ARTIST_NAME + "'s artist id. \n")

# find artist id: use the first search hit whose primary artist name
# matches ARTIST_NAME exactly
find_id = _get("search", {'q': ARTIST_NAME})
artist_id = None
for hit in find_id["response"]["hits"]:
    primary_artist = hit["result"]["primary_artist"]
    if primary_artist["name"] == ARTIST_NAME:
        artist_id = primary_artist["id"]
        break

# Fail with a clear message when nothing matched; otherwise the code below
# would hit a NameError or silently reuse an artist_id from an earlier run.
if artist_id is None:
    raise ValueError(
        "No search hit has a primary artist named exactly "
        + repr(ARTIST_NAME) + "; check the spelling of ARTIST_NAME."
    )

print("-> " + ARTIST_NAME + "'s id is " + str(artist_id) + "\n")

In [None]:
# Recap of the artist id the following steps work with.
print("-> " + ARTIST_NAME + "'s id is " + str(artist_id) + "\n")

# Step 1: collect the ids of the artist's songs.
print("getting song ids. \n")
song_ids = get_artist_songs(artist_id)
print(song_ids)

# Step 2: fetch the metadata of every collected song.
print("getting meta data of each song. \n")
full_list_of_songs = get_song_information(song_ids)

# Step 3: write the metadata as UTF-8 JSON into the current working directory.
print("-> Finished! Dump the data into json data. \n")
songs_json_path = "./" + ARTIST_NAME + "Songs.json"
with open(songs_json_path, "w", encoding="utf-8") as songs_file:
    json.dump(full_list_of_songs, songs_file, indent=4, ensure_ascii=False)

print("-> Mission complete! Check it out!")

In [None]:
# Read the artist's song metadata from its UTF-8 encoded JSON file
songs_json_name = f"{ARTIST_NAME}Songs.json"
with open(songs_json_name, "r", encoding="utf-8") as songs_file:
    data = json.loads(songs_file.read())


In [None]:
# Gather the distinct album ids and album names found across all songs;
# a song lacking the respective key is skipped for that set.
album_ids = {
    entry["genius_album_id"]
    for entry in data.values()
    if "genius_album_id" in entry
}
album_name = {
    entry["album"]
    for entry in data.values()
    if "album" in entry
}

# Count the number of unique album IDs
# NOTE(review): get_song_information stores the string "none" for songs
# without an album, so that placeholder is counted as one id here - confirm
# whether it should be excluded from the total.
num_unique_albums = len(album_ids)

print(f"There are {num_unique_albums} different album IDs in the JSON file.")

In [None]:
# Show the distinct album names collected from the song metadata.
print(album_name)

In [None]:
# Group the songs by album id: each album maps to its name and its track list
albums_dict_with_ids = {}

for song in data.values():
    album_id = song.get("genius_album_id")
    # songs that carry no album id entry are left out
    if album_id is None:
        continue

    album_entry = albums_dict_with_ids.get(album_id)
    if album_entry is None:
        # first song seen for this album: create its entry
        album_entry = {"album_name": song["album"], "tracks": []}
        albums_dict_with_ids[album_id] = album_entry

    # record the song title together with its track id
    album_entry["tracks"].append(
        {"title": song.get("title"), "track_id": song.get("genius_track_id")}
    )

# Write the grouped data to its own JSON file
ids_json_name = f"{ARTIST_NAME}SongsWithIDs.json"
with open(ids_json_name, "w", encoding="utf-8") as output_file:
    json.dump(albums_dict_with_ids, output_file, indent=4, ensure_ascii=False)

print(f"Albums, tracks, and track IDs data saved to {ARTIST_NAME}SongsWithIDs.json.")

In [None]:
# Read back the per-album track listing
with open(f"{ARTIST_NAME}SongsWithIDs.json", "r", encoding="utf-8") as input_file:
    original_data = json.load(input_file)

# Minimum number of tracks an album needs in order to be kept
min_songs_per_album = 6

# Keep only the albums whose track list is long enough
# (an album without a "tracks" entry counts as having zero tracks)
filtered_albums_dict = {
    album_id: album_data
    for album_id, album_data in original_data.items()
    if len(album_data.get("tracks", [])) >= min_songs_per_album
}

# Write the remaining albums to their own JSON file
with open(f"{ARTIST_NAME}SongsWithIDsFiltered.json", "w", encoding="utf-8") as output_file:
    json.dump(filtered_albums_dict, output_file, indent=4, ensure_ascii=False)

print(f"Albums with at least {min_songs_per_album} songs data saved to {ARTIST_NAME}SongsWithIDsFiltered.json.")


In [None]:
from unidecode import unidecode
import re

def transform_artist_name(input_string):
    """Strip apostrophes from a name, lower-casing the piece after the first one.

    The text between the first and the second apostrophe (or up to the end
    of the string when there is only one) is lower-cased; every other piece
    is kept as is. A string without apostrophes is returned unchanged.
    """
    head, *tail = input_string.split("'")
    if tail:
        tail[0] = tail[0].lower()
    return head + ''.join(tail)


# ASCII-fold the artist name with unidecode, then drop its apostrophes
ARTIST_NAME_U = transform_artist_name(unidecode(ARTIST_NAME))
print(ARTIST_NAME_U)

In [None]:
from bs4 import BeautifulSoup

# Function to scrape lyrics from Genius
def scrape_lyrics(artist, track):
    """Download the lyrics of ``track`` by ``artist`` from its Genius page.

    The page address is guessed as
    ``https://genius.com/<artist>-<track>-lyrics`` from a slug built out of
    the two names.

    Args:
        artist: Artist name as used in the URL (spaces are turned into dashes).
        track: Track title.

    Returns:
        The lyrics text, or ``None`` when the page does not exist, the
        request fails, or no lyrics container is found in the page.
    """
    import re  # local import: only needed for the class-prefix pattern below

    # Build the URL slug: lower-case, spaces -> dashes, ASCII-fold, then in a
    # single pass turn ":" into "-" and drop the other punctuation marks.
    slug = unidecode(track.lower().replace(' ', '-'))
    slug = slug.translate(str.maketrans(":", "-", "'()[],+%#"))

    search_url = f"https://genius.com/{artist.replace(' ', '-')}-{slug}-lyrics"
    try:
        response = requests.get(search_url, timeout=30)
        if response.status_code == 404:
            print(f"Lyrics not found for {slug} by {artist}.")
            print(search_url)
            return None
        response.raise_for_status()
    except requests.RequestException as error:
        # Covers connection errors, timeouts and non-404 HTTP errors, so one
        # bad page does not abort a whole scraping run.
        print(f"Request failed for {slug} by {artist} ({error}). Skipping the track.")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    # Match on the stable class prefix rather than the full generated class
    # string, and collect every matching div.
    # NOTE(review): assumes Genius may split the lyrics over several
    # "Lyrics__Container..." divs - confirm against a live page.
    containers = soup.find_all('div', class_=re.compile(r'^Lyrics__Container'))
    if not containers:
        return None

    print("track's lyrics found")
    lyrics = '\n'.join(container.get_text('\n') for container in containers)
    return lyrics.strip()

# Load the albums that passed the track-count filter
with open(f"{ARTIST_NAME}SongsWithIDsFiltered.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Fetch the lyrics of every track; a track whose lyrics could not be
# retrieved simply gets no "lyrics" entry
for album_data in data.values():
    for track_info in album_data["tracks"]:
        lyrics = scrape_lyrics(ARTIST_NAME_U, track_info["title"])
        if lyrics is None:
            continue
        track_info["lyrics"] = lyrics

# Write the enriched data to a new JSON file, with keys sorted on output
lyrics_json_name = f"{ARTIST_NAME}SongsWithLyricsSorted.json"
with open(lyrics_json_name, "w", encoding="utf-8") as output_file:
    json.dump(data, output_file, indent=4, ensure_ascii=False, sort_keys=True)

print(f"Lyrics added and data saved to '{ARTIST_NAME}SongsWithLyricsSorted.json'.")


In [None]:
import csv  # used by csv.DictWriter below; imported here so this cell runs on its own

# Load the per-song metadata
with open(f"{ARTIST_NAME}Songs.json", "r", encoding="utf-8") as songs_file:
    songs_data = json.load(songs_file)

# Load the per-album data with lyrics
with open(f"{ARTIST_NAME}SongsWithLyricsSorted.json", "r", encoding="utf-8") as lyrics_file:
    lyrics_data = json.load(lyrics_file)

# Index the lyrics data once instead of rescanning it for every song.
# setdefault keeps the FIRST album per name and the FIRST track with lyrics
# per (album, title), which is what a linear search with `break` would pick.
album_id_by_name = {}
lyrics_by_album_and_title = {}
for lyr_album_id, lyr_album_info in lyrics_data.items():
    album_id_by_name.setdefault(lyr_album_info["album_name"], lyr_album_id)
    for track in lyr_album_info["tracks"]:
        if "lyrics" in track:
            lyrics_by_album_and_title.setdefault(
                (lyr_album_id, track["title"]), track["lyrics"]
            )

# Create a CSV file for the combined data
with open(f"{ARTIST_NAME}SongsCombined.csv", "w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["Song Name", "Song ID", "Album Name", "Album ID", "Release Date", "Lyrics", "url"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Write the CSV header
    writer.writeheader()

    # One row per song; album id and lyrics come from the lyrics data
    for song_info in songs_data.values():
        song_name = song_info["title"]
        album_name = song_info["album"]

        # None when the song's album is absent from the lyrics data
        album_id = album_id_by_name.get(album_name)
        # empty string when no lyrics were stored for this song
        lyrics = lyrics_by_album_and_title.get((album_id, song_name), "")

        writer.writerow({
            "Song Name": song_name,
            "Song ID": song_info["genius_track_id"],
            "Album Name": album_name,
            "Album ID": album_id,
            "Release Date": song_info["release_date"],
            "Lyrics": lyrics,
            "url": song_info["image_url"],
        })

print(f"CSV file '{ARTIST_NAME}SongsCombined.csv' created successfully.")


In [None]:
import pandas as pd
import csv
# Load the combined per-song CSV into a DataFrame for the analysis below.
df = pd.read_csv(f"{ARTIST_NAME}SongsCombined.csv")

In [None]:
# Preview the first 30 rows of the combined data.
df.head(n=30)

In [None]:
# Print a summary of the DataFrame's columns.
df.info()

In [None]:
# Filter rows where the "Lyrics" column is not NaN and when the album has an ID
df1 = df[(~df["Lyrics"].isna() & (df["Album ID"] != "none"))]

# Reset the index of the DataFrame
df1.reset_index(drop=True, inplace=True)

# Save the filtered DataFrame back to a CSV file
df1.to_csv(f"{ARTIST_NAME}SongsFiltered.csv", index=False, encoding="utf-8")

print(f"Filtered CSV file '{ARTIST_NAME}SongsFiltered.csv' created successfully.")


In [None]:
# Print a summary of the filtered DataFrame.
df1.info()

In [None]:
# Drop rows that repeat an already-seen song name, modifying df1 in place.
df1.drop_duplicates(subset = ['Song Name'], inplace=True)

In [None]:
# Print the summary again to see the effect of removing duplicates.
df1.info()

In [None]:
import string
def text_cleansing(data):
    '''Normalise a pandas Series of lyrics.

    Removes bracketed or parenthesised spans that sit on a single line,
    replaces line breaks with spaces, lowercases everything and removes the
    ASCII punctuation characters. Missing values stay missing.

    Returns a new Series; the input is not modified.'''
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would remove nothing.
    data = data.str.replace(r"[\(\[].*?[\)\]]", '', regex=True)
    data = data.str.replace("\n", ' ', regex=False)
    data = data.str.lower()
    # A translation table deletes every punctuation character without the
    # escaping pitfalls of building a regex character class from them.
    data = data.str.translate(str.maketrans('', '', string.punctuation))
    return data

# Store the cleaned lyrics in the 'rem_sp_char' column.
df1.loc[:,'rem_sp_char'] = text_cleansing(df1.loc[:,'Lyrics'])
# Display the DataFrame as the cell output.
df1

In [None]:
import nltk
from nltk.corpus import stopwords
# Download the NLTK 'stopwords' resource.
nltk.download('stopwords')

In [None]:


def remove_stopwords(text):
    """Tokenise each lyric and drop stop words and one-character tokens.

    The stop words are NLTK's French list plus a hand-written list of extra
    words.

    Args:
        text: Iterable of strings (e.g. a pandas Series of cleaned lyrics).

    Returns:
        A list with, for each input item, the kept tokens joined by spaces.
    """
    nltk.download('punkt')
    nltk.download('stopwords')

    extra_stopwords = ["ouai","oui","no","nan","non","jsais","ca","ça","jai","cest","jsuis","si","jme","tas","ni","jte","ya","eh","oh","comme","plus","tant","plus","rien","tout","quand","ouais","ouai","trop","là","va","dla","où","san","quon","quil","quelle",
          "qujdois","davoir","skandalize","dun","sen","car","faire","fais","jirai","quelle","faut",
          "ai", "as", "a", "avons", "avez", "ont","suis", "es", "est", "sommes", "êtes", "sont","fais", "fais", "fait", "faisons", "faites", "font",
          "peux", "peut", "pouvons", "pouvez", "peuvent","dois", "dois","cette","tous","doit","jveux","jmets","devons","jfais","yeah","devez", "doivent", "vais", "vas", "va", "allons", "allez", "vont","estce","dit","quelque","jetais"]
    # A set gives constant-time membership tests, replacing one full pass
    # over the tokens per stop word.
    ignore = set(stopwords.words('french')) | set(extra_stopwords)

    clean_text = []
    for lyric in text:
        tokens = nltk.word_tokenize(lyric)
        kept = [token for token in tokens
                if len(token) > 1 and token not in ignore]
        clean_text.append(" ".join(kept))

    return clean_text
        
# Store the stop-word-free lyrics in the 'LyricsClean' column.
df1['LyricsClean'] = remove_stopwords(df1['rem_sp_char'])

In [None]:
!pip install wordcloud

In [None]:
!pip install colorthief

In [None]:
def rgb2hex(r,g,b):
    """Return the ``#rrggbb`` string for three integer colour channels.

    Each channel is written as lower-case hexadecimal, zero-padded to at
    least two digits.
    """
    channels = (r, g, b)
    return "#" + "".join(format(channel, "02x") for channel in channels)

In [None]:
from wordcloud import WordCloud, ImageColorGenerator
import numpy as np
from PIL import Image
from collections import Counter
from colorthief import ColorThief
import matplotlib.backends.backend_pdf as pdf_backend

# Function to create a word cloud and frequency graph for a given album
def create_wordcloud_and_frequency_graph(text, album_name, album_cover_url, pdf):
    """Add one page to ``pdf`` with an album's cover, word cloud and top words.

    The page has three panels: the album cover, a word cloud shaped and
    coloured after the cover, and a horizontal bar chart of the 15 most
    frequent words drawn in the cover's dominant colour.

    Args:
        text: Space-separated words of the album's lyrics.
        album_name: Album title used in the panel titles.
        album_cover_url: URL of the album cover image.
        pdf: Open ``PdfPages`` object the figure is saved into.

    Raises:
        requests.HTTPError: If the cover image cannot be downloaded.
    """
    import io
    # local import: `plt` is used below but no pyplot import is visible in
    # this notebook
    import matplotlib.pyplot as plt

    # Calculate word frequencies first: with no words there is nothing to
    # plot, and unpacking an empty most_common() list would raise.
    word_counts = Counter(text.split())
    if not word_counts:
        print(f"No words available for {album_name}; skipping it.")
        return
    words, counts = zip(*word_counts.most_common(15))

    # Download the cover once and reuse the bytes for both consumers.
    cover_response = requests.get(album_cover_url, timeout=30)
    cover_response.raise_for_status()
    cover_bytes = cover_response.content

    album_cover = Image.open(io.BytesIO(cover_bytes)).convert('RGB')
    color_thief = ColorThief(io.BytesIO(cover_bytes))
    dominant_color = rgb2hex(*color_thief.get_color(quality=1))

    # Create a word cloud masked and coloured by the cover image
    mask = np.array(album_cover)
    image_colors = ImageColorGenerator(mask)
    wordcloud = WordCloud(width=400, height=400, background_color="white", mask=mask, collocations=False).generate(text)

    plt.figure(figsize=(12, 5))

    plt.subplot(1, 3, 1)
    plt.imshow(mask, cmap=plt.cm.gray, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Album Cover for {album_name}")

    plt.subplot(1, 3, 2)
    plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Word Cloud for {album_name}")

    # Bar chart of the word frequencies
    plt.subplot(1, 3, 3)
    plt.barh(words, counts, color=dominant_color)
    plt.xlabel("Frequency")
    plt.title(f"Top 15 Most Frequent Words for {album_name}")

    plt.tight_layout()
    pdf.savefig()
    plt.close()

# Build a single PDF report with one page per album
with pdf_backend.PdfPages(f'wordcloud_plots_{ARTIST_NAME_U}.pdf') as pdf:
    for album_name, group in df1.groupby("Album Name"):
        create_wordcloud_and_frequency_graph(
            " ".join(group["LyricsClean"]),  # all cleaned lyrics of the album
            album_name,
            group["url"].iloc[0],  # cover URL taken from the album's first row
            pdf,
        )
        print("This album's analysis is done")
print("PDF generated !")