In [1]:
import pandas as pd
import numpy as np
import os
from PIL import Image

In [2]:
# Read songs_dict.csv into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='songs_dict.csv')

In [3]:
# Give the 'Unnamed: 0' column the name 'ID'. rename() silently skips labels
# that are not present, so this is a no-op when the column is already 'ID'.
df = df.rename({'Unnamed: 0': 'ID'}, axis='columns')

In [4]:
# Keep only the first row for each distinct ID.
df = df.drop_duplicates(subset=['ID'], keep='first')

In [5]:
# Write the frame back to songs_dict.csv without the index column.
# Note: this overwrites the same file that was read at the top of the notebook.
df.to_csv(path_or_buf='songs_dict.csv', index=False)

In [6]:
# Columns holding a stringified list: the literal text '[]' means "no entries"
# and is replaced by None.
for _col in ('Features', 'Writers', 'Producers'):
    df.loc[:, _col] = df[_col].apply(lambda value: None if value == '[]' else value)

# Columns where a missing value (NaN) is replaced by None.
for _col in ('Album', 'Release_Date'):
    df.loc[:, _col] = df[_col].apply(lambda value: None if pd.isna(value) else value)

In [7]:
import re
def clean_features(features):
    """Extract the names from a stringified list of ``Name (digits)`` entries.

    Square brackets are removed, then every ``<name> (<digits>)`` occurrence is
    located and only the name part is kept.

    Args:
        features: String such as ``'[Drake (130), Future (2197)]'``.

    Returns:
        The names joined by ``', '``; an empty string when nothing matches.
    """
    features = features.replace('[', '').replace(']', '')
    # A name starts at a word character (Unicode letter/digit/underscore) and
    # runs, lazily, up to the "(digits)" suffix without crossing a comma or a
    # parenthesis. The previous ASCII-letters-only class truncated or dropped
    # names containing digits, accents, hyphens, '$', apostrophes, etc.
    pattern = r'(\w[^,()]*?)\s*\(\d+\)'
    names = (name.strip() for name in re.findall(pattern, features))
    return ', '.join(names)

In [8]:
def clean_release_date(release_date):
    """Return the text before the first space of ``release_date``, stripped.

    If the string contains no space, the whole string (stripped of surrounding
    whitespace) is returned.
    """
    first_token, _sep, _rest = release_date.partition(' ')
    return first_token.strip()

In [9]:
def get_name_filepath(cover_img):
    """Return the local PNG file name for a cover-image URL.

    The name is the last ``/``-separated segment of the URL, with ``.png``
    appended unless the segment already ends in ``.png``.

    Args:
        cover_img: URL (or slash-separated path) of the image.

    Returns:
        File name ending in ``.png``, without any directory part.
    """
    # The earlier implementation used ``cover_img.strip(base_url)``. str.strip
    # treats its argument as a *set of characters* to remove from both ends,
    # not as a prefix, so it also ate leading/trailing characters of the real
    # file name (e.g. 'abc123.png' became 'bc123'). Names produced by that
    # version may therefore differ from the ones returned here.
    file_name = cover_img.rsplit('/', 1)[-1]
    if not file_name.lower().endswith('.png'):
        file_name += '.png'
    return file_name

In [10]:
def _release_date_part(release_date, position):
    """Return the ``position``-th '-'-separated field of a date string.

    Returns None when the value is missing or has no such field (for example a
    year-only date has no month), instead of raising IndexError.
    """
    if pd.isna(release_date):
        return None
    parts = release_date.split('-')
    return parts[position] if position < len(parts) else None


# pd.isna() is used for every "missing" check so that both None and NaN are
# passed through untouched; an `is not None` test would hand NaN (a float) to
# the string-based cleaners.
for _col in ('Features', 'Writers', 'Producers'):
    df.loc[:, _col] = df[_col].apply(lambda x: x if pd.isna(x) else clean_features(x))

df.loc[:, 'Release_Date'] = df['Release_Date'].apply(
    lambda x: x if pd.isna(x) else clean_release_date(x)
)

# NOTE(review): clean_release_date keeps only the text before the first space,
# so multi-word album values are cut to their first word. This looks like a
# copy-paste of the Release_Date line; behaviour is left unchanged here, confirm
# whether Album should be cleaned at all.
df.loc[:, 'Album'] = df['Album'].apply(
    lambda x: x if pd.isna(x) else clean_release_date(x)
)

# Derived columns: first and second '-'-separated fields of Release_Date.
df['Release_Year'] = df['Release_Date'].apply(lambda x: _release_date_part(x, 0))
df['Release_Month'] = df['Release_Date'].apply(lambda x: _release_date_part(x, 1))

# Local file name for each cover image; rows without a URL stay missing.
df['Cover_Filepath'] = df['Cover_Img'].apply(
    lambda x: x if pd.isna(x) else get_name_filepath(x)
)

In [11]:
# Write the cleaned frame to df_cleaned.csv without the index column.
df.to_csv(path_or_buf='df_cleaned.csv', index=False)

In [12]:
def compress_png(input_file_path, output_file_path, quality=60):
    """Halve an image's width and height and save it as an optimized PNG.

    Images in mode "RGBA" or "P" are converted to "RGB" first.

    Args:
        input_file_path: Path of the image to read.
        output_file_path: Path the PNG is written to; may be the same file as
            ``input_file_path``.
        quality: Forwarded unchanged to ``Image.save``.
            NOTE(review): Pillow's documented PNG save options are
            ``optimize`` / ``compress_level``; ``quality`` is probably ignored
            for PNG output -- confirm before relying on it.
    """
    with Image.open(input_file_path) as img:
        if img.mode in ("RGBA", "P"):
            img = img.convert("RGB")
        width, height = img.size
        # Clamp to 1 pixel: integer halving of a 1-pixel side would give 0,
        # which is not a valid image size.
        new_size = (max(1, width // 2), max(1, height // 2))
        resized = img.resize(new_size)
    # Save only after the source file handle is closed, because the output
    # path may be the very file that was just read.
    resized.save(output_file_path, format='PNG', optimize=True, quality=quality)


In [13]:
import requests
from PIL import Image
from io import BytesIO
import os

def download_image(url, not_found_imgs, timeout=30):
    """Download one cover image into ``img/`` and shrink it with compress_png.

    Nothing is done when the target file already exists. Any failure (non-200
    response, network error, unreadable image data, file-system error) is
    reported on stdout and the URL is appended to ``not_found_imgs`` so the
    caller can retry or inspect it; the exception is not propagated, so a
    single bad URL does not abort a batch.

    Args:
        url: Image URL; the local file name is derived with get_name_filepath.
        not_found_imgs: List that collects the URLs that could not be stored.
            It is mutated in place.
        timeout: Seconds to wait for the HTTP request before giving up.
    """
    save_path = 'img/' + get_name_filepath(url)
    if os.path.exists(save_path):
        return

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"{response.status_code}")
            # Record the failure; previously such URLs were only printed.
            not_found_imgs.append(url)
            return

        # Create the target directory on first use instead of failing with
        # FileNotFoundError for every image.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        image = Image.open(BytesIO(response.content))
        image.save(save_path)
        compress_png(save_path, save_path)
    except (requests.RequestException, OSError):
        # OSError covers FileNotFoundError and Pillow's "cannot identify
        # image" error; RequestException covers connection/timeout problems.
        print(url)
        not_found_imgs.append(url)

In [14]:
# Unique cover URLs in order of first appearance, without missing values.
# A set() of strings has no stable order across kernel restarts (string hashes
# are randomised per process), so slicing list(set(...)) would pick a different
# batch of URLs on every run.
album_imgs = df['Cover_Img'].dropna().drop_duplicates().tolist()
not_found_imgs = []
# Download one batch of 500 URLs (positions 500-999 of the list); URLs that
# fail are collected in not_found_imgs.
for album in album_imgs[500:1000]:
    download_image(album, not_found_imgs)



400
