In [259]:
import pandas as pd
import numpy as np
import os
from PIL import Image

In [260]:
# Load songs_dict.csv into the working DataFrame used by the rest of the notebook.
df = pd.read_csv('songs_dict.csv')

In [261]:
# Rename the 'Unnamed: 0' column to 'ID'.
# NOTE(review): presumably this is an index column written by an earlier to_csv — confirm.
df = df.rename(columns={'Unnamed: 0': 'ID'})

In [262]:
# Keep only the first row seen for each ID value.
df = df.drop_duplicates('ID', keep='first')

In [263]:
# NOTE(review): this writes back to the same path that was read at the top of the
# notebook, replacing the input file with the renamed, de-duplicated table.
df.to_csv('songs_dict.csv', index=False)

In [264]:
# Represent "no value" uniformly as None before the cleaning helpers run.
# List-like columns: the literal string '[]' becomes None.
for list_col in ('Features', 'Writers', 'Producers'):
    df.loc[:, list_col] = df[list_col].apply(lambda cell: None if cell == '[]' else cell)
# Scalar columns: pandas missing values become None.
for scalar_col in ('Album', 'Release_Date'):
    df.loc[:, scalar_col] = df[scalar_col].apply(lambda cell: None if pd.isna(cell) else cell)

In [265]:
import re
def clean_list(list):
    """Extract names from a stringified list and join them with ', '.

    Square brackets and the '(ITA)' tag are removed, and ' & ' / ' x '
    separators are turned into ', '. A name is then kept only when it is a
    run of word characters, whitespace or characters in the À-ÿ range that
    is followed by a parenthesised number; that number is discarded.

    Args:
        list: The raw string to clean.

    Returns:
        The extracted names, stripped of surrounding whitespace and joined
        with ', ' (an empty string when nothing matches).
    """
    text = list
    # Replacements are applied in this exact order.
    for old, new in (('[', ''), (']', ''), ('(ITA)', ''), (' & ', ', '), (' x ', ', ')):
        text = text.replace(old, new)
    name_with_id = re.compile(r'([\w\sÀ-ÿ]+)\s*\(\d+\)')
    return ', '.join(found.strip() for found in name_with_id.findall(text))

In [266]:
def clean_release_date(release_date):
    """Return the text before the first space in `release_date`, whitespace-trimmed.

    Args:
        release_date: A string; anything after its first space is dropped.

    Returns:
        The leading portion of the string with surrounding whitespace removed.
    """
    leading_part, _, _ = release_date.partition(' ')
    return leading_part.strip()

In [267]:
def get_name_filepath(cover_img):
    """Derive a local PNG file name from a cover-image URL.

    The name is the last '/'-separated segment of the URL. '.png' is appended
    unless the segment already ends with '.png' (case-insensitive), so the
    result always carries a PNG extension.

    The previous implementation called ``cover_img.strip(base_url)``, but
    ``str.strip`` removes any of the given *characters* from both ends rather
    than a prefix, which ate leading and trailing characters of the file name.

    Args:
        cover_img: URL (or path) of the cover image.

    Returns:
        The file name to use for the locally saved image.
    """
    file_name = cover_img.rsplit('/', 1)[-1]
    if file_name.lower().endswith('.png'):
        return file_name
    return file_name + '.png'

In [268]:
# Apply the cleaning helpers column by column. Missing cells are detected with
# pd.notna/pd.isna so that both None and NaN are skipped (and stored as None)
# instead of being passed to the string helpers.
for credit_col in ('Features', 'Writers', 'Producers'):
    df.loc[:, credit_col] = df[credit_col].apply(lambda x: clean_list(x) if pd.notna(x) else None)
df.loc[:, 'Release_Date'] = df['Release_Date'].apply(lambda x: clean_release_date(x) if pd.notna(x) else None)
# Keep only the album title that precedes the first ' (' suffix.
df.loc[:, 'Album'] = df['Album'].apply(lambda x: x.split(' (')[0] if pd.notna(x) else None)
# Use ', ' as the single separator between multiple main artists.
df.loc[:, 'Artist'] = df['Artist'].apply(lambda x: x.replace(' & ', ', ').replace(' x ', ', ') if pd.notna(x) else None)
# Year and month are the first two '-'-separated fields of the cleaned date.
df['Release_Year'] = df['Release_Date'].apply(lambda x: None if pd.isna(x) else x.split('-')[0])
# A date without a second field (e.g. year only) gets no month instead of raising IndexError.
df['Release_Month'] = df['Release_Date'].apply(lambda x: x.split('-')[1] if pd.notna(x) and x.count('-') >= 1 else None)
df['Cover_Filepath'] = df['Cover_Img'].apply(lambda x: get_name_filepath(x) if pd.notna(x) else None)

In [201]:
# Save the cleaned table to df_cleaned.csv without writing the DataFrame index.
df.to_csv('df_cleaned.csv', index=False)

### Getting album cover images

In [12]:
def compress_png(input_file_path, output_file_path, quality=60):
    """Halve an image's dimensions and save it as an optimised PNG.

    Images in "RGBA" or "P" mode are converted to "RGB" first. Each dimension
    is halved with integer division but never drops below 1 pixel, so very
    small images no longer make ``resize`` fail with a zero-sized target.

    Args:
        input_file_path: Path of the image to read.
        output_file_path: Path the PNG is written to (may equal the input path).
        quality: Forwarded to ``Image.save``.
            NOTE(review): Pillow's documented PNG save options do not include
            ``quality``, so this probably has no effect — confirm.
    """
    with Image.open(input_file_path) as img:
        if img.mode in ("RGBA", "P"):
            img = img.convert("RGB")
        width, height = img.size
        # Clamp to 1 so a 1-pixel dimension does not become 0.
        new_size = (max(1, width // 2), max(1, height // 2))
        img = img.resize(new_size)
        img.save(output_file_path, format='PNG', optimize=True, quality=quality)


In [13]:
import requests
from PIL import Image
from io import BytesIO
import os

def download_image(url, not_found_imgs):
    """Download a cover image into ``img/`` and shrink it, skipping existing files.

    The target file name comes from ``get_name_filepath(url)``. Nothing is
    done when that file already exists. Any failure — a non-200 response, a
    network error or timeout, an undecodable image, or a file-system error —
    is reported on stdout and the URL is appended to ``not_found_imgs``;
    no exception is propagated for those cases.

    Args:
        url: Address of the image to fetch.
        not_found_imgs: List that collects the URLs that could not be saved
            (mutated in place).
    """
    save_path = 'img/' + get_name_filepath(url)
    if os.path.exists(save_path):
        return
    try:
        # Make sure the target directory exists before writing into it.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        # A timeout keeps one unresponsive host from blocking the whole batch.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"{response.status_code}")
            not_found_imgs.append(url)
            return
        image = Image.open(BytesIO(response.content))
        image.save(save_path)
        compress_png(save_path, save_path)
    except (requests.RequestException, OSError):
        # OSError also covers FileNotFoundError and Pillow's unreadable-image error.
        print(url)
        not_found_imgs.append(url)

In [14]:
# Unique cover URLs in order of first appearance. Going through a set made the
# order of the string URLs vary between kernel sessions, so a fixed slice did
# not select the same batch on every run.
album_imgs = df['Cover_Img'].dropna().drop_duplicates().tolist()
not_found_imgs = []
# Only the URLs at positions 500-999 are processed by this cell.
# NOTE(review): presumably the downloads were run in manual batches — confirm.
for album in album_imgs[500:1000]:
    download_image(album, not_found_imgs)



400


### Reducing df to rappers

In [307]:
def filter_hh(artist, hip_hop_list):
    """Flag an artist string that names at least one artist in `hip_hop_list`.

    Args:
        artist: Names separated by ', ', or None.
        hip_hop_list: Collection of names to look for.

    Returns:
        True when any of the ', '-separated names is in `hip_hop_list`;
        otherwise None (never False), including when `artist` is None.
    """
    if artist is None:
        return None
    if any(name in hip_hop_list for name in artist.split(', ')):
        return True
    return None

In [316]:
# Read the reference names from hip-hop.txt, one per line, trimmed of whitespace.
hip_hop_list = []
with open('hip-hop.txt', 'r') as hiphop_file:
    hip_hop_list.extend(raw_line.strip() for raw_line in hiphop_file)

# Turn each credit cell into True/None with filter_hh, drop the rows where both
# columns are None, and use the surviving index to select the rows of df.
filtered_df = df[['Artist', 'Features']].copy()
for credit_col in ('Artist', 'Features'):
    filtered_df[credit_col] = filtered_df[credit_col].apply(filter_hh, hip_hop_list=hip_hop_list)
filtered_df = filtered_df.dropna(how='all')
hh_df = df.loc[filtered_df.index]

In [384]:
def find_collab(df, artist1, artist2):
    """Report whether two artists are credited together on any row of `df`.

    A row credits an artist when the name appears in its 'Artist' or
    'Features' value, both of which are ', '-separated strings. Missing
    values (None/NaN) contribute no names; previously a row without features
    reused the feature list of the preceding row, or raised NameError when it
    was the first row.

    Args:
        df: DataFrame with 'Artist' and 'Features' columns.
        artist1: First artist name.
        artist2: Second artist name.

    Returns:
        True if some row credits both artists, otherwise False.
    """
    def _names(cell):
        # Non-string cells (None, NaN) mean "no credits".
        return cell.split(', ') if isinstance(cell, str) else []

    for _, row in df.iterrows():
        # Both artists must appear somewhere among the row's main and featured
        # credits; this is equivalent to the four main/feature combinations.
        credited = set(_names(row['Artist'])) | set(_names(row['Features']))
        if artist1 in credited and artist2 in credited:
            return True

    return False

In [394]:
# Map every listed artist to the set of other listed artists credited on the
# same song (as main artist or feature). Built in one pass over hh_df instead
# of scanning the whole frame once per artist pair, and names are compared by
# value (set arithmetic) rather than by object identity.
hip_hop_set = set(hip_hop_list)
dict_collab = {artist: set() for artist in hip_hop_list}
for main_credit, feature_credit in zip(hh_df['Artist'], hh_df['Features']):
    credited = set()
    for credit in (main_credit, feature_credit):
        # Missing credits (None/NaN) are not strings and are skipped.
        if isinstance(credit, str):
            credited.update(credit.split(', '))
    credited &= hip_hop_set
    for artist in credited:
        dict_collab[artist] |= credited - {artist}

In [377]:
import random
def find_axis(hip_hop_list, df, dict_collab, max_attempts=1000):
    """Pick two disjoint groups of three artists whose cross pairs all collaborated.

    NOTE(review): the original cell was unfinished (``dict_collab[]`` does not
    parse and its ``while`` loop never added to ``y_sel``). This completion
    assumes the intent was: draw three "x" artists, then three "y" artists
    from the remaining ones such that every y artist is in the collaboration
    set of every x artist — confirm.

    Args:
        hip_hop_list: Artist names to draw from.
        df: Unused; kept so existing calls keep working.
        dict_collab: Mapping of artist name to the set of its collaborators.
        max_attempts: Number of random x draws to try before giving up.

    Returns:
        A tuple ``(x_sel, y_sel)`` of two lists of three names each, or None
        when no valid combination was found within `max_attempts` draws.

    Raises:
        ValueError: If `hip_hop_list` has fewer than three distinct names.
    """
    pool = list(dict.fromkeys(hip_hop_list))  # de-duplicate, keep order
    for _ in range(max_attempts):
        x_sel = random.sample(pool, 3)
        # Artists that collaborated with all three x artists.
        common = set.intersection(*(set(dict_collab.get(name, ())) for name in x_sel))
        candidates = [elem for elem in pool if elem not in x_sel and elem in common]
        if len(candidates) >= 3:
            return x_sel, random.sample(candidates, 3)
    return None

# Print whatever find_axis returns for the hip-hop subset and its collaboration map.
print(find_axis(hip_hop_list, hh_df, dict_collab))

KeyboardInterrupt: 