In [1]:
import pandas as pd
import time
import numpy as np

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

import umap
from sklearn.preprocessing import StandardScaler


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
token_url = "https://accounts.spotify.com/api/token"
headers = {"Content-Type": "application/x-www-form-urlencoded"}
data = {
    'grant_type': 'client_credentials',
    'client_id': 'very_secret',
    'client_secret': 'very_very_secret'
}

audio_features_url = "https://api.spotify.com/v1/audio-features"

desired_properties = [
 "acousticness",
  "danceability",
  "duration_ms",
  "energy",
  "instrumentalness",
  "key",
  "liveness",
  "loudness",
  "mode",
  "speechiness",
  "tempo",
  "time_signature",
  "valence"]

error_file_path = "c:/Users/Alina/Master-Projects/visual-data-science/project/data/errors.txt"

In [3]:
def getAccessToken():
    response = requests.post(token_url, headers=headers, data=data)

    if response.status_code == 200:
        # Successful request
        result = response.json()
        print(result)
        return result.get('access_token')
    else:
        print(f"Error: {response.status_code}")
        print(response.text)  # Print the response content for debugging

In [2]:
def read_csv_file(file_path):
    try:
        # Read CSV file into a DataFrame
        df = pd.read_csv(file_path)

        # Display the DataFrame (optional)
        print("Columns in CSV file:")
        print(df.columns)

        return df
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

In [None]:
def add_track_to_df_1(df, track_ids, access_token):
    time.sleep(1)
    auth_header = {"Authorization": "Bearer " + access_token}
    params = {"ids": track_ids}
    response = requests.get(audio_features_url, headers=auth_header, params=params)

    if response.status_code == 200:
        # Successful request
        for entry in response.json():
            data = {
            "track_id" : [entry['id']],
            "acousticness" : [entry['acousticness']],
            "danceability" :  [entry['danceability']],
            "duration_ms" : [entry['duration_ms']],
            "instrumentalness" : [entry['instrumentalness']],
            "key" : [entry['key']],
            "energy" : [entry['energy']],
            "liveness" :[entry['liveness']],
            "loudness" : [entry['loudness']],
            "mode" : [entry['mode']],
            "speechiness" : [entry['speechiness']],
            "tempo" :[entry['tempo']],
            "time_signature" : [entry['time_signature']],
            "valence" : [entry['valence']]
            }
            df = df.append(pd.DataFrame(data), ignore_index=True)
        return df, "OK"
    else:
        print(f"Error: {response.status_code}")
        print(response.text)  # Print the response content for debugging
        print(response.request)
        if response.status_code == 429:
            print(f"Appending track {track_ids} to error list. Not fetched. Stopping")
            with open(error_file_path, 'a') as file:
                for track_id in track_ids:
                    file.write(track_id + '\n')
            return df, "Error"


In [None]:
df = read_csv_file("c:/Users/Alina/Master-Projects/visual-data-science/project/data/charts.csv")
shortened_df = pd.DataFrame(columns=df.columns)
print(len(shortened_df))

In [None]:
for country in df["country"].unique():
    print(country)
    for year in range(2014, 2023):
        for month in range(1, 13):
            month_str = str(month).zfill(2)
            date_str = f"{year}/{month_str}"
            print(f"{date_str}, {country}")
            all_tracks_per_country_and_month = df.loc[(df['country'] == country) & (df["date"].str.contains(f"{date_str}/"))]
            all_tracks_per_country_and_month = all_tracks_per_country_and_month.groupby("track_id", as_index=False).agg({
                'streams': 'sum',
                'country': 'first',
                'artists': 'first',
                'artists': 'first',
                'artist_genres': 'first',
                'duration': 'first',
                'name': 'first'
            }).reset_index()
            all_tracks_per_country_and_month["date"] = date_str
            all_tracks_per_country_and_month = all_tracks_per_country_and_month.sort_values(by='streams', ascending=False)
            shortened_df = pd.concat([all_tracks_per_country_and_month, shortened_df], ignore_index=True)

In [None]:
shortened_df_top_50 =pd.DataFrame(shortened_df)
shortened_df_top_50['streams'] = pd.to_numeric(shortened_df_top_50['streams'], errors='coerce')
print(f"number of rows in shortened dataset: {len(shortened_df)}")
grouped_df = shortened_df_top_50.groupby(['date', 'country']).apply(lambda x: x.nlargest(50, 'streams')).reset_index(drop=True)
print(f"number of rows shortened dataset top 50 {len(grouped_df)}")
print(len(grouped_df["track_id"].unique()))

grouped_df.to_csv("c:/Users/Alina/Master-Projects/visual-data-science/project/data/charts_monthly_top_50.csv")

In [None]:

file_path = "c:/Users/Alina/Master-Projects/visual-data-science/project/data/track_ids_monthly_top_50.txt"
df_path = "c:/Users/Alina/Master-Projects/visual-data-science/project/data/track_detail_info_monthly_top_50.csv"
original_shortened_df_path = "c:/Users/Alina/Master-Projects/visual-data-science/project/data/charts_monthly_top_50.csv"
access_token = getAccessToken()

def add_track_to_df(df, track_ids, access_token):
    time.sleep(2)
    auth_header = {"Authorization": "Bearer " + access_token}
    params = {}
    params["ids"] = (','.join(track_ids))
    response = requests.get(audio_features_url, headers=auth_header, params=params)

    if response.status_code == 200:
        # Successful request
        for entry in response.json()['audio_features']:
            data = {
            "track_id" : [entry['id']],
            "acousticness" : [entry['acousticness']],
            "danceability" :  [entry['danceability']],
            "duration_ms" : [entry['duration_ms']],
            "instrumentalness" : [entry['instrumentalness']],
            "key" : [entry['key']],
            "energy" : [entry['energy']],
            "liveness" :[entry['liveness']],
            "loudness" : [entry['loudness']],
            "mode" : [entry['mode']],
            "speechiness" : [entry['speechiness']],
            "tempo" :[entry['tempo']],
            "time_signature" : [entry['time_signature']],
            "valence" : [entry['valence']]
            }
            df = pd.concat([pd.DataFrame(data), df], ignore_index=True)
        return df, "OK"
    else:
        print(f"Error: {response.status_code}")
        print(response.text)  # Print the response content for debugging
        print(response.request)
        if response.status_code == 429:
            print(f"Appending track {track_ids} to error list. Not fetched. Stopping")
            with open(error_file_path, 'a') as file:
                for track_id in track_ids:
                    file.write(track_id + '\n')
            return df, "Error"


df_detail_info = pd.DataFrame(columns=desired_properties)
index = 0
df = read_csv_file(original_shortened_df_path)
total_length = len(df["track_id"].unique())
with open(file_path, 'r') as file:
    track_ids = []
    for line in file:
        track_id = line.strip()
        if index == 24026:
            track_ids.append(track_id)
            print(f"requesting details for {len(track_ids)} tracks")
            df_detail_info, response = add_track_to_df(df_detail_info, track_ids, access_token)
            track_ids = []
            if response == "Error":
                print(f"Stopping at line {index}")
                break
            else:
                print(f"total processed {index}/{total_length}")

        index += 1



In [None]:
print(len(df_detail_info))
print(df_detail_info)

df_detail_info.to_csv("c:/Users/Alina/Master-Projects/visual-data-science/project/data/detail_info_monthly_top_50.csv", mode='a', header=False, index=False)

In [None]:
df_charts = read_csv_file("c:/Users/Alina/Master-Projects/visual-data-science/project/data/charts_monthly_top_50.csv")
df_detail_info = read_csv_file("c:/Users/Alina/Master-Projects/visual-data-science/project/data/detail_info_monthly_top_50.csv")
merged_df = pd.merge(df_charts, df_detail_info, on='track_id', how='left')
print(len(df_charts))
print(len(merged_df))
merged_df.to_csv("c:/Users/Alina/Master-Projects/visual-data-science/project/data/charts_with_detail.csv")

In [60]:
genres_df = read_csv_file("C:/Users/Alina/Master-Projects/visual-data-science/project/data/unique_genres.csv")
df = read_csv_file("C:/Users/Alina/Master-Projects/visual-data-science/project/data/charts_with_detail.csv")

genres_df = pd.DataFrame(genres_df, columns= ['genre', 'occurence'])
genres_df['occurence'] = 0

df = df.drop_duplicates(subset='track_id', keep='first')

for _, row in df.iterrows():
    all_genres = row['artist_genres']
    all_genres = eval(all_genres)
    for genre in all_genres:
        genres_df.loc[genres_df['genre'] == genre, 'occurence'] += 1
genres_df = genres_df.sort_values(by='occurence', ascending=False)
genres_df.to_csv("C:/Users/Alina/Master-Projects/visual-data-science/project/data/unique_genres_with_occurence.csv")

Columns in CSV file:
Index(['Unnamed: 0', 'genre'], dtype='object')
Columns in CSV file:
Index(['Unnamed: 0', 'Unnamed: 0_x', 'index', 'track_id', 'streams', 'country',
       'artists', 'artist_genres', 'duration', 'name', 'date', 'position',
       'Unnamed: 0_y', 'acousticness', 'danceability', 'duration_ms',
       'instrumentalness', 'key', 'energy', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')


In [11]:
df = read_csv_file("../data/charts_monthly_top_50.csv")
print(df['country'].unique())
selected_countries = ['at', 'de', 'gb', 'global', 'us', 'ca', 'cy', 'jp', 'mx', 'au', 'eg']
df = df[df['country'].isin(selected_countries)]
df = df.drop(columns=["Unnamed: 0"])
df_details = read_csv_file('../data/detail_info_monthly_top_50.csv')
df_details = df_details.drop(columns=["Unnamed: 0"])
df_merged = pd.merge(df, df_details, how='left', on="track_id")
df_merged = df_merged.drop(columns=["position"])
df_merged = df_merged.dropna(subset=['name'])
print(df_merged.columns)
df_merged.to_csv('../data/charts_monthly_top_50_with_details_shortened.csv')

Columns in CSV file:
Index(['Unnamed: 0', 'index', 'track_id', 'streams', 'country', 'artists',
       'artist_genres', 'duration', 'name', 'date', 'position'],
      dtype='object')
['ar' 'at' 'au' 'be' 'bg' 'ch' 'cl' 'co' 'cr' 'cz' 'de' 'dk' 'do' 'ec'
 'ee' 'es' 'fi' 'fr' 'gb' 'global' 'gr' 'gt' 'hk' 'hn' 'hu' 'ie' 'is' 'it'
 'lt' 'lu' 'lv' 'mt' 'mx' 'my' 'nl' 'no' 'nz' 'pa' 'pe' 'pl' 'pt' 'se'
 'sg' 'sk' 'sv' 'tr' 'tw' 'us' 'uy' 'py' 'bo' 'br' 'ni' 'cy' 'ph' 'ad'
 'ca' 'id' 'jp' 'th' 'il' 'ro' 'vn' 'in' 'ae' 'eg' 'ma' 'sa' 'za' 'ru'
 'ua' 'kr' 'by' 'kz' 'ng' 've' 'pk']
Columns in CSV file:
Index(['Unnamed: 0', 'track_id', 'acousticness', 'danceability', 'duration_ms',
       'instrumentalness', 'key', 'energy', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')
Index(['index', 'track_id', 'streams', 'country', 'artists', 'artist_genres',
       'duration', 'name', 'date', 'acousticness', 'danceability',
       'duration

In [16]:
def normalize_column(column):
    return (column - column.min()) / (column.max() - column.min())

reducer = umap.UMAP()
audio_features = ["acousticness","danceability","instrumentalness","energy","liveness",
                       "speechiness","valence", "key_normalized", "tempo_normalized"]

unique_songs = df_merged.drop_duplicates(subset='track_id')
unique_songs["key_normalized"] = normalize_column(unique_songs['key'])
unique_songs["tempo_normalized"] = normalize_column(unique_songs['tempo'])
unique_songs["time_signature_normalized"] = normalize_column(unique_songs['time_signature'])



features = unique_songs[audio_features].values
scaled_features = StandardScaler().fit_transform(features)
embedding = reducer.fit_transform(scaled_features)
for i in range(embedding.shape[1]):
    unique_songs[f"embedding_{i}"] = embedding[:, i]
unique_songs.to_csv('../data/charts_monthly_top_50_with_details_shortened.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_songs["key_normalized"] = normalize_column(unique_songs['key'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_songs["tempo_normalized"] = normalize_column(unique_songs['tempo'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_songs["time_signature_normalized"] = normalize_colum

In [14]:
print(unique_songs.head())

   index                track_id  streams country                 artists  \
0    5.0  18u4eXpEvQpssIWMJ6rrZF    97370      at  ['My Fearless Friend']   
1   15.0  40ia0VswpL9SUuPdfLyabK    93352      at    ['Pitbull', 'Kesha']   
2   51.0  7398yjipWAaoCGIl7ukaeX    91033      at        ['Milky Chance']   
3   31.0  5pY3ovFxbvAg7reGZjJQSp    76060      at          ['Ed Sheeran']   
4   19.0  4RXpgGM7A4Hg7cFBoH5KyF    69671      at              ['Avicii']   

                                       artist_genres  duration  \
0                                                 []    147000   
1  ['pop rap', 'pop', 'miami hip hop', 'dance pop...    204280   
2                                     ['german pop']    315693   
3                                  ['pop', 'uk pop']    299146   
4  ['pop rap', 'pop', 'pop dance', 'dance pop', '...    255093   

                   name     date  acousticness  ...  energy  liveness  \
0           Dieses Lied  2014/01      0.000055  ...   0.925    0.06