In [None]:
'''
author: Alexander Staub
Date: 2025-04-21
Description: Short script to get a 150 song sample from the dataset of songs we have acoustic characteristics for already. 

'''

In [3]:
#load packages to read csv and to manipulate dataframes
import pandas as pd
import numpy as np
import random
import os #to get the current working directory
import json


In [2]:
# Resolve the project root: two directory levels above the current working directory.
base_dir = os.path.abspath(os.path.join(os.getcwd(), '../..'))

# Read the shared sample (ivan_sample_full.csv) from
# data/interim_data/Ivan_spotify_sample under the project root.
ivan_sample_csv_path = os.path.join(
    base_dir, 'data', 'interim_data', 'Ivan_spotify_sample', 'ivan_sample_full.csv'
)
df_ivan_shared = pd.read_csv(ivan_sample_csv_path)

In [6]:
# Load the JSON file that Ivan returned.
# The encoding is passed explicitly: without it open() falls back to a
# platform-dependent default, which can mis-decode non-ASCII text in the file.
returned_json_path = os.path.join(
    base_dir, 'data','raw_data', 'Ivan_soundstat','sample_tracks_analyzed.json'
)
with open(returned_json_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

In [7]:
# Entries stored under the 'successful' key of the returned JSON.
ivan_returned_full = json_data['successful']


def _track_to_row(track_data):
    """Flatten one entry's 'data' dict into a single flat row.

    Parameters
    ----------
    track_data : dict
        The 'data' dict of one entry; must contain a 'features' key.

    Returns
    -------
    dict
        'Spotify ID', 'Title' and 'Artist' (artists joined with ', '),
        followed by the 'features' content flattened by pd.json_normalize
        with '_' as the separator for nested keys.
    """
    return {
        'Spotify ID': track_data.get('id'),
        'Title': track_data.get('name'),
        # `or []` also covers an explicit null value for 'artists'; the
        # default of .get() only applies when the key is missing entirely.
        'Artist': ', '.join(track_data.get('artists') or []),
        **pd.json_normalize(track_data['features'], sep='_').iloc[0].to_dict(),
    }


# Keep only entries whose 'data' is a dict that contains 'features'.
valid_tracks = [
    _track_to_row(t['data'])
    for t in ivan_returned_full
    if isinstance(t.get('data'), dict) and 'features' in t['data']
]

# Convert the flat rows to a DataFrame (one row per track).
df_ivan_returned = pd.DataFrame(valid_tracks)

In [9]:
# Add 2 example tracks to the df_ivan_returned dataset:
# the first one has 0s for all features, the second one has 1s for all features.
_example_ids = ['0', '1']
_feature_columns = [
    col for col in df_ivan_returned.columns
    if col not in ['Spotify ID', 'Title', 'Artist']
]

_example_tracks = pd.DataFrame([
    {
        'Spotify ID': '0',
        'Title': '0s track',
        'Artist': '0s artist',
        **{col: 0 for col in _feature_columns}
    },
    {
        'Spotify ID': '1',
        'Title': '1s track',
        'Artist': '1s artist',
        **{col: 1 for col in _feature_columns}
    }
])

# This cell overwrites its own input, so running it twice would append the
# example tracks twice. Dropping any example rows that are already present
# makes a re-run produce the same frame.
_returned_without_examples = df_ivan_returned
if 'Spotify ID' in _returned_without_examples.columns:
    _returned_without_examples = _returned_without_examples[
        ~_returned_without_examples['Spotify ID'].isin(_example_ids)
    ]

# ignore_index=True renumbers the rows; otherwise the two appended rows keep
# labels 0 and 1 and duplicate the labels of the first two existing rows.
df_ivan_returned = pd.concat(
    [_returned_without_examples, _example_tracks],
    ignore_index=True,
)


In [10]:
# Restrict the shared dataset to the rows whose Spotify ID also occurs in the returned dataset.
returned_id_mask = df_ivan_shared['Spotify ID'].isin(df_ivan_returned['Spotify ID'])
df_ivan_shared_subset = df_ivan_shared.loc[returned_id_mask]

In [None]:
# Reduce both datasets to the columns whose names occur in both of them.
shared_common_columns = df_ivan_shared_subset.columns.intersection(df_ivan_returned.columns)
df_ivan_shared_subset = df_ivan_shared_subset[shared_common_columns]

# The shared subset has already been reduced above, so this intersects the
# returned columns with the common column set.
returned_common_columns = df_ivan_returned.columns.intersection(df_ivan_shared_subset.columns)
df_ivan_returned_subset = df_ivan_returned[returned_common_columns]


In [19]:
# Calculate, for each feature column (everything except Spotify ID, Title and
# Artist), the correlation between the shared dataset and the returned dataset.
# The result is a dict mapping feature name -> correlation coefficient.
correlation_matrix = {}

# Series.corr pairs the values of the two Series by index label. The row
# labels of the two frames are unrelated to each other, so both frames are
# re-indexed by Spotify ID first; each song is then compared with itself.
# Repeated Spotify IDs are collapsed to their first occurrence so that the
# ID index is unique on both sides.
shared_by_id = (
    df_ivan_shared_subset
    .drop_duplicates(subset='Spotify ID')
    .set_index('Spotify ID')
)
returned_by_id = (
    df_ivan_returned_subset
    .drop_duplicates(subset='Spotify ID')
    .set_index('Spotify ID')
)

for c in shared_by_id.columns:
    # 'Spotify ID' is the index now; Title and Artist are labels, not features.
    if c in ['Title', 'Artist']:
        continue
    correlation_matrix[c] = shared_by_id[c].corr(returned_by_id[c])