In [15]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

In [45]:
# Import data
segmentData = pd.read_csv("./data/all_segments.csv")

# Remove irrelevent columns
segmentColumnsToRemove = ['confidence', 'loudness_end', 'track_title', 'album_title', 'album_artist', 'order', 'pitches']
segmentData.drop(columns = segmentColumnsToRemove, axis = 1, inplace = True)

# Convert Track Title to unique number
segmentTrackIds = segmentData['track_ids'].astype('category') # Extract the song title column
segmentDataCodes = segmentTrackIds.cat.codes # Assign each song title to a unique number
segmentData['track_title'] = segmentDataCodes # Replace track title with unique number

# Initialise output matrix
allSongEuclidianTimbreHeadings = ['song_timbre', 'song_timbre_start', 'song_timbre_end', 'loudness_start', 'loudness_end']
allSongEuclidianTimbre = pd.DataFrame(np.zeros((max(segmentDataCodes), 5)), columns = allSongEuclidianTimbreHeadings) 

i = 0
for song in range(0, max(segmentDataCodes)):

    if i%100 == 0:
        print(str(i/4949 * 100) + ' percent complete')

    songSegmentData = segmentData.loc[segmentData['track_title'] == song] # Extract segments for one song

    # Find number of segments in first 10 secs of song
    time = 0
    numStartSegments = 0
    while time < 10:
        time += songSegmentData.iloc[numStartSegments,1]
        numStartSegments += 1

    # Find number of segments in last 10 secs of song
    time = 0
    numEndSegments = 0
    while time < 10:
        time += songSegmentData.iloc[-numStartSegments,1]
        numEndSegments += 1

    # Timbre
    timbreStr = songSegmentData['timbres'] # Extract Timbre column vector from segmentData
    timbre = pd.DataFrame(np.zeros((12, len(timbreStr)))) # Initialise timbre matrix
    # Separate out each value of the Timbre column vector, and save to a column of the timbre matrix (then transpose)
    x=0
    for row in timbreStr:
        timbre[x] = pd.DataFrame(row.split(', '))
        x+=1
    timbre = timbre.transpose()
    # Remove the square brackets from start and end of the data rows
    timbre[0] = timbre[0].str[1:]
    timbre[11] = timbre[11].str[:-1]

    timbre = timbre.astype('float64') # Convert timbre matrix 

    # Weight timbre dimensions in order accourding to importance
    weightedTimbre = pd.DataFrame(np.zeros((len(timbre), 12))) 
    for x in range(1,12):
        weightedTimbre[x] = timbre[x]*(0.9**x)

    # For each row of timbre values find the euclidean distance of the 12 variables
    euclidianTimbre = pd.DataFrame(np.zeros((1, len(timbre))))
    for x in range(len(timbre)):
        euclidianTimbre[x] = np.linalg.norm(weightedTimbre.loc[x,:])
    euclidianTimbre = euclidianTimbre.transpose()

    # Normalise the data between zero and one
    euclidianTimbre = pd.DataFrame(MinMaxScaler(feature_range=(0,1)).fit_transform(euclidianTimbre))
    # euclidianTimbre.plot.line()

    # Average for euclidian distance for song
    euclidianTimbreSong = float(np.mean(euclidianTimbre, axis=0))
    # Average for the first 10 secs of the song
    euclidianTimbreSongStart = float(np.mean(euclidianTimbre.head(numStartSegments), axis=0))
    # Average for the last 10 secs of the song
    euclidianTimbreSongEnd = float(np.mean(euclidianTimbre.tail(numEndSegments), axis=0))

    # Loudness
    songLoudness = songSegmentData['loudness_max']
    # Average for the first 10 secs of the song
    loudnessSongStart = float(np.mean(songLoudness.head(numStartSegments), axis=0))
    # Average for the last 10 secs of the song
    loudnessSongEnd = float(np.mean(songLoudness.tail(numEndSegments), axis=0))

    allSongEuclidianTimbre.loc[song,:] = [euclidianTimbreSong, euclidianTimbreSongStart, euclidianTimbreSongEnd, loudnessSongStart, loudnessSongEnd] # Export values

    i += 1

allSongEuclidianTimbre

Unnamed: 0,song_timbre,song_timbre_start,song_timbre_end,loudness_start,loudness_end
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
4943,0.0,0.0,0.0,0.0,0.0
4944,0.0,0.0,0.0,0.0,0.0
4945,0.0,0.0,0.0,0.0,0.0
4946,0.0,0.0,0.0,0.0,0.0


In [41]:
allSongEuclidianTimbre.to_csv('./data/timbre.csv')

In [42]:
data = pd.read_csv('./data/data.csv')
data = pd.concat([data, allSongEuclidianTimbre], axis=1)

data.to_csv('./data/data_timbre.csv')

In [38]:
# len(segmentData['track_ids'].unique())
data = pd.read_csv('./data/data.csv')

prev_ids = []

for id in data['id']:
    prev_ids.append(id)
    if len(np.unique(prev_ids)) != len(prev_ids):
        print(id)
        1/0

print(len(np.unique(prev_ids)))
print(len(prev_ids))


4949
4949
