In [127]:
import pandas as pd
import numpy as np
import csv
import json
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.preprocessing import MinMaxScaler

# Download Data

## Pairwise Data
A utility function to get read the pairwise distance matrices and convert them to a 0 to 1 range

In [137]:
PATH = './data/indian/similarity/pairwise_distance_'
CLUSTER_MAP = {'north': {'musician': 'A', 'non_musician':'B'}, 'south': {'musician':'C', 'non_musician':'D'}}

def format_pair(group, region, demographic):
    df = pd.read_csv(PATH+str(group)+CLUSTER_MAP[region][demographic]+'.csv', header=None)
    return df

#format_pair(0, 'north', 'musician')/100

## Feature Data
A utility function to read the feature annotations

In [129]:
PATH = './data/indian/evaluation/evaluation_'
CLUSTER_MAP = {'north': {'musician': 'A', 'non_musician':'B'}, 'south': {'musician':'C', 'non_musician':'D'}}

def format_feature(group, region, demographic):
    df = pd.read_csv(PATH+str(group)+CLUSTER_MAP[region][demographic]+'.csv', header=None, index_col=0)
    if 14 in df.columns:
        df = df.drop(14, axis=1)
    return df

format_feature(0, 'north', 'musician')

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,100,0,50,50,50,100,50,50,0,50,100,100,50
1,100,0,50,50,100,100,0,100,50,100,50,50,0
2,0,0,50,50,50,50,0,0,50,0,0,0,0
3,100,50,50,50,50,100,50,50,0,100,100,100,50
4,0,0,50,0,50,50,100,50,50,50,0,0,100


## Feature Distance matrix
A utility function to convert the songwise features into a distance matrix within a 0 to 1 range

In [130]:
def get_feature_distance(input_data):
    data = squareform(pdist(input_data, metric='euclidean'))
    scaler = MinMaxScaler()
    scaler.fit(data)
    return scaler.transform(data)
get_feature_distance(format_feature(0, 'north', 'musician'))

array([[0.        , 0.64888568, 0.90453403, 0.30151134, 0.9459053 ],
       [0.66666667, 0.        , 0.85280287, 0.60302269, 1.        ],
       [1.        , 0.91766294, 0.        , 1.        , 0.76088591],
       [0.33333333, 0.64888568, 1.        , 0.        , 1.        ],
       [0.97182532, 1.        , 0.70710678, 0.92932038, 0.        ]])

## Save Full Distance Matrix By Region & Category

Here we create separate distance matrices for North Indian Musician, South Indian Musician, North Indian non-musician and South Indian non-musician

In [131]:
def save_full_feature_distance(region, category):
    full_matrix = format_feature(0, region, category)
    for i in range(1,6):
        full_matrix = np.concatenate([full_matrix, format_feature(i, region, category)], axis=0)
    distance_matrix = pd.DataFrame(get_feature_distance(full_matrix))
    distance_matrix.to_csv('./output/india/feature/'+region+'_'+category+'_full.csv')
save_full_feature_distance('north', 'musician')
save_full_feature_distance('south', 'musician')
save_full_feature_distance('north', 'non_musician')
save_full_feature_distance('south', 'non_musician')

## Get Average Feature Distance Matrix

In [132]:
def average_distance_matrix(input_categories, label):
    v1 = pd.read_csv('./output/india/feature/'+input_categories[0]+'.csv', index_col=0)
    v2 = pd.read_csv('./output/india/feature/'+input_categories[1]+'.csv', index_col=0)
    average = pd.concat([v1, v2]).groupby(level=0).mean()
    average.to_csv('./output/india/feature/'+label+'_average.csv')
average_distance_matrix(['north_musician_full', 'north_non_musician_full'], 'north')
average_distance_matrix(['south_musician_full', 'south_non_musician_full'], 'south')
average_distance_matrix(['south_musician_full', 'south_non_musician_full'], 'south')
average_distance_matrix(['south_musician_full', 'north_musician_full'], 'musician')
average_distance_matrix(['north_non_musician_full', 'south_non_musician_full'], 'non_musician')
average_distance_matrix(['north_average', 'north_average'], 'india')

## Get feature distances

In [133]:
feature_names = ['Ornamentation', 'Grooviness', 'Familiarity', 'Liking', 'Consonance', 'Valence', 'Excitement', 'Vocal Range', 'Sound Quality', 'Tempo', 'Rhythmic Regularity', 'Vocal Tension', 'Vocal Texture']
feature_index = 1
region = 'north'
category = 'musician'

def get_individual_feature(feature_index, region, category):
    feature_vector = format_feature(0, region, category)[[feature_index]]
    label = feature_names[feature_index-1].lower().replace(' ', '_')
    for i in range(1,6):
        feature_vector = np.concatenate([feature_vector, format_feature(i, region, category)[[feature_index]]], axis=None)
    feature_df = pd.DataFrame(feature_vector, columns=[feature_index])
    feature_distance = pd.DataFrame(get_feature_distance(feature_df))
    feature_distance.to_csv('./output/india/feature/'+label+'_'+region+'_'+category+'.csv')
#all average, north, south, musician, non-musician

for i in range(1, len(feature_names)):
    get_individual_feature(i, 'north', 'musician')
    get_individual_feature(i, 'south', 'musician')
    get_individual_feature(i, 'north', 'non_musician')
    get_individual_feature(i, 'south', 'non_musician')
    label = feature_names[i-1].lower().replace(' ', '_')
    average_distance_matrix([label+'_north_musician', label+'_south_musician'], label+'_musician')
    average_distance_matrix([label+'_north_non_musician', label+'_south_non_musician'], label+'_non_musician')
    average_distance_matrix([label+'_north_musician', label+'_north_non_musician'], label+'_north')
    average_distance_matrix([label+'_south_musician', label+'_south_non_musician'], label+'_south')
    average_distance_matrix([label+'_north_average', label+'_north_average'], label+'_india')

## Combine all partial pairwise matrices

In [187]:
def save_pairwise_partial(region, category):
    partial_matrix = np.zeros((30,30), dtype=int)
    group = 0
    for i in range(30):
        if i%5 == 0:
            partial_matrix[i:i+5, i:i+5] = format_pair(group, region, category)
            group += 1
    np.fill_diagonal(partial_matrix, 0)
    partial_df = pd.DataFrame(partial_matrix)
    partial_df.to_csv('./output/india/pairwise/'+region+'_'+category+'_partial.csv')
save_pairwise_partial('north', 'musician')
save_pairwise_partial('south', 'musician')
save_pairwise_partial('north', 'non_musician')
save_pairwise_partial('south', 'non_musician')

In [191]:
def average_distance_matrix(input_categories, label):
    v1 = pd.read_csv('./output/india/pairwise/'+input_categories[0]+'.csv', index_col=0)
    v2 = pd.read_csv('./output/india/pairwise/'+input_categories[1]+'.csv', index_col=0)
    average = pd.concat([v1, v2]).groupby(level=0).mean()
    average.to_csv('./output/india/pairwise/'+label+'_average.csv')
average_distance_matrix(['north_musician_partial', 'north_non_musician_partial'], 'north')
average_distance_matrix(['south_musician_partial', 'south_non_musician_partial'], 'south')
average_distance_matrix(['north_musician_partial', 'south_musician_partial'], 'musician')
average_distance_matrix(['north_non_musician_partial', 'south_non_musician_partial'], 'non_musician')
average_distance_matrix(['north_average', 'south_average'], 'india')