In [7]:
from urllib.request import urlopen
import os
import random
import operator

# Maps each download URL to the file name it is saved under inside
# DESTINATION_FOLDER.
FILES = {
    'https://www.dropbox.com/s/4tffm9iyn93rws2/list_albums.txt?dl=1' : 'list_albums.txt',
    'https://www.dropbox.com/s/0lm8r5brgbjudsb/list_songs.txt?dl=1' : 'list_songs.txt',
    'https://www.dropbox.com/s/tw7v1niidyakqnf/list_features.txt?dl=1' : 'list_features.txt'
}

# Directory that receives the downloaded files and every file derived from
# them (labels.txt and the per-task feature/label/index files).
DESTINATION_FOLDER = "/usr/app/data"

def get_class_distribution(labels, target_idx, n_classes):
    """Return the most frequent classes found in one field of the label lines.

    Args:
        labels: Iterable of label lines whose fields are separated by ``|``.
        target_idx: Zero-based index of the field to tally.
        n_classes: Maximum number of classes to return.

    Returns:
        A list of ``(class_value, count)`` tuples sorted by descending count
        and truncated to ``n_classes`` entries. ``class_value`` is the
        whitespace-stripped field text. Classes with equal counts keep the
        order in which they first appear in ``labels``.

    Raises:
        IndexError: If a line has no field at ``target_idx``.
    """
    # Tally in a single pass. The dict preserves first-seen order, which
    # (together with the stable sort below) makes tie-breaking deterministic
    # instead of depending on set iteration order.
    class_distribution = {}
    for target in labels:
        target_class = target.split("|")[target_idx].strip()
        class_distribution[target_class] = class_distribution.get(target_class, 0) + 1

    ranked = sorted(class_distribution.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0:n_classes]

def sample_songs(label_idx, target, all_features, all_labels, n=20):
    """Randomly pick up to ``n`` entries whose label field equals ``target``.

    Args:
        label_idx: Zero-based index of the ``|``-separated label field to match.
        target: Class value to select; compared as ``str(target)`` against the
            whitespace-stripped field text.
        all_features: Sequence of feature lines, indexed in parallel with
            ``all_labels``.
        all_labels: Sequence of label lines with ``|``-separated fields.
        n: Maximum number of entries to draw (default 20).

    Returns:
        A tuple ``(selected_features, selected_labels, sampled_idxs)``:
        the stripped feature lines of the drawn entries, ``target`` repeated
        once per drawn entry, and the positions of the drawn entries, all in
        the order produced by ``random.sample``.
    """
    wanted = str(target)

    # Positions of every entry that belongs to the requested class.
    candidates = []
    for position, label in enumerate(all_labels):
        if label.split("|")[label_idx].strip() == wanted:
            candidates.append(position)

    # Draw without replacement; never ask for more than is available.
    sampled_idxs = random.sample(candidates, min(n, len(candidates)))
    selected_features = [all_features[position].strip() for position in sampled_idxs]
    selected_labels = [target] * len(sampled_idxs)
    return selected_features, selected_labels, sampled_idxs
    

# Create the destination folder if needed. exist_ok avoids the race between
# a separate existence check and the creation call.
os.makedirs(DESTINATION_FOLDER, exist_ok=True)

for file_url, file_name in FILES.items():
    print("Downloading file %s" %  file_name)

    # Fetch the complete payload before opening the target file so that a
    # failed download does not leave a truncated file behind. The context
    # manager closes the HTTP response.
    with urlopen(file_url) as response:
        payload = response.read()

    with open(os.path.join(DESTINATION_FOLDER, file_name), 'wb') as f:
        f.write(payload)

print()
print("All folders and files were downloaded and stored in %s" % DESTINATION_FOLDER)
print()

#### Extracting labels for each classification target
# Writes one line per song to labels.txt in the form
# "<year> | <decade class> | <artist index> | <album index>".
# artists_list / albums_list hold the distinct names in first-seen order, so a
# name's position in the list is the index written to the label file.
artists_list = []
albums_list = []
# Name -> index lookups mirroring the two lists; they replace the linear
# "in list" / list.index() scans that made the loop quadratic.
_artist_ids = {}
_album_ids = {}
with open(os.path.join(DESTINATION_FOLDER, "labels.txt"), 'w') as label_file, \
        open(os.path.join(DESTINATION_FOLDER, "list_songs.txt")) as songs_file:
    for entry in songs_file:
        song_info = entry.split("|")
        year = int(song_info[0])
        # Number of whole decades since 1940.
        decade_class = (year - 1940) // 10

        # The second "|" field is split on "-": part 1 is taken as the artist,
        # and everything from part 2 onwards (re-joined, since it may itself
        # contain "-") as the album.
        name_parts = song_info[1].split("-")
        artist = name_parts[1].strip()
        album = '-'.join(name_parts[2:]).strip()

        if artist not in _artist_ids:
            _artist_ids[artist] = len(artists_list)
            artists_list.append(artist)
        if album not in _album_ids:
            _album_ids[album] = len(albums_list)
            albums_list.append(album)

        label_file.write("%i | %i | %s | %s\n" % (year, decade_class, _artist_ids[artist], _album_ids[album]))
      
print("Sampling features and labels for each targeted classification task")
print()
# Fixed seed so that repeated runs draw the same samples.
random.seed(0)
### Deriving balanced samples with limited number of classes for each attribute
# Both dicts are keyed by the label-field index of the task:
# how many of the most frequent classes to keep, and how many entries to draw
# (at most) for each kept class.
classes_per_attr_idx = {1: 7, 2: 20, 3: 20}
samples_per_attr_idx = {1: 425, 2: 200, 3: 25}

# The inputs are the same for every task, so read them once up front instead
# of re-reading both files on each iteration.
with open(os.path.join(DESTINATION_FOLDER, 'list_features.txt')) as features:
    all_features = features.readlines()
with open(os.path.join(DESTINATION_FOLDER, 'labels.txt')) as labels:
    all_labels = labels.readlines()

for i in [1, 2, 3]:
    with open(os.path.join(DESTINATION_FOLDER, 'features_task_%i.txt' % (i)), 'w') as features_task_i, \
            open(os.path.join(DESTINATION_FOLDER, 'labels_task_%i.txt' % (i)), 'w') as labels_task_i, \
            open(os.path.join(DESTINATION_FOLDER, 'idxs_task_%i.txt' % (i)), 'w') as idxs_task_i:
        top_songs_for_attribute = get_class_distribution(all_labels, i, classes_per_attr_idx[i])
        class_count = 0
        entry_count = 0
        for attribute_class in [item[0] for item in top_songs_for_attribute]:
            class_count += 1
            sampled_features, sampled_labels, sampled_idxs = sample_songs(i, attribute_class, all_features, all_labels, n=samples_per_attr_idx[i])
            # The three output files are written in lockstep: line k of each
            # file describes the same sampled entry.
            for feature in sampled_features:
                entry_count += 1
                features_task_i.write('%s\n' % str(feature))
            for label in sampled_labels:
                labels_task_i.write('%s\n' % str(label))
            for idx in sampled_idxs:
                idxs_task_i.write('%i\n' % idx)
    print("%i features (for %i classes) sampled for classification task %i" % (entry_count, class_count, i))

print()
print("Sampling finished!")

Downloading file list_albums.txt
Downloading file list_features.txt
Downloading file list_songs.txt

All folders and files were downloaded and stored in /usr/app/data

Sampling features and labels for each targeted classification task

2975 features (for 7 classes) sampled for classification task 1
4000 features (for 20 classes) sampled for classification task 2
489 features (for 20 classes) sampled for classification task 3

Sampling finished!
