In [3]:
# Project root that every data path in this notebook is built from.
# The CS771_PROJECT_ROOT environment variable overrides it, so the notebook can
# run on another machine without editing this cell; the original location is
# still the default.
import os

root_path = os.environ.get(
    "CS771_PROJECT_ROOT",
    "/home/fat-fighter/Documents/cs771-project/hybrid-method/")

# Later cells build paths by plain string concatenation, so the root must end
# with a slash.
if not root_path.endswith("/"):
    root_path += "/"

## Description of Files

### Folder: features

- **tracks-mfcc.csv** - Contains already extracted mfcc features from all tracks using 30-60 seconds of tracks
- **tracks-cluster-probabilities.csv** - Contains the cluster probabilities and assignments for all tracks (based on their mfcc features)
- **timbres-cluster-probabilities.csv** - Contains the cluster probabilities and assignments for all segment timbres of all tracks
- **tracks-collective-timbres-clusters-features.csv** - Contains the extracted features of a track using its timbres' collective cluster probabilities

### Folder: million-song-subset

- **tracks-features.csv** - Contains mfcc features extracted from tracks in the MSS
- **tracks-timbres.csv** - Contains segment timbres for all tracks

### Folder: taste-profile-subset

- **songs.txt** - A list of song ids
- **users.txt** - A list of user ids
- **train-triplets.txt** - User-song-count triplets
- **song-to-tracks.txt** - A song-track id mapping

## Clustering Tracks (Based on Tracks' MFCC Features)

In [None]:
import pandas as pd
from sklearn.mixture import GaussianMixture

In [None]:
# Load the pre-extracted per-track MFCC features (tab-separated file).
# root_path already ends in "hybrid-method/", so the data directory is
# root_path + "data/..." -- the layout used by the later sections of this
# notebook. The previous path repeated the "hybrid-method/" component.
tracks_mfcc = pd.read_csv(
    root_path + "data/features/tracks-mfcc.csv", sep="\t")

In [None]:
# Settings for the track-level Gaussian mixture model.
max_iter = 1000    # upper bound on EM iterations
n_clusters = 10    # number of mixture components
n_mfcc = 13        # number of MFCC coefficients (one "av" and one "sd" column each)

In [None]:
# Diagonal-covariance GMM over the track features. The iteration cap comes from
# the `max_iter` setting defined in the configuration cell above instead of a
# repeated literal, so changing that setting actually takes effect here.
estimator = GaussianMixture(
    n_components=n_clusters,
    covariance_type='diag',
    max_iter=max_iter,
    random_state=0,  # fixed seed for reproducible clustering
)

In [None]:
# Feature column names: av1..av<n_mfcc> followed by sd1..sd<n_mfcc>.
mfcc_ids = [str(i) for i in range(1, n_mfcc + 1)]
cols = ["av" + idx for idx in mfcc_ids] + ["sd" + idx for idx in mfcc_ids]

In [None]:
# Fit the mixture model on the selected feature columns of every track.
# Left as the last expression so the fitted estimator is shown as the cell output.
estimator.fit(tracks_mfcc[cols])

In [None]:
# Soft (per-component probabilities) and hard (single label) assignments for
# every track, computed from the same feature columns used for fitting.
mfcc_features = tracks_mfcc[cols]
probs = estimator.predict_proba(mfcc_features)
clusters = estimator.predict(mfcc_features)

In [None]:
# Header of the output file: track id, one probability column per cluster
# (k1..k<n_clusters>), then the hard cluster label.
prob_cols = ["k" + str(k) for k in range(1, n_clusters + 1)]
cols = ["id"] + prob_cols + ["cluster"]

In [None]:
# Destination of the track cluster probabilities. It must be the location the
# "Computing User Features" section reads back from
# (root_path + "data/features/tracks-cluster-probabilities.csv"); the previous
# value repeated the "hybrid-method/" component that root_path already ends with.
outfile = root_path + "data/features/tracks-cluster-probabilities.csv"

In [None]:
# Write one tab-separated row per track: id, cluster probabilities, hard label.
# The first row is the header built in `cols`.
with open(outfile, "w") as out:
    out.write("\t".join(cols) + "\n")
    for track_id, track_probs, track_cluster in zip(tracks_mfcc["id"], probs, clusters):
        fields = [track_id] + list(track_probs) + [track_cluster]
        out.write("\t".join(str(field) for field in fields) + "\n")

## Mapping Users to Tracks

In [None]:
# Directory of the taste-profile files used in this section; the trailing slash
# matters because file names are appended by plain string concatenation.
local_path = root_path + "data/taste-profile-subset/"

In [None]:
# Build the song id -> list of track ids mapping.
# Each usable line is "<song_id> <track_id> [<track_id> ...]"; lines that hold
# only a song id (no mapped track) are skipped.
# NOTE(review): the file list at the top of this notebook calls this file
# "song-to-tracks.txt" -- confirm which spelling exists on disk.
songs_to_tracks = dict()
with open(local_path + "songs-to-tracks.txt", "r") as f:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in f:
        fields = line.split()  # split() already drops surrounding whitespace
        if len(fields) > 1:
            songs_to_tracks[fields[0]] = fields[1:]

In [None]:
# Expand every (user, song, count) triplet into one (user, track, count) row per
# track mapped to that song; songs without a mapping are dropped.
# Both files are managed by one `with` statement, so the output handle is
# closed even if the loop raises and the cell can be re-run as a unit
# (previously the handle was opened and closed in separate cells).
# NOTE(review): the file list at the top of this notebook mentions
# "train-triplets.txt" rather than "user-song-counts.txt" -- confirm the input name.
with open(local_path + "user-song-counts.txt", "r") as f, \
        open(local_path + "user-track-counts.txt", "w") as outfile:
    for line in f:
        fields = line.split()
        if len(fields) == 3 and fields[1] in songs_to_tracks:
            user_id, play_count = fields[0], fields[2]
            for track in songs_to_tracks[fields[1]]:
                outfile.write("\t".join([user_id, track, play_count]) + "\n")

## Computing User Features (Based on Tracks' Cluster Probabilities)

In [None]:
import numpy as np

In [None]:
# Settings for the user-feature computation.
n_clusters = 10  # length of each track's cluster-probability vector

# Base data directory; sub-paths below are appended by string concatenation.
local_path = root_path + "data/"

In [None]:
# Map each track id to its vector of cluster probabilities.
# Despite its name, `tracks_mfcc` holds cluster probabilities here, not raw
# MFCCs; the name is kept because the following cells look tracks up in it.
#
# A data row is: id, n_clusters probabilities, hard cluster label. The expected
# field count is derived from n_clusters instead of a literal 12.
n_fields = n_clusters + 2

tracks_mfcc = dict()
with open(local_path + "features/tracks-cluster-probabilities.csv", "r") as f:
    next(f, None)  # skip the header row (its fields are not numeric)
    # Iterating the file directly means a blank or malformed line is skipped by
    # the length check below; it no longer ends the read early.
    for line in f:
        fields = line.split()
        if len(fields) == n_fields:
            # Drop the id (first) and the hard label (last); keep probabilities.
            tracks_mfcc[fields[0]] = np.array([float(value) for value in fields[1:-1]])

In [None]:
# Per-user accumulators, filled by the next cell:
#   user_features     -> running sum of track cluster-probability vectors
#   user_track_counts -> number of rows added into that sum
user_features = {}
user_track_counts = {}

In [None]:
# Sum, per user, the cluster-probability vectors of the tracks listed for that
# user, and count how many rows contributed. Rows that are malformed or refer
# to a track without cluster probabilities are ignored.
with open(local_path + "taste-profile-subset/user-track-counts.txt", "r") as f:
    for raw_line in f:
        record = raw_line.split()
        if len(record) != 3:
            continue

        user_id, track_id = record[0], record[1]
        if track_id not in tracks_mfcc:
            continue

        # First row seen for this user: start from a zero vector.
        if user_id not in user_track_counts:
            user_features[user_id] = np.zeros(n_clusters)
            user_track_counts[user_id] = 0

        user_features[user_id] += tracks_mfcc[track_id]
        user_track_counts[user_id] += 1

In [None]:
# Path of the per-user feature file written by the next cell.
outfile = local_path + "features/user-features.csv"

In [None]:
# Write one tab-separated row per user: the user id followed by the mean of the
# accumulated vectors (sum divided by the number of contributing rows).
# No header row is written.
with open(outfile, "w") as out:
    for user, feature_sum in user_features.items():
        mean_features = feature_sum / float(user_track_counts[user])
        row = [user] + [str(value) for value in mean_features]
        out.write("\t".join(row) + "\n")

## Clustering Users (Based on Users' Computed Features)

In [1]:
import numpy as np
import pandas as pd

from sklearn.mixture import GaussianMixture
from sklearn.model_selection import StratifiedKFold

In [4]:
# Settings for clustering users.
n_clusters = 10                            # number of mixture components
local_path = root_path + "data/features/"  # directory of the feature files

In [5]:
# The user feature file has no header row: column 0 is the user id and the
# remaining columns are the feature values.
user_data = pd.read_csv(
    local_path + "user-features.csv",
    sep="\t",
    header=None,
)

# Every column except the first (the id) is a model input.
cols = user_data.columns[1:].tolist()
user_features = user_data[cols]

In [6]:
# Diagonal-covariance Gaussian mixture over the user features, with a fixed
# seed so repeated runs give the same clustering.
estimator = GaussianMixture(
    n_components=n_clusters,
    covariance_type='diag',
    max_iter=1000,
    random_state=0,
)

In [7]:
# Fit the mixture model on the user features.
# Left as the last expression so the fitted estimator is shown as the cell output.
estimator.fit(user_features)

GaussianMixture(covariance_type='diag', init_params='kmeans', max_iter=1000,
        means_init=None, n_components=10, n_init=1, precisions_init=None,
        random_state=0, reg_covar=1e-06, tol=0.001, verbose=0,
        verbose_interval=10, warm_start=False, weights_init=None)

In [8]:
# Hard cluster label and per-component membership probabilities for each user.
clusters = estimator.predict(user_features)
probs = estimator.predict_proba(user_features)

In [9]:
# Write one tab-separated row per user: id, cluster probabilities, hard label.
# No header row is written. The unused `count` tally was removed.
with open(local_path + "user-cluster-probabilities.csv", "w") as f:
    for i, user in enumerate(user_data[0]):
        params = [user] + list(probs[i]) + [clusters[i]]
        f.write("\t".join(str(param) for param in params) + "\n")

# Report the AIC of the fitted model on the user features.
# A single pre-formatted string is printed so the statement produces the same
# text under the Python 2 print statement and also parses as a Python 3 call.
print("\tObserved AIC Value: %s \n" % estimator.aic(user_features))

	Observed AIC Value: -42927772.7076 

