In [1]:

import csv
import os

import pandas as pd

# Dataset locations, relative to the notebook's working directory.
DATA_DIR = "data"
TESTFILE = os.path.join(DATA_DIR, "testTrack_hierarchy.txt")  # pipe-delimited test input
TEST_OUTPUT = os.path.join(DATA_DIR, "testTracks.csv")  # destination for write_test_tracks
TRAINFILE = os.path.join(DATA_DIR, "trainItem.csv")  # comma-delimited training input


In [2]:

def typemap(x):
    """Convert a raw text field to ``int``; the literal ``"None"`` becomes ``None``."""
    if x == "None":
        return None
    return int(x)

def process_train_line(line):
    """Parse one comma-separated line into a list of integers.

    Leading/trailing whitespace (including the newline) is stripped before
    splitting. Every field must be a valid integer literal, otherwise
    ``int`` raises ``ValueError``.
    """
    return [int(field) for field in line.strip().split(",")]

def process_test_line(line):
    """Parse one pipe-delimited test line into typed fields.

    The line is split on ``"|"`` into ``user|track|album|artist[|genre...]``.
    Album and artist may be the literal text ``"None"``.

    Returns:
        list: ``[user_id, track_id, album_id, artist_id, genres]`` where
        ``user_id`` and ``track_id`` are ``int``, ``album_id`` and
        ``artist_id`` are ``int`` or ``None``, and ``genres`` is a list of
        ``int`` or ``None`` when the line has no genre fields.

    Raises:
        ValueError: if a field is not an integer literal (or ``"None"`` where
            allowed), or if fewer than two fields are present.
        IndexError: if the album or artist field is missing.
    """
    def _optional_int(field):
        # Same rule as the module-level ``typemap`` helper, kept local so the
        # parser does not depend on another cell having been run.
        return None if field == "None" else int(field)

    fields = line.strip().split("|")

    # Parse the IDs as integers: the training data is parsed to ints, so
    # string IDs here would never compare equal to (or merge with) them.
    user_id, track_id = (int(field) for field in fields[:2])
    album_id = _optional_int(fields[2])
    artist_id = _optional_int(fields[3])

    # Everything after the fourth field is a genre ID; absent -> None.
    genres = [int(field) for field in fields[4:]] if len(fields) > 4 else None

    return [user_id, track_id, album_id, artist_id, genres]
    
def write_test_tracks(filename, output):
    """Convert the pipe-delimited test file into a CSV file.

    Each line of ``filename`` is parsed with ``process_test_line`` and written
    to ``output`` as one CSV row of five stringified fields. Missing values
    are written as the text ``None``.

    The rows go through ``csv.writer`` rather than a plain ``",".join``: the
    genre list's string form contains commas, which would otherwise spill
    into extra, unquoted columns and make the file unparseable as CSV.

    Args:
        filename: path of the pipe-delimited input file.
        output: path of the CSV file to create (overwritten if it exists).
    """
    # Open the input first so a missing input does not truncate an existing
    # output file. newline="" lets the csv module control line endings.
    with open(filename, "r") as src, open(output, "w", newline="") as dst:
        writer = csv.writer(dst, lineterminator="\n")
        for line in src:
            # str() keeps None as the text "None"; the writer adds quotes
            # around any field that contains a comma.
            writer.writerow([str(value) for value in process_test_line(line)])
                
def read_test_tracks(filename, cols=["User ID", "Track", "Album", "Artist", "Genre(s)"]):
    """Load the pipe-delimited test file into a DataFrame.

    Every line of ``filename`` is parsed with ``process_test_line``; the
    parsed rows become the frame's records, labelled with ``cols``.
    """
    with open(filename, "r") as handle:
        rows = [process_test_line(raw_line) for raw_line in handle]
    return pd.DataFrame(rows, columns=cols)

def read_train_tracks(filename, cols=["User ID", "Track", "Rating"]):
    """Load the comma-separated, header-less training file into a DataFrame.

    Parsing is delegated to ``pd.read_csv`` instead of building a Python list
    of lists line by line, which avoids holding every row as Python objects
    before the frame is built. Blank lines are skipped rather than raising.

    Args:
        filename: path of the comma-separated file; all fields are integers.
        cols: column labels, one per field.

    Returns:
        pd.DataFrame: one row per non-blank line, ``int64`` columns named by
        ``cols``. An empty file yields an empty frame with those columns.

    Raises:
        ValueError: if a field cannot be parsed as an integer.
    """
    try:
        return pd.read_csv(
            filename,
            header=None,        # the file has no header row
            names=list(cols),
            index_col=False,    # never promote a leading column to the index
            dtype="int64",      # reject non-integer data instead of guessing a type
        )
    except pd.errors.EmptyDataError:
        # Preserve the original contract for an empty file.
        return pd.DataFrame([], columns=cols)


In [3]:

# Export the parsed test tracks to CSV, then load the same source file into a DataFrame.
write_test_tracks(filename=TESTFILE, output=TEST_OUTPUT)
test_df = read_test_tracks(filename=TESTFILE)
test_df


Unnamed: 0,User ID,Track,Album,Artist,Genre(s)
0,1,188135,,,"[158282, 242383, 207648, 279143]"
1,1,250273,39718.0,141075.0,"[131552, 61215, 17453, 35389, 256783, 47898, 1..."
2,1,60428,224404.0,79500.0,"[158282, 242383, 207648, 110478, 47898, 280261..."
3,1,187953,206461.0,91679.0,"[61215, 17453, 199606, 144378, 88853]"
4,1,108088,18130.0,,"[198263, 61215, 34486, 99463]"
...,...,...,...,...,...
94285,38465,54238,110520.0,246889.0,"[61215, 17453, 35389, 253120]"
94286,38465,228598,32833.0,22935.0,"[33722, 176858, 224280]"
94287,38465,217068,166623.0,165176.0,"[17453, 35389, 189467]"
94288,38465,257173,2203.0,86068.0,[275144]


In [4]:

# Load the training file into a DataFrame with explicit column labels.
train_df = read_train_tracks(filename=TRAINFILE, cols=["User ID", "Track", "Rating"])
train_df


Unnamed: 0,User ID,Track,Rating
0,199808,248969,90
1,199808,2663,90
2,199808,28341,90
3,199808,42563,90
4,199808,59092,90
...,...,...,...
12403570,249011,270557,90
12403571,249011,273574,90
12403572,249011,286938,90
12403573,249011,287681,80
