Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 167 lines (137 sloc) 6.151 kb
99fc1ed @plamere Improved docs
plamere authored
1
""" Processes track data from the Million Song Dataset. Specifically, this
    file contains functions that load the flat-file format of tracks for the
    MSD. The format is one track per line, where each line consists of 54
    tab-separated fields as described here:

      http://labrosa.ee.columbia.edu/millionsong/pages/field-list

    except that in the flat file format, the 'track id' field has been moved
    from field 52 to the first field.

    A track is represented as a dictionary.
"""
14
f5bc75b @plamere first commit
plamere authored
15 import sys
16 import pprint
17
_FIELD_COUNT = 54   # tab-separated fields expected on every track line
_PITCH_COUNT = 12   # pitch values consumed per segment
_TIMBRE_COUNT = 12  # timbre values consumed per segment


def _timed_events(starts, confidences):
    """ Pairs comma-separated start times with their confidences.

        Returns a list of (start, confidence) float tuples. Entries whose
        start is an empty string (as produced by an empty field) are skipped.
    """
    return [(float(start), float(conf))
            for start, conf in zip(starts.split(','), confidences.split(','))
            if start]


def _load_segments(f, track_duration):
    """ Builds the list of segment dictionaries from fields 35 to 41.

        f is the list of raw string fields of one track line and
        track_duration is the already parsed duration of the track, used as
        the end of the last segment.
    """
    confidence = f[35].split(',')
    loudness_max = f[36].split(',')
    loudness_max_time = f[37].split(',')
    loudness_start = f[38].split(',')
    pitches = f[39].split(',')
    # The final start entry is always dropped.
    # NOTE(review): presumably the field ends with a trailing comma - confirm
    starts = f[40].split(',')[:-1]
    timbre = f[41].split(',')

    segments = []
    last = len(starts) - 1
    for i, sstart in enumerate(starts):
        if not sstart:
            continue
        start = float(sstart)
        seg = {
            'start': start,
            'confidence': float(confidence[i]),
            'loudness_max': float(loudness_max[i]),
            'loudness_max_time': float(loudness_max_time[i]),
            'loudness_start': float(loudness_start[i]),
            # pitches and timbre are flat lists holding 12 values per segment
            'pitch': [float(p) for p in
                      pitches[i * _PITCH_COUNT:(i + 1) * _PITCH_COUNT]],
            'timbre': [float(p) for p in
                       timbre[i * _TIMBRE_COUNT:(i + 1) * _TIMBRE_COUNT]],
        }
        # A segment lasts until the next segment starts; the last one lasts
        # until the end of the track.
        if i < last:
            seg['duration'] = float(starts[i + 1]) - start
        else:
            seg['duration'] = track_duration - start
        segments.append(seg)
    return segments


def load_track(line):
    """ Loads a track from a single line

        line is one tab-separated record of 54 fields. Returns a dictionary
        describing the track, or None (after printing a diagnostic) when the
        line does not have exactly 54 fields. A field that cannot be
        converted to its numeric type raises ValueError.
    """
    f = line.split('\t')
    if len(f) != _FIELD_COUNT:
        # Single-argument print() emits the same text on Python 2 and 3.
        print('mismatched fields, found %d should have %d'
              % (len(f), _FIELD_COUNT))
        return None

    t = {}
    t['track_id'] = f[0]
    t['analysis_sample_rate'] = f[1]
    t['artist_7digitalid'] = f[2]
    t['artist_familiarity'] = float(f[3])
    t['artist_hotttnesss'] = float(f[4])
    t['artist_id'] = f[5]
    t['artist_latitude'] = float(f[6])
    t['artist_location'] = f[7]
    t['artist_longitude'] = float(f[8])
    t['artist_mbid'] = f[9]

    # (tag, count) pairs
    t['artist_mbtags'] = [(w, int(c))
                          for w, c in zip(f[10].split(','), f[11].split(','))
                          if w]

    t['artist_name'] = f[12]
    t['artist_playmeid'] = int(f[13])

    # (term, frequency, weight) triples
    t['artist_terms'] = [(term, float(freq), float(weight))
                         for term, freq, weight in zip(f[14].split(','),
                                                       f[15].split(','),
                                                       f[16].split(','))
                         if term]

    t['audio_md5'] = f[17]
    t['bars'] = _timed_events(f[19], f[18])
    t['beats'] = _timed_events(f[21], f[20])

    t['danceability'] = float(f[22])
    t['duration'] = float(f[23])
    t['end_of_fade_in'] = float(f[24])
    t['energy'] = float(f[25])
    t['key'] = (int(f[26]), float(f[27]))        # (key, confidence)
    t['loudness'] = float(f[28])
    t['mode'] = (int(f[29]), float(f[30]))       # (mode, confidence)
    t['release'] = f[31]
    t['release_7digitalid'] = f[32]

    # The cover art URL is built from the zero-padded, 10 character release id
    srid = f[32].zfill(10)
    t['cover_art'] = ('http://cdn.7static.com/static/img/sleeveart/'
                      '%s/%s/%s/%s_200.jpg'
                      % (srid[0:2], srid[2:5], srid[5:8], srid))

    t['sections'] = _timed_events(f[34], f[33])
    t['segments'] = _load_segments(f, t['duration'])

    t['similar_artists'] = [s for s in f[42].split(',') if s]
    t['song_hotttnesss'] = float(f[43])
    t['song_id'] = f[44]
    t['start_of_fade_out'] = float(f[45])

    t['tatums'] = _timed_events(f[47], f[46])
    t['tempo'] = float(f[48])
    t['time_signature'] = (int(f[49]), float(f[50]))  # (signature, confidence)
    t['title'] = f[51]
    t['track_7digitalid'] = int(f[52])
    t['preview'] = ('http://previews.7digital.com/clips/34/%d.clip.mp3'
                    % (t['track_7digitalid'], ))
    t['year'] = int(f[53])
    return t
125
126
127
128
def load_tracks(path):
    """ Loads a list of tracks from a file

        path names a flat file with one track per line. Every line that
        load_track accepts is returned as a track dictionary, extended with
        'path' (the file it came from) and 'line' (its zero-based line
        number). Lines that load_track rejects are skipped.
    """
    tracks = []
    # The with-block closes the file even when a malformed line makes
    # load_track raise.
    with open(path) as track_file:
        for which, line in enumerate(track_file):
            track = load_track(line)
            if track is not None:
                track['path'] = path
                track['line'] = which
                tracks.append(track)
    return tracks
142
def process_tracks(path, func):
    """ Applies func(track) to each track found in path

        path names a flat file with one track per line. Each line that
        load_track accepts is extended with 'path' and 'line' (zero-based
        line number) and handed to func; rejected lines are skipped. Tracks
        are processed one at a time rather than collected in a list.
    """
    # The with-block closes the file even when load_track or func raises.
    with open(path) as track_file:
        for which, line in enumerate(track_file):
            track = load_track(line)
            if track is not None:
                track['path'] = path
                track['line'] = which
                func(track)
153
154
def dump(track):
    """ Dumps some data from a track for debugging

        Prints one summary line for the track, then one indented line per
        segment (segment start, segment duration, track duration) and
        finally an empty line.
    """
    summary = (track['line'], track['track_id'], track['artist_id'],
               len(track['artist_mbtags']), len(track['artist_terms']),
               len(track['bars']), len(track['beats']), track['title'],
               track['key'], track['mode'], len(track['segments']))
    # Joining str() values with single spaces reproduces the output of a
    # comma-separated print statement, and a single-argument print() behaves
    # the same on Python 2 and Python 3.
    print(' '.join(str(item) for item in summary))
    for seg in track['segments']:
        # The leading ' ' item plus the separator indents by two spaces.
        print(' '.join([' ', str(seg['start']), str(seg['duration']),
                        str(track['duration'])]))
    print('')
f5bc75b @plamere first commit
plamere authored
163
164
if __name__ == '__main__':
    # Dump every track of the flat file named on the command line. A missing
    # argument gets a usage message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <track-file>' % sys.argv[0])
    process_tracks(sys.argv[1], dump)
Something went wrong with that request. Please try again.