/
score.py
164 lines (137 loc) · 4.88 KB
/
score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import xrange
# Audio format enforced below before a clip is fed to the classifier graph.
_SAMPLE_RATE = 16000
_CLIP_SAMPLES = 16384


def _prepare_clip(samples, fix_length):
    """Validate one decoded clip; optionally trim/zero-pad it to 16384 samples."""
    if samples.ndim != 1:
        raise Exception('Invalid shape ({})'.format(samples.shape))
    if fix_length:
        # Keep the first 16384 samples, then right-pad short clips with zeros.
        samples = samples[:_CLIP_SAMPLES]
        samples = np.pad(samples, (0, _CLIP_SAMPLES - samples.shape[0]), 'constant')
    if samples.shape[0] != _CLIP_SAMPLES:
        raise Exception('Invalid number of samples ({})'.format(samples.shape[0]))
    return samples


def _group_inception_scores(all_scores, k, group_size):
    """Return exp(mean_x KL(p(y|x) || p(y))) for each of k consecutive groups."""
    group_scores = []
    for i in range(k):
        group = all_scores[i * group_size:(i + 1) * group_size]
        marginal = np.expand_dims(np.mean(group, 0), 0)
        # log(0) and 0 * -inf are expected for exactly-zero scores; they are
        # overwritten just below, so silence the floating-point warnings.
        with np.errstate(divide='ignore', invalid='ignore'):
            kl_terms = group * (np.log(group) - np.log(marginal))
        # Convention 0 * log(0) = 0: without it a single exactly-zero score
        # turns the whole inception score into NaN. Non-zero entries are
        # left untouched, so results for strictly positive scores are
        # unchanged.
        kl_terms[group == 0] = 0.0
        group_scores.append(np.exp(np.mean(np.sum(kl_terms, 1))))
    return group_scores


def inception_score(
        audio_fps,
        k,
        metagraph_fp,
        ckpt_fp,
        batch_size=100,
        tf_ffmpeg_ext=None,
        fix_length=False):
    """Compute the inception score of a set of audio files.

    The classifier is restored from `metagraph_fp` / `ckpt_fp`; it must
    expose the tensors `x:0` (input clips) and `scores:0` (per-class
    scores). The scores are presumably class probabilities (e.g. softmax
    output), since their logarithm is taken -- TODO confirm against the
    exported graph.

    Args:
        audio_fps: Sequence of audio file paths; its length must be a
            positive multiple of `k`.
        k: Number of equally sized, consecutive groups to score separately.
        metagraph_fp: Path of the MetaGraph to import.
        ckpt_fp: Path of the checkpoint to restore into the graph.
        batch_size: Number of clips evaluated per `sess.run` call.
        tf_ffmpeg_ext: If not None, files are decoded inside TensorFlow via
            `tf.contrib.ffmpeg.decode_audio` using this file format;
            otherwise they are read with `scipy.io.wavfile.read`.
        fix_length: If True, trim or zero-pad every clip to 16384 samples.

    Returns:
        Tuple `(mean, std, labels)`: mean and standard deviation of the `k`
        per-group inception scores, and the argmax class index of every
        file (in the order of `audio_fps`).

    Raises:
        ValueError: If `k` is not positive or `audio_fps` is empty.
        Exception: If the number of files is not divisible by `k`, or a
            clip has the wrong sample rate, shape or length.
    """
    # Validate cheap preconditions before touching TensorFlow or the disk.
    if k <= 0:
        raise ValueError('k must be a positive integer (got {})'.format(k))
    if len(audio_fps) == 0:
        raise ValueError('No audio files to score')
    if len(audio_fps) % k != 0:
        raise Exception('Number of audio files ({}) is not divisible by k ({})'.format(len(audio_fps), k))
    group_size = len(audio_fps) // k

    use_tf_ffmpeg = tf_ffmpeg_ext is not None
    if not use_tf_ffmpeg:
        from scipy.io.wavfile import read as wavread

    # Restore graph
    graph = tf.Graph()
    with graph.as_default():
        saver = tf.train.import_meta_graph(metagraph_fp)
        if use_tf_ffmpeg:
            # Small decoding sub-graph: file path -> first channel of the audio.
            x_fp = tf.placeholder(tf.string, [])
            x_bin = tf.read_file(x_fp)
            x_samps = tf.contrib.ffmpeg.decode_audio(x_bin, tf_ffmpeg_ext, _SAMPLE_RATE, 1)[:, 0]
    x = graph.get_tensor_by_name('x:0')
    scores = graph.get_tensor_by_name('scores:0')

    # Restore weights and evaluate; the session is closed even when a file
    # fails validation part-way through.
    sess = tf.Session(graph=graph)
    try:
        saver.restore(sess, ckpt_fp)

        all_scores = []
        for start in range(0, len(audio_fps), batch_size):
            clips = []
            for audio_fp in audio_fps[start:start + batch_size]:
                if use_tf_ffmpeg:
                    samples = sess.run(x_samps, {x_fp: audio_fp})
                else:
                    fs, samples = wavread(audio_fp)
                    if fs != _SAMPLE_RATE:
                        raise Exception('Invalid sample rate ({})'.format(fs))
                    # Scale 16-bit PCM to roughly [-1, 1].
                    # NOTE(review): other integer widths pass through
                    # unscaled -- confirm inputs are int16 or float.
                    if samples.dtype == np.int16:
                        samples = samples.astype(np.float32)
                        samples /= 32767.
                clips.append(_prepare_clip(samples, fix_length))

            # Compute model scores
            all_scores.append(sess.run(scores, {x: clips}))
    finally:
        sess.close()

    # Find labels
    all_scores = np.concatenate(all_scores, axis=0)
    all_labels = np.argmax(all_scores, axis=1)

    # Compute inception scores
    group_scores = _group_inception_scores(all_scores, k, group_size)
    return np.mean(group_scores), np.std(group_scores), all_labels
if __name__ == '__main__':
    import argparse
    import glob
    import os
    import random
    import sys

    # Command-line options; each default is declared next to its argument.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--audio_dir', type=str, default=None,
        help='Directory with 16-bit signed integer PCM WAV files at 16kHz')
    cli.add_argument(
        '--fix_length', action='store_true', dest='fix_length', default=False,
        help='If set, pad or trim audio files to length 16384')
    cli.add_argument(
        '--labels_fp', type=str, default=None,
        help='If set, write model predictions to this file')
    cli.add_argument(
        '--metagraph_fp', type=str, default='infer.meta',
        help='MetaGraph for classifier; must have tensors x:0 [None, 16384] and scores:0 [None, 10]')
    cli.add_argument(
        '--ckpt_fp', type=str, default='best_acc-103005',
        help='Checkpoint for metagraph')
    cli.add_argument(
        '--n', type=int, default=50000,
        help='Number of samples to test')
    cli.add_argument(
        '--k', type=int, default=10,
        help='Number of subsets to score')
    cli.add_argument(
        '--batch_size', type=int, default=100,
        help='Evaluate audio in batches of this size')
    cli.add_argument(
        '--tf_ffmpeg_ext', type=str, default=None,
        help='If set, uses ffmpeg to decode audio files with specified extension through tensorflow')
    opts = cli.parse_args()

    # Collect candidate files: sorted, then shuffled with a fixed seed so the
    # same subset of n files is selected on every run.
    if opts.audio_dir is None:
        raise Exception('No audio directory specified')
    suffix = opts.tf_ffmpeg_ext if opts.tf_ffmpeg_ext is not None else 'wav'
    pattern = os.path.join(opts.audio_dir, '*.{}'.format(suffix))
    found_fps = sorted(glob.glob(pattern))
    random.seed(0)
    random.shuffle(found_fps)
    if len(found_fps) < opts.n:
        raise Exception('Found fewer ({}) than specified ({}) audio files'.format(len(found_fps), opts.n))
    chosen_fps = found_fps[:opts.n]

    # Score the selected files.
    mean, std, labels = inception_score(
        chosen_fps,
        opts.k,
        opts.metagraph_fp,
        opts.ckpt_fp,
        batch_size=opts.batch_size,
        tf_ffmpeg_ext=opts.tf_ffmpeg_ext,
        fix_length=opts.fix_length)
    print('Inception score: {} +- {}'.format(mean, std))

    # Report the fraction of files predicted as each of the 10 classes.
    print('p(y)')
    for cls in xrange(10):
        hits = sum(1 for label in labels if label == cls)
        print('{}: {}'.format(cls, hits / float(opts.n)))

    # Optionally dump "path,label" rows (no trailing newline).
    if opts.labels_fp is not None:
        rows = [','.join([fp, str(label)]) for fp, label in zip(chosen_fps, labels)]
        with open(opts.labels_fp, 'w') as f:
            f.write('\n'.join(rows))