Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for distributed TensorFlow. #14

Merged
merged 17 commits into from Feb 22, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
34 changes: 28 additions & 6 deletions README.md
Expand Up @@ -47,10 +47,11 @@ follow the instructions [here](https://cloud.google.com/ml/docs/how-tos/getting-
If you are participating in the Google Cloud & YouTube-8M Video Understanding
Challenge hosted on kaggle.com, see [these instructions](https://www.kaggle.com/c/youtube8m#getting-started-with-google-cloud) instead.

Please also verify that you have Tensorflow 1.0.0 or higher installed by
running the following command:
Please also verify that you have Python 2.7+ and TensorFlow 1.0.0 or higher
installed by running the following commands:

```sh
python --version
python -c 'import tensorflow as tf; print(tf.__version__)'
```

Expand Down Expand Up @@ -210,16 +211,29 @@ gsutil cp $BUCKET_NAME/${JOB_TO_EVAL}/predictions.csv .
Append
```sh
--frame_features=True --model=FrameLevelLogisticModel --feature_names="rgb" \
--feature_sizes="1024" --batch_size=256
--feature_sizes="1024" --batch_size=256 \
--train_dir=$BUCKET_NAME/yt8m_train_frame_level_logistic_model
```

to the 'gcloud' commands given above, and change 'video_level' in paths to
'frame_level'.
'frame_level'. Here is a sample command to kick-off a frame-level job:

```sh
JOB_NAME=yt8m_train_$(date +%Y%m%d_%H%M%S); gcloud --verbosity=debug beta ml jobs \
submit training $JOB_NAME \
--package-path=youtube-8m --module-name=youtube-8m.train \
--staging-bucket=$BUCKET_NAME --region=us-central1 \
--config=youtube-8m/cloudml-gpu.yaml \
-- --train_data_pattern='gs://youtube8m-ml/1/frame_level/train/train*.tfrecord' \
--frame_features=True --model=FrameLevelLogisticModel --feature_names="rgb" \
--feature_sizes="1024" --batch_size=256 \
--train_dir=$BUCKET_NAME/yt8m_train_frame_level_logistic_model
```

The 'FrameLevelLogisticModel' is designed to provide equivalent results to a
logistic model trained over the video-level features. Please look at the
'models.py' file to see how to implement your own models.
'video_level_models.py' or 'frame_level_models.py' files to see how to implement
your own models.


### Using Audio Features
Expand Down Expand Up @@ -257,6 +271,14 @@ the instructions on [tensorflow.org](https://www.tensorflow.org/install/).
This code has been tested with TensorFlow 1.0.0. Going forward, we will continue
to target the latest released version of TensorFlow.

Please verify that you have Python 2.7+ and TensorFlow 1.0.0 or higher
installed by running the following commands:

```sh
python --version
python -c 'import tensorflow as tf; print(tf.__version__)'
```

You can find complete instructions for downloading the dataset on the
[YouTube-8M website](https://research.google.com/youtube8m/download.html).
We recommend downloading the smaller video-level features dataset first when
Expand Down Expand Up @@ -326,7 +348,7 @@ When you are happy with your model, you can generate a csv file of predictions
from it by running

```sh
python inference.py --output_file=$MODEL_DIR/video_level_logistic_model/predictions.csv --input_data_pattern='/path/to/features/validate*.tfrecord' --train_dir=$MODEL_DIR/video_level_logistic_model
python inference.py --output_file=$MODEL_DIR/video_level_logistic_model/predictions.csv --input_data_pattern='/path/to/features/test*.tfrecord' --train_dir=$MODEL_DIR/video_level_logistic_model
```

This will output the top 20 predicted labels from the model for every example
Expand Down
7 changes: 4 additions & 3 deletions average_precision_calculator.py
Expand Up @@ -123,7 +123,7 @@ def accumulate(self, predictions, actuals, num_positives=None):
topk = self._top_n
heap = self._heap

for i in xrange(numpy.size(predictions)):
for i in range(numpy.size(predictions)):
if topk is None or len(heap) < topk:
heapq.heappush(heap, (predictions[i], actuals[i]))
else:
Expand All @@ -146,7 +146,8 @@ def peek_ap_at_n(self):
"""
if self.heap_size <= 0:
return 0
predlists = numpy.array(zip(*self._heap))
predlists = numpy.array(list(zip(*self._heap)))

ap = self.ap_at_n(predlists[0],
predlists[1],
n=self._top_n,
Expand Down Expand Up @@ -237,7 +238,7 @@ def ap_at_n(predictions, actuals, n=20, total_num_positives=None):
r = len(sortidx)
if n is not None:
r = min(r, n)
for i in xrange(r):
for i in range(r):
if actuals[sortidx[i]] > 0:
poscount += 1
ap += poscount / (i + 1) * delta_recall
Expand Down
8 changes: 8 additions & 0 deletions cloudml-gpu-distributed.yaml
@@ -0,0 +1,8 @@
# Cloud ML Engine configuration for distributed training: a CUSTOM scale
# tier with one GPU master, two GPU workers, and one CPU parameter server.
# NOTE(review): machine-type semantics come from the Cloud ML Engine docs —
# confirm against the current trainingInput reference.
trainingInput:
  runtimeVersion: "1.0"
  # CUSTOM tier is required for the explicit machine counts/types below.
  scaleTier: CUSTOM
  masterType: standard_gpu
  workerCount: 2
  workerType: standard_gpu
  parameterServerCount: 1
  parameterServerType: standard
2 changes: 1 addition & 1 deletion eval.py
Expand Up @@ -103,7 +103,7 @@ def get_input_evaluation_tensors(reader,
filename_queue = tf.train.string_input_producer(
files, shuffle=False, num_epochs=1)
eval_data = [
reader.prepare_reader(filename_queue) for _ in xrange(num_readers)
reader.prepare_reader(filename_queue) for _ in range(num_readers)
]
return tf.train.batch_join(
eval_data,
Expand Down
6 changes: 3 additions & 3 deletions eval_util.py
Expand Up @@ -117,12 +117,12 @@ def top_k_by_class(predictions, labels, k=20):
prediction_triplets= []
for video_index in range(predictions.shape[0]):
prediction_triplets.extend(top_k_triplets(predictions[video_index],labels[video_index], k))
out_predictions = [[] for v in xrange(num_classes)]
out_labels = [[] for v in xrange(num_classes)]
out_predictions = [[] for v in range(num_classes)]
out_labels = [[] for v in range(num_classes)]
for triplet in prediction_triplets:
out_predictions[triplet[0]].append(triplet[1])
out_labels[triplet[0]].append(triplet[2])
out_true_positives = [numpy.sum(labels[:,i]) for i in xrange(num_classes)]
out_true_positives = [numpy.sum(labels[:,i]) for i in range(num_classes)]

return out_predictions, out_labels, out_true_positives

Expand Down
10 changes: 7 additions & 3 deletions inference.py
Expand Up @@ -24,6 +24,7 @@
from tensorflow import flags
from tensorflow import gfile
from tensorflow import logging
from builtins import range

import eval_util
import losses
Expand Down Expand Up @@ -66,12 +67,15 @@

def format_lines(video_ids, predictions, top_k):
  """Yields one CSV line per video with its top-k predicted classes.

  Each yielded line has the form
  '<video_id>,<class> <score> <class> <score> ...\n', with classes
  ordered by descending score.

  Args:
    video_ids: sequence of byte-string video ids; each is decoded as UTF-8.
    predictions: 2-D array-like of per-class scores, one row per video.
    top_k: number of highest-scoring classes to emit for each video.

  Yields:
    A newline-terminated CSV line (str) per video.
  """
  batch_size = len(video_ids)
  for video_index in range(batch_size):
    # argpartition selects the top_k indices in O(num_classes) without
    # fully sorting the row; only those k entries are sorted below.
    top_indices = numpy.argpartition(predictions[video_index], -top_k)[-top_k:]
    line = [(class_index, predictions[video_index][class_index])
            for class_index in top_indices]
    line = sorted(line, key=lambda p: -p[1])
    yield (video_ids[video_index].decode('utf-8') + "," +
           " ".join("%i %f" % pair for pair in line) + "\n")


Expand Down Expand Up @@ -101,7 +105,7 @@ def get_input_data_tensors(reader, data_pattern, batch_size, num_readers=1):
filename_queue = tf.train.string_input_producer(
files, num_epochs=1, shuffle=False)
examples_and_labels = [reader.prepare_reader(filename_queue)
for _ in xrange(num_readers)]
for _ in range(num_readers)]

video_id_batch, video_batch, unused_labels, num_frames_batch = (
tf.train.batch_join(examples_and_labels,
Expand Down
2 changes: 1 addition & 1 deletion losses.py
Expand Up @@ -43,7 +43,7 @@ class CrossEntropyLoss(BaseLoss):

def calculate_loss(self, predictions, labels, **unused_params):
with tf.name_scope("loss_xent"):
epsilon = 10e-8
epsilon = 10e-6
float_labels = tf.cast(labels, tf.float32)
cross_entropy_loss = float_labels * tf.log(predictions + epsilon) + (
1 - float_labels) * tf.log(1 - predictions + epsilon)
Expand Down
6 changes: 3 additions & 3 deletions mean_average_precision_calculator.py
Expand Up @@ -64,7 +64,7 @@ def __init__(self, num_class):

self._ap_calculators = [] # member of AveragePrecisionCalculator
self._num_class = num_class # total number of classes
for i in xrange(num_class):
for i in range(num_class):
self._ap_calculators.append(
average_precision_calculator.AveragePrecisionCalculator())

Expand All @@ -89,7 +89,7 @@ def accumulate(self, predictions, actuals, num_positives=None):
num_positives = [None for i in predictions.shape[1]]

calculators = self._ap_calculators
for i in xrange(len(predictions)):
for i in range(len(predictions)):
calculators[i].accumulate(predictions[i], actuals[i], num_positives[i])

def clear(self):
Expand All @@ -108,5 +108,5 @@ def peek_map_at_n(self):
class.
"""
aps = [self._ap_calculators[i].peek_ap_at_n()
for i in xrange(self._num_class)]
for i in range(self._num_class)]
return aps
1 change: 0 additions & 1 deletion readers.py
Expand Up @@ -18,7 +18,6 @@
import utils

from tensorflow import logging

def resize_axis(tensor, axis, new_size, fill_value=0):
"""Truncates or pads a tensor to new_size on a given axis.

Expand Down