Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Docker for chunking #1

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions datasets/music/docker/new_experiment16k/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Image that runs new_experiment16k.py over the audio in ./downloads.
#
# NOTE: run_docker_16k.sh rewrites the output directory name in this file with
# sed before building, so that name must only appear in the mkdir and CMD
# lines below (keep it out of comments).
FROM ubuntu:bionic

WORKDIR /dadabots_sampleRNN/datasets/music/

# System dependencies go first so these slow layers can be reused when only the
# script or the audio changes. "apt-get update" shares a RUN with the install so
# the install never runs against a stale cached package index.
RUN apt-get update \
    && apt-get upgrade -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        python-dev \
        python-numpy \
        python-setuptools \
        python-pip \
        libsndfile-dev \
        libasound2-dev \
        ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# "python -m pip" keeps working after pip upgrades itself, regardless of which
# pip wrapper script is first on PATH.
RUN python -m pip install -U pip
RUN python -m pip install scikits.audiolab

RUN mkdir -p chunks

# Inputs are copied last: they change far more often than the dependencies.
# COPY is enough for local files; ADD's extra behaviours are not needed here.
COPY new_experiment16k.py ./
COPY downloads ./downloads

CMD ["python", "./new_experiment16k.py", "chunks", "downloads"]





91 changes: 91 additions & 0 deletions datasets/music/docker/new_experiment16k/new_experiment16k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import numpy as np
import sys, os, subprocess, scikits.audiolab, random, time, glob, errno

# --- Setup: resolve arguments and prepare the dataset directory tree ---
# Usage: python new_experiment16k.py <dataset_name> <download_dir>
PWD = os.getcwd()
print('PWD is %s' % PWD)

# Fail with a usage message instead of a bare IndexError when arguments are
# missing.
if len(sys.argv) < 3:
    sys.exit('usage: %s <dataset_name> <download_dir>' % sys.argv[0])

# store dataset name
DATASET_NAME = str(sys.argv[1])
DOWNLOAD_DIR = str(sys.argv[2])
print('dl_dir is set to %s' % DOWNLOAD_DIR)

# create the dataset directory (<cwd>/<dataset_name>); tolerate it already
# existing, e.g. when it was created beforehand as a mount point
print("creating directory for %s" % DATASET_NAME)
DATASET_DIR = os.path.join(PWD, DATASET_NAME)
try:
    os.makedirs(DATASET_DIR)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise

# move samples from the download directory into the dataset directory
print("moving samples")
types = ('wav', 'mp3')
for t in types:
    # Expand the pattern in Python and pass an argument list to mv so paths
    # containing spaces or shell metacharacters are handled correctly, and so
    # mv is not invoked at all when nothing matches.
    matches = sorted(glob.glob(os.path.join(DOWNLOAD_DIR, '*.' + t)))
    if matches:
        subprocess.check_call(['mv', '--'] + matches + [DATASET_DIR + '/'])

# run preprocess
print("preprocessing")
OUTPUT_DIR = os.path.join(DATASET_DIR, "parts")
# Same "already exists is fine" handling as DATASET_DIR above, so the script
# can be re-run against an existing dataset directory.
try:
    os.makedirs(OUTPUT_DIR)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise
# Step 1: write all filenames to a list, one "file '<path>'" line per audio
# file, which is the format read by ffmpeg's concat demuxer in the next step.
AUDIO_EXTENSIONS = (".wav", ".mp3")
with open(os.path.join(DATASET_DIR, 'preprocess_file_list.txt'), 'w') as f:
    for dirpath, dirnames, filenames in os.walk(DATASET_DIR):
        # Prune the output directory in place so leftovers from an earlier run
        # (e.g. a stale concatenated wav) are never fed back in; sorting makes
        # the concatenation order deterministic.
        dirnames[:] = sorted(
            d for d in dirnames if os.path.join(dirpath, d) != OUTPUT_DIR
        )
        for filename in sorted(filenames):
            # A tuple of dotted suffixes: the previous bare "mp3" check also
            # matched names that merely end in those letters.
            if filename.endswith(AUDIO_EXTENSIONS):
                path = os.path.join(dirpath, filename)
                # A single quote inside a quoted concat entry must be written
                # as '\'' or ffmpeg misparses the line.
                f.write("file '" + path.replace("'", "'\\''") + "'\n")

# Step 2: concatenate everything into one massive wav file
print("concatenate all files")
print(os.getcwd())
audio = "preprocess_all_audio.wav"
# Argument lists (no shell) keep paths with spaces intact, and check_call
# stops the script here if ffmpeg fails instead of failing later on a missing
# file. -y overwrites a stale output left behind by an interrupted run rather
# than waiting for interactive confirmation.
subprocess.check_call([
    'ffmpeg', '-y', '-f', 'concat', '-safe', '0',
    '-i', os.path.join(DATASET_DIR, 'preprocess_file_list.txt'),
    os.path.join(OUTPUT_DIR, audio),
])
print("get length")
# get the length of the resulting file (ffprobe prints the duration in
# seconds as a bare number with this output format)
length = float(subprocess.check_output([
    'ffprobe', '-i', os.path.join(OUTPUT_DIR, audio),
    '-show_entries', 'format=duration',
    '-v', 'quiet', '-of', 'csv=p=0',
]))
print("%s DURATION" % length)
print("print big file into chunks")
# Step 3: cut `num` clips out of the big file, with start offsets spread
# evenly between 0 and (length - size) seconds.
size = 12
num = 6400
# NOTE(review): each clip is 8 seconds long while the start offsets are laid
# out as if clips were `size` (12) seconds; confirm which value is intended.
CHUNK_SECONDS = 8
source_wav = os.path.join(OUTPUT_DIR, 'preprocess_all_audio.wav')
# Clamp at zero so audio shorter than `size` seconds does not produce negative
# seek offsets.
step = max(length - size, 0.0) / float(num)
for i in range(num):
    # Named `start` rather than `time` so the imported `time` module is not
    # shadowed.
    start = i * step
    status = subprocess.call([
        'ffmpeg', '-y',
        '-ss', '{}'.format(start),
        '-t', str(CHUNK_SECONDS),
        '-i', source_wav,
        '-ac', '1', '-ab', '16k', '-ar', '16000',
        os.path.join(OUTPUT_DIR, 'p{}.flac'.format(i)),
    ])
    # Keep going on a failed clip (as before), but make the failure visible.
    if status != 0:
        print("warning: ffmpeg exited with status %s for chunk %s" % (status, i))
print("clean up")
# Step 4: clean up temp files
for temp_path in (
    os.path.join(OUTPUT_DIR, 'preprocess_all_audio.wav'),
    os.path.join(DATASET_DIR, 'preprocess_file_list.txt'),
):
    # os.remove avoids spawning a shell and copes with any characters in the
    # path. Cleanup stays best-effort: a file that is already gone is ignored
    # and any other failure is reported without aborting the run.
    try:
        os.remove(temp_path)
    except OSError as e:
        if e.errno != errno.ENOENT:
            print("warning: could not remove %s: %s" % (temp_path, e))
print('save as .npy')
__RAND_SEED = 123


def __fixed_shuffle(inp_list):
    """Shuffle *inp_list* in place using a fixed seed.

    The same input always ends up in the same order, so results are
    reproducible between runs. Note that this reseeds the global ``random``
    (for lists) or ``numpy.random`` (for arrays) generator as a side effect.

    Args:
        inp_list: a ``list`` or ``numpy.ndarray`` to shuffle.

    Returns:
        None. The operation is destructive and in place.

    Raises:
        ValueError: if *inp_list* is neither a list nor a numpy.ndarray.
    """
    if isinstance(inp_list, list):
        random.seed(__RAND_SEED)
        random.shuffle(inp_list)
        return
    # The file imports numpy as `np`; the bare name `numpy` is not defined.
    if isinstance(inp_list, np.ndarray):
        np.random.seed(__RAND_SEED)
        np.random.shuffle(inp_list)
        return
    # str() is required: concatenating a str with a type object raises
    # TypeError, which would mask the intended ValueError.
    raise ValueError(
        "inp_list is neither a list nor a numpy.ndarray but a "
        + str(type(inp_list))
    )

# Collect the generated clips in name order, then shuffle them with the fixed
# seed so the split below is the same on every run.
paths = glob.glob(OUTPUT_DIR + "/*.flac")
paths.sort()
__fixed_shuffle(paths)

# Decode every clip (first element of flacread's result) as float16.
arr = []
for flac_path in paths:
    samples = scikits.audiolab.flacread(flac_path)[0]
    arr.append(samples.astype('float16'))
np_arr = np.array(arr)

# 88/6/6 split: train, then validation, and whatever remains is test
length = len(np_arr)
train_size = int(np.floor(length * .88))  # train
test_size = int(np.floor(length * .06))  # test
valid_end = train_size + test_size

splits = (
    ('all_music.npy', np_arr),
    ('music_train.npy', np_arr[:train_size]),
    ('music_valid.npy', np_arr[train_size:valid_end]),
    ('music_test.npy', np_arr[valid_end:]),
)
for npy_name, subset in splits:
    np.save(os.path.join(DATASET_DIR, npy_name), subset)

#pass dataset name through two_tier.py || three_tier.py to datasets.py
17 changes: 17 additions & 0 deletions datasets/music/docker/new_experiment16k/run_docker_16k.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
#
# Build the dada-chunk image and run it with ./<artist name> bind-mounted into
# the container, so the results end up in that directory on the host.
#
# Usage: sudo ./run_docker_16k.sh [artist name]
#   artist name defaults to "chunks".

set -eu

artist_name="${1:-chunks}"

# The name is substituted into the Dockerfile with sed and used as a directory
# name and mount path, so restrict it to characters that are safe in all three.
case "$artist_name" in
    -*|.|..|*[!A-Za-z0-9._-]*)
        echo "error: artist name may only contain letters, digits, '.', '_' and '-'" >&2
        exit 1
        ;;
esac

mkdir -p "$artist_name"
chmod 777 "$artist_name"

# Build from a generated copy instead of editing the Dockerfile in place and
# reversing the edit afterwards: the reverse substitution rewrote every
# occurrence of the artist name (e.g. "music" inside WORKDIR), and a failed
# build left the Dockerfile modified.
tmp_dockerfile="Dockerfile.$artist_name.$$"
trap 'rm -f "$tmp_dockerfile"' EXIT
sed "s/chunks/$artist_name/g" Dockerfile > "$tmp_dockerfile"

docker build --no-cache -f "$tmp_dockerfile" -t dada-chunk .
sudo docker run --rm \
    --mount type=bind,source="$(pwd)/$artist_name",target="/dadabots_sampleRNN/datasets/music/$artist_name" \
    -ti dada-chunk