diff --git a/ahem_detector.ipynb b/ahem_detector.ipynb
index 136f0b6..8dc0386 100644
--- a/ahem_detector.ipynb
+++ b/ahem_detector.ipynb
@@ -2,34 +2,570 @@
 "cells": [
 {
 "cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "from __future__ import print_function\n",
+ "import numpy as np\n",
+ "np.random.seed(1337)  # for reproducibility\n",
+ "\n",
+ "import matplotlib\n",
+ "# Force matplotlib to not use any Xwindows backend.\n",
+ "matplotlib.use('Agg')\n",
+ "\n",
+ "#from keras.datasets import mnist\n",
+ "from keras.models import Sequential\n",
+ "#from keras.layers import Dense, Dropout, Activation, Flatten\n",
+ "#from keras.layers import Convolution2D, MaxPooling2D\n",
+ "from keras.layers.core import Dense, Dropout, Activation, Flatten\n",
+ "from keras.layers.convolutional import Convolution2D, MaxPooling2D\n",
+ "from keras.utils import np_utils\n",
+ "from keras import backend as K\n",
+ "from keras.models import model_from_json\n",
+ "\n",
+ "\n",
+ "import skimage.io as io\n",
+ "import matplotlib.pyplot as plt\n",
+ "from os import listdir\n",
+ "from os.path import isfile, join\n",
+ "import os"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 154,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "batch_size = 64\n",
+ "nb_epoch = 5\n",
+ "# number of convolutional filters to use\n",
+ "nb_filters = 32\n",
+ "# size of pooling area for max pooling\n",
+ "pool_size = (2, 2)\n",
+ "# convolution kernel size\n",
+ "kernel_size = (3, 3)\n",
+ "\n",
+ "# input image dimensions\n",
+ "img_channels = 3  # RGB\n",
+ "img_rows, img_cols = 158, 164  # this is big already\n",
+ "input_shape = (3, img_rows, img_cols)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 155,
 "metadata": {
 "collapsed": false
 },
+ "outputs": [],
+ "source": [
+ "# the data, shuffled and split between train and test sets\n",
+ "path_class_0 = './data/class_0/'\n",
+ "path_class_1 = './data/class_1/'\n",
+ "\n",
+ "nb_classes = 2\n",
+ "#input_shape = (1, img_rows, img_cols)\n",
+ "\n",
+ "class0_files = [f for f in listdir(path_class_0) if isfile(join(path_class_0, f))]\n",
+ "class1_files = [f for f in listdir(path_class_1) if isfile(join(path_class_1, f))]\n",
+ "\n",
+ "X_t = []\n",
+ "Y_t = []\n",
+ "\n",
+ "for fn in class0_files[:1000]:\n",
+ "    img = io.imread(os.path.join(path_class_0, fn))\n",
+ "    img = img.transpose((2,0,1))\n",
+ "    img = img[:3, :, :]\n",
+ "    X_t.append(img)\n",
+ "    Y_t.append(0)\n",
+ "\n",
+ "for fn in class1_files[:500]:\n",
+ "    img = io.imread(os.path.join(path_class_1, fn))\n",
+ "    img = img.transpose((2,0,1))\n",
+ "    img = img[:3, :, :]\n",
+ "    X_t.append(img)\n",
+ "    Y_t.append(1)\n",
+ "\n",
+ "\n",
+ "X_t = np.asarray(X_t)\n",
+ "Y_t = np.asarray(Y_t)\n",
+ "X_t = X_t.astype('float32')\n",
+ "X_t /= 255\n",
+ "\n",
+ "Y_t = np_utils.to_categorical(Y_t, nb_classes)"
+ ]
+ },
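Note: the transpose((2,0,1)) in the loading loop converts scikit-image's (rows, cols, channels) arrays to the channels-first layout that input_shape = (3, img_rows, img_cols) expects, which in Keras 1 corresponds to the Theano dim ordering. A minimal sanity check, assuming the Keras 1.x backend API:

    from keras import backend as K

    # channels-first inputs require the 'th' (Theano) image dim ordering;
    # it can be set via "image_dim_ordering" in ~/.keras/keras.json
    assert K.image_dim_ordering() == 'th'
    assert X_t.shape[1:] == input_shape   # (3, 158, 164)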
0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 1., 0.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.],\n", + " [ 0., 1.]])" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Y_t" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# seems to break\n", + "permutation = np.random.permutation(142)\n", + "\n", + "X_t = X_t[permutation]\n", + "Y_t = Y_t[permutation]" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\" \\n\\ndef train_model(model, X_train, Y_train, X_test, Y_test):\\n nb_epoch = 1\\n batch_size = 16\\n optimizer='adadelta' \\n \\n #sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)\\n model.compile(loss='categorical_crossentropy', optimizer=optimizer)\\n \\n model.fit(X_train, Y_train, nb_epoch=nb_epoch, \\n batch_size=batch_size,\\n #validation_split=0.1, \\n show_accuracy=True, \\n verbose=1)\\n \\n #print('Testing...')\\n #res = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)\\n #print('Test accuracy: {0}'.format(res[1]))\\n \\n\"" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\" \n", + 
"\n", + "def train_model(model, X_train, Y_train, X_test, Y_test):\n", + " nb_epoch = 1\n", + " batch_size = 16\n", + " optimizer='adadelta' \n", + " \n", + " #sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)\n", + " model.compile(loss='categorical_crossentropy', optimizer=optimizer)\n", + " \n", + " model.fit(X_train, Y_train, nb_epoch=nb_epoch, \n", + " batch_size=batch_size,\n", + " #validation_split=0.1, \n", + " show_accuracy=True, \n", + " verbose=1)\n", + " \n", + " #print('Testing...')\n", + " #res = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)\n", + " #print('Test accuracy: {0}'.format(res[1]))\n", + " \n", + "\"\"\" " + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(1500, 3, 158, 164)" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_t.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def make_model():\n", + " model = Sequential()\n", + "\n", + " model.add(Convolution2D(nb_filters, kernel_size[0], kernel_size[1],\n", + " border_mode='valid',\n", + " input_shape=input_shape))\n", + " model.add(Activation('relu'))\n", + " model.add(Convolution2D(nb_filters, kernel_size[0], kernel_size[1]))\n", + " model.add(Activation('relu'))\n", + " model.add(MaxPooling2D(pool_size=pool_size))\n", + " model.add(Dropout(0.25))\n", + "\n", + " \"\"\"\n", + " model.add(Convolution2D(nb_filters, 3, 3, border_mode='same'))\n", + " model.add(Activation('relu'))\n", + " model.add(Convolution2D(nb_filters, 3, 3))\n", + " model.add(Activation('relu'))\n", + " model.add(MaxPooling2D(pool_size=(2, 2)))\n", + " model.add(Dropout(0.25))\n", + " \"\"\"\n", + "\n", + " model.add(Flatten())\n", + " model.add(Dense(128))\n", + " model.add(Activation('relu'))\n", + " model.add(Dropout(0.5))\n", + " model.add(Dense(nb_classes))\n", + " model.add(Activation('softmax'))\n", + "\n", + "\n", + " model.compile(loss='binary_crossentropy',\n", + " optimizer='adadelta',\n", + " metrics=['accuracy'])\n", + " return model " + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def save_model(model):\n", + " model_json = model.to_json()\n", + " open('ahem_architecture.json', 'w').write(model_json)\n", + " model.save_weights('ahem_weights.h5', overwrite=True)\n", + " \n", + "\n", + "def load_model(model_def_fname, model_weight_fname):\n", + " model = model_from_json(open(model_def_fname).read())\n", + " model.load_weights(model_weight_fname)\n", + " return model\n", + "\n", + "def load_image(filename):\n", + " img = io.imread(filename)\n", + " img = img.transpose((2,0,1))\n", + " img = img[:3, :, :]\n", + " return img" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "model = make_model()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", "output_type": "stream", "text": [ - "/usr/local/lib/python2.7/dist-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.\n", - " warnings.warn('Matplotlib is building the font cache using fc-list. 
This may take a moment.')\n", - "/usr/local/lib/python2.7/dist-packages/matplotlib/__init__.py:878: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.\n", - " warnings.warn(self.msg_depr % (key, alt_key))\n" + "Epoch 1/1\n", + " 128/1500 [=>............................] - ETA: 1006s - loss: 3.4802 - acc: 0.5234" ] } ], "source": [ - "import glob\n", - "import os\n", - "#import librosa\n", - "import numpy as np\n", - "#import matplotlib.pyplot as plt\n", - "#from matplotlib.pyplot import specgram\n", - "%matplotlib inline\n", - "\n", - "import utils as ut" + "for e in xrange(3):\n", + " # if load existing model, compile before fitting\n", + " #model = load_model('ahem_architecture.json', 'ahem_weights.h5')\n", + " #model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])\n", + " \n", + " model.fit(X_t, Y_t, batch_size=batch_size, nb_epoch=1, verbose=1)\n", + " save_model(model)" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I know it should be class 0 and it is...\n", + "6/6 [==============================] - 1s\n", + "class [0 0 1 1 1 0]\n" + ] + } + ], + "source": [ + "if __name__ == '__main__':\n", + " new_stuff = []\n", + " img = load_image('./data/class_1/partial_spectrum_12299.png')\n", + " new_stuff.append(img)\n", + " img = load_image('./data/class_1/partial_spectrum_9649.png')\n", + " new_stuff.append(img)\n", + " img = load_image('./data/class_0/partial_spectrum_34111.png')\n", + " new_stuff.append(img)\n", + " img = load_image('./data/class_0/partial_spectrum_44148.png')\n", + " new_stuff.append(img)\n", + " img = load_image('./data/class_0/partial_spectrum_31938.png')\n", + " new_stuff.append(img)\n", + " img = load_image('./data/class_1/partial_spectrum_12000.png')\n", + " new_stuff.append(img)\n", + " \n", + " new_stuff = np.asarray(new_stuff)\n", + " model = load_model('ahem_architecture.json', 'ahem_weights.h5')\n", + " print('I know it should be class 0 and it is...')\n", + " predictions = model.predict_classes(new_stuff)\n", + " print('class %s' %predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0])" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 5, @@ -268,6 +804,13 @@ "ut.specgram_frombuffer(raw_sounds[3][0:22050], 5, 2, show=True)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Playback and clean new 
samples" + ] + }, { "cell_type": "code", "execution_count": 15, @@ -325,7 +868,6 @@ }, "outputs": [], "source": [ - "\n", "clean_samples = np.concatenate((raw_sounds[3][0:50000], raw_sounds[3][100000:] ))" ] }, @@ -385,7 +927,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.6" + "version": "2.7.8" } }, "nbformat": 4, diff --git a/cnn.py b/cnn.py index 98edf2d..f7b15cd 100644 --- a/cnn.py +++ b/cnn.py @@ -9,7 +9,11 @@ import numpy as np np.random.seed(1337) # for reproducibility -from keras.datasets import mnist +import matplotlib +# Force matplotlib to not use any Xwindows backend. +matplotlib.use('Agg') + +#from keras.datasets import mnist from keras.models import Sequential from keras.layers import Dense, Dropout, Activation, Flatten from keras.layers import Convolution2D, MaxPooling2D @@ -36,7 +40,7 @@ path_class_1 = './data/class_1/' # input image dimensions -img_rows, img_cols = 258, 263 +img_rows, img_cols = 158, 164 nb_classes = 2 input_shape = (1, img_rows, img_cols) @@ -47,12 +51,12 @@ X_t = [] Y_t = [] -for fn in class0_files[:5000]: +for fn in class0_files[:1000]: img = io.imread(os.path.join(path_class_0, fn), as_grey=True) X_t.append([img]) Y_t.append(0) -for fn in class1_files[:5000]: +for fn in class1_files[:1000]: img = io.imread(os.path.join(path_class_1, fn), as_grey=True) X_t.append([img]) Y_t.append(1) @@ -64,6 +68,7 @@ X_t /= 255 + model = Sequential() model.add(Convolution2D(nb_filters, kernel_size[0], kernel_size[1], diff --git a/data/unnamed.png b/data/unnamed.png deleted file mode 100644 index 8d7e591..0000000 Binary files a/data/unnamed.png and /dev/null differ diff --git a/make_data_class_0.py b/make_data_class_0.py index 79461b6..8abafa7 100644 --- a/make_data_class_0.py +++ b/make_data_class_0.py @@ -1,3 +1,6 @@ +import matplotlib +# Force matplotlib to not use any Xwindows backend. +matplotlib.use('Agg') import os, sys import utils as ut @@ -10,9 +13,9 @@ sound_names = ["ahem_sounds", "podcast_17_sample"] raw_sounds = ut.load_sound_files(sound_file_paths) -windowsize = 22050 # size of sliding window (22050 samples == 0.5 sec) -step = 11025 -maxfiles = 100000 +windowsize = 44100 # size of sliding window (22050 samples == 0.5 sec) +step = 22050 +maxfiles = 50000 # create negative samples audiosamples = raw_sounds[1] diff --git a/make_data_class_1.py b/make_data_class_1.py index e105108..c93aa4c 100644 --- a/make_data_class_1.py +++ b/make_data_class_1.py @@ -1,3 +1,7 @@ +import matplotlib +# Force matplotlib to not use any Xwindows backend. +matplotlib.use('Agg') + import os, sys import utils as ut @@ -10,9 +14,9 @@ sound_names = ["ahem_sounds", "podcast_17_sample"] raw_sounds = ut.load_sound_files(sound_file_paths) -windowsize = 22050 # size of sliding window (22050 samples == 0.5 sec) -step = 11025 -maxfiles = 100000 +windowsize = 44100 # size of sliding window (22050 samples == 0.5 sec) +step = 22050 +maxfiles = 50000 # create positive samples audiosamples = raw_sounds[0] diff --git a/utils.py b/utils.py index 8f64452..887f5b3 100644 --- a/utils.py +++ b/utils.py @@ -1,5 +1,9 @@ import librosa import numpy as np +import matplotlib +# Force matplotlib to not use any Xwindows backend. +matplotlib.use('Agg') + from matplotlib.pyplot import specgram import matplotlib.pyplot as plt import sys