# Gender Classification


## Imports

In [56]:
import datetime
import os
import sqlite3
import sys
import traceback
from pathlib import Path

import IPython.display
import librosa
import librosa.display
import numpy as np
from sklearn import preprocessing

from dbhelpers import select
from parse import paths, parse_readme
from scrape import get_remote_tgz_files, download_extract_files

## Input variables

In [2]:
# Remote index (VoxForge, 16 kHz / 16-bit) that is scanned for .tgz archives.
SOURCE_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/'
# Local folder the archives are downloaded and extracted into.
DATA_DIR = 'data'
# SQLite database file that holds the extracted features.
DB = 'genderclass.db'

## Scrape data
Only files not already downloaded and extracted will be fetched.

In [3]:
# List the remote archives first, then hand them to the downloader together
# with the local target folder.
urls = get_remote_tgz_files(SOURCE_URL)
data_folder = Path(DATA_DIR)
download_extract_files(urls, data_folder)

## Create DB to hold features

In [4]:
# (Re)create the `features` table. An existing table is dropped first, so
# re-running this cell discards every previously stored row.
#
# Columns: `cepstrum` holds the raw bytes of the MFCC matrix, while `n_mfcc`
# and `n_frames` hold its two dimensions so the blob can be reshaped on read.
conn = sqlite3.connect(DB)
try:
    db = conn.cursor()
    db.execute('''DROP TABLE IF EXISTS features''')
    db.execute('''CREATE TABLE features
                     (file text, 
                      path text, 
                      female integer, 
                      age text, 
                      language text, 
                      dialect text,
                      cepstrum blob,
                      n_mfcc integer,
                      n_frames integer)''')
    conn.commit()
finally:
    # Release the database file even if one of the statements above fails.
    conn.close()

## Extract features
Silence trimmed at beginning and end of each file.

MFCC extracted (with default librosa settings).

Save into database.

(skipped 127 records due to lack of parsable readme or wav-folder, takes about 20 min to run)

In [5]:
# Walk every sample folder under DATA_DIR, compute the MFCCs of each recording
# and insert them into `features` together with the speaker metadata parsed
# from the sample's readme. Timestamps are printed before and after so the
# run time is visible in the output.
print(datetime.datetime.now())

data_folder = Path(DATA_DIR)
samples = os.listdir(data_folder)

# Names of the sample folders that were not processed: no wav folder, no
# readme, or a readme without a 'gender' entry.
skipped = []

conn = sqlite3.connect(DB)
try:
    db = conn.cursor()
    for sample in samples:
        wav_folder, readme_path = paths(data_folder, sample)

        if wav_folder is None or readme_path is None:
            # Record the sample name; wav_folder itself may be None here.
            skipped.append(sample)
            continue

        meta = parse_readme(readme_path)
        if 'gender' not in meta:
            skipped.append(sample)
            continue

        try:
            for wav in os.listdir(wav_folder):
                # sr=None keeps the sampling rate stored in the file.
                y, fs = librosa.load(wav_folder / wav, sr=None)
                # Cut leading and trailing silence before feature extraction.
                y, idx = librosa.effects.trim(y)
                # Keyword arguments: recent librosa releases reject positional y/sr.
                mfcc = librosa.feature.mfcc(y=y, sr=fs)
                # Store as C-contiguous float64 so the blob can be read back with
                # np.frombuffer (which defaults to float64) and reshaped to
                # (n_mfcc, n_frames), whatever dtype/layout librosa returned.
                cepstrum = np.ascontiguousarray(mfcc, dtype=np.float64)

                db.execute('''INSERT INTO features VALUES
             (?,?,?,?,?,?,?,?,?)''',
                    (wav[:-4],  # file name minus its 4-character extension (assumes '.wav')
                     str(wav_folder),
                     int(meta['gender'] == 'female'),
                     meta.get('age range', ''),
                     meta.get('language', ''),
                     meta.get('pronunciation dialect', ''),
                     cepstrum.tobytes(),
                     cepstrum.shape[0],
                     cepstrum.shape[1]))

            # One commit per sample folder.
            conn.commit()
        except Exception as e:
            # Best effort: report the failing folder and carry on with the next one.
            print(wav_folder)
            print("type error: " + str(e))
            print(traceback.format_exc())
finally:
    # Close the connection even if the loop is interrupted.
    conn.close()

print(datetime.datetime.now())
print(len(skipped))

2019-03-25 21:43:11.750068
2019-03-25 22:04:53.244463
127


## Split into data sets
Known problem: Will not return same data set every time since sqlite does not support seeded random. Should be done in python instead.

Trade-off: the data set is unbalanced. Most subjects have 10 files, but some have many more (max 530), and there are only 430 females versus over 5000 males. Ideally the same subject would not appear in both the training and the test set, and subjects would be sampled for equal representation, but this will have to do for now! (The target has been equal sampling of male/female.)

Could create validation data set by same (improved) method if required.

In [52]:
# Draw a gender-balanced random sample from `features` and split it into a
# training half and a test half per gender.
conn = sqlite3.connect(DB)
try:
    db = conn.cursor()
    females = select('''SELECT * FROM features WHERE female == 1 ORDER BY RANDOM()''', db)
    # LIMIT is filled in from len(), i.e. always an integer computed here.
    males = select('''SELECT * FROM features WHERE female == 0 ORDER BY RANDOM() LIMIT {}'''.format(len(females)), db)
    conn.commit()
finally:
    # Close the connection even if one of the queries fails.
    conn.close()

# Rows per gender in each split, capped by the smaller gender so that all four
# sets have the same size.
half = min(len(females), len(males)) // 2

# The test slices start at `half` (not `half + 1`) so that no row is silently
# dropped between the training and the test set.
train_fem = females[:half]
test_fem = females[half:2 * half]
train_mal = males[:half]
test_mal = males[half:2 * half]

Also, clips are not the same length so a fixed number of random frames are selected per clip.

In [69]:
# Number of frames (MFCC columns) drawn at random from a clip.
n_frames = 25

# Exploratory check: only the first training clip is processed and printed.
for s in train_fem[:1]:
    # Row columns 6, 7 and 8: raw cepstrum bytes and the two matrix dimensions.
    cepstrum_bytes, n_rows, n_cols = s[6], s[7], s[8]
    mfcc = np.frombuffer(cepstrum_bytes).reshape((n_rows, n_cols))
    # NOTE(review): scale() is called with its defaults; confirm that the axis
    # it standardises along is the intended one for a (n_mfcc, n_frames) matrix.
    mfcc = preprocessing.scale(mfcc)

    # Every row index except the first one.
    row_ix = np.arange(1, mfcc.shape[0])
    print(row_ix)

    # n_frames distinct column indices, drawn without replacement.
    col_ix = np.random.choice(np.arange(mfcc.shape[1]), (1, n_frames), replace=False)
    print(col_ix)
    print(mfcc[np.ix_(row_ix, col_ix[0])])
    

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[[ 99  91  32  76 105  16  87  78  43  51  74 110  44  26  17  55  59  38
   75  37   3  33  27 113   0]]
[[-8.60042089e-03  2.84286629e+00  2.16576692e+00  2.91479688e+00
   1.93330353e+00  1.78467435e+00  2.06177452e+00  2.73382681e+00
   5.69522210e-01  1.58614436e+00  2.51166528e+00  2.26736594e+00
   8.91101645e-01  2.93814112e+00  1.44141537e+00  2.32882984e+00
   1.97598767e+00  1.81793325e+00  2.72225100e+00  2.69187637e+00
   2.45152501e-01  2.66240742e+00  2.76943401e+00  7.75363196e-01
   8.71286352e-01]
 [ 1.88463546e-01 -1.82672304e+00  1.16282732e-01 -4.70045515e-01
  -4.28173495e-01  3.12656363e-01 -2.64784488e-01 -3.53314225e-01
   3.24827112e-01  2.17953564e-01 -1.86445276e-01  6.10978056e-01
   1.01835077e+00 -1.89171247e+00  7.05080293e-01 -4.40772740e-01
   4.85999248e-01 -2.32159320e-01 -3.48121179e-01 -8.77322617e-01
   2.08632020e-01 -5.39155984e-01 -1.45963180e+00  5.57682961e-01
   4.75121746e-01]
 [ 7.

In [None]:
## remember that you might have to transpose mfcc to train!

In [7]:
## Possibly revisit narrower frames:
#spec = librosa.feature.melspectrogram(y=y, sr=fs, S=None, n_fft=400, hop_length=160, power=2.0)
#mfcc1 = librosa.feature.mfcc(y, fs, S=numpy.log(spec))

#librosa.display.specshow(mfcc1, sr=fs, x_axis='time', hop_length=160)
#librosa.display.specshow(mfcc2, sr=fs, x_axis='time')

#IPython.display.Audio(y, rate=fs)




#db.execute('''SELECT * FROM features''')
#res = db.fetchall()
#print(res)
#mfcc2 = numpy.frombuffer(res[0][6])
#mfcc2 = numpy.reshape(mfcc2,(res[0][7], res[0][8]))
#librosa.display.specshow(mfcc2, sr=16000, x_axis='time')
#break

## Additional Improvements

ML:
- Make use of other features - potentially other classification methods
- Narrower frames of e.g. 25 ms
  ```python
  #spec = librosa.feature.melspectrogram(y=y, sr=fs, S=None, n_fft=400, hop_length=160, power=2.0)
  #mfcc1 = librosa.feature.mfcc(y, fs, S=numpy.log(spec))
  ```

Coding
- Error and type checks in functions, can really only be used in known context