# Gender Classificaton


## Imports

In [1]:
import os
import sys
from scrape import get_remote_tgz_files, download_extract_files
from parse import paths, parse_readme
from pathlib import Path
import sqlite3
import librosa
import librosa.display
import IPython.display
import numpy as np
from dbhelpers import select
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture as GMM
from features import extract_mfcc_vectors

## Input variables

In [2]:
SOURCE_URL = '''http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/'''
DATA_DIR = '''data'''
DB = '''genderclass.db'''

## Scrape data
Only files not already downloaded and extracted will be fetched.

In [3]:
data_folder = Path(DATA_DIR)
urls = get_remote_tgz_files(SOURCE_URL)
download_extract_files(urls, data_folder)

## Create DB to hold features

In [4]:
conn = sqlite3.connect(DB)
db = conn.cursor()
db.execute('''DROP TABLE IF EXISTS features''')
db.execute('''CREATE TABLE features
                     (file text, 
                      path text, 
                      female integer, 
                      age text, 
                      language text, 
                      dialect text,
                      cepstrum blob,
                      n_mfcc integer,
                      n_frames integer)''')
conn.commit()
conn.close()

## Extract features
Silence trimmed at beginning and end of each file.

MFCC extracted (with default librosa settings).

Save into database.

(skipped 127 records due to lack of parsable readme or wav-folder, takes about 20 min to run)

In [5]:
import datetime
print(datetime.datetime.now())

data_folder = Path(DATA_DIR)
conn = sqlite3.connect(DB)
db = conn.cursor()
samples = os.listdir(data_folder)

skipped = []
for sample in samples:
    (wav_folder, readme_path) = paths(data_folder, sample)

    if(wav_folder is None or readme_path is None):
        skipped.append(wav_folder)
        continue;

    meta = parse_readme(readme_path)
    if(not 'gender' in meta):
        skipped.append(wav_folder)
        continue;
    try:
        wavs = os.listdir(wav_folder)
        for wav in wavs:
            y, fs = librosa.load(wav_folder / wav, sr=None)
            y, idx = librosa.effects.trim(y)
            mfcc = librosa.feature.mfcc(y, fs)

            db.execute('''INSERT INTO features VALUES
             (?,?,?,?,?,?,?,?,?)''', 
                (wav[:-4], 
                str(wav_folder), 
                meta['gender'] == 'female', 
                meta.get('age range', ''), 
                meta.get('language', ''), 
                meta.get('pronunciation dialect',''),
                bytes(memoryview(mfcc)),
                mfcc.shape[0],
                mfcc.shape[1]))

        conn.commit()
    except Exception as e:
        print(wav_folder)
        print("type error: " + str(e))
        print(traceback.format_exc())

conn.close()
print(datetime.datetime.now())
print(len(skipped))

2019-03-25 21:43:11.750068
2019-03-25 22:04:53.244463
127


## Split into data sets
Known problem: Will not return same data set every time since sqlite does not support seeded random. Should be done in python instead.

Trade off: unbalanced data set with most subjects having 10 files, but there are those with a lot more (max 530). Only 430 females and over 5000 males. Optimally would be good if not same subject was present in training and test set, and if subjects were sampled to an equal representation, but this will have to do for now! (target has been equal sampling of male/female). 

Could create validation data set by same (improved) method if required.

In [6]:
conn = sqlite3.connect(DB)
db = conn.cursor()
females = select('''SELECT * FROM features WHERE female == 1 ORDER BY RANDOM()''', db)
males = select('''SELECT * FROM features WHERE female == 0 ORDER BY RANDOM() LIMIT {}'''.format(len(females)), db)

conn.commit()
conn.close()

l = int(len(females) / 2)

train = (males[0:l], females[0:l])
test = (males[l+1:2*l], females[l+1:2*l])

## Train one GMM model per gender
Also, clips are not the same length so a fixed number of random frames are selected per clip.

Skip first coefficient since that represents average power, but normalize before.

In [12]:
models = []
frames_per_speaker = 2
for gender in train:
    features = extract_mfcc_vectors(gender, frames_per_speaker)
    
    gmm = GMM(n_components = 4, max_iter = 200, covariance_type='diag',n_init = 3)
    gmm.fit(features)
    
    models.append(gmm)

NameError: name 'arrange' is not defined

## Evaluate

In [23]:
#for gender in np.arange(len(models)):

scores = None
log_likelihood = np.zeros(len(models))

    features = extract_mfcc_vectors(test[i], frames_per_speaker)

for i in range(len(models)):
    
    gmm    = models[i]         #checking with each model one by one
    scores = np.array(gmm.score(features))
    print(len(scores))
    #log_likelihood[i] = scores.sum()
#winner = np.argmax(log_likelihood)

#print(winner)


TypeError: len() of unsized object

In [None]:
## remember taht you might have to transpose mfcc to train!

In [7]:
## Possibly rewisit narrower frames:
#spec = librosa.feature.melspectrogram(y=y, sr=fs, S=None, n_fft=400, hop_length=160, power=2.0)
#mfcc1 = librosa.feature.mfcc(y, fs, S=numpy.log(spec))

#librosa.display.specshow(mfcc1, sr=fs, x_axis='time', hop_length=160)
#librosa.display.specshow(mfcc2, sr=fs, x_axis='time')

#IPython.display.Audio(y, rate=fs)




#db.execute('''SELECT * FROM features''')
#res = db.fetchall()
#print(res)
#mfcc2 = numpy.frombuffer(res[0][6])
#mfcc2 = numpy.reshape(mfcc2,(res[0][7], res[0][8]))
#librosa.display.specshow(mfcc2, sr=16000, x_axis='time')
#break

## Additional Improvements

ML:
- Make use of other features - potentially other classification methods
- Narrower frames of e.g. 25 ms
#spec = librosa.feature.melspectrogram(y=y, sr=fs, S=None, n_fft=400, hop_length=160, power=2.0)
#mfcc1 = librosa.feature.mfcc(y, fs, S=numpy.log(spec))

Coding
- Error and type checks in functions, can really only be used in known context
- Latter workbook cells not independent, should use db between cells and not variables




Thoughts:
Fit one gmm per class instead of one gaussian per class and evalate likelihood since there the MFCCs per gender wont be one gaussian each? Could then just have used median frequency?