# svc训练方案2.0

### 训练时对yzc训练集进行大batch的采样，测试时对测试集进行小batch的采样

In [1]:
import sklearn
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import os
import os.path as path
from tqdm import tqdm as progress
import random
import pickle

## Some Constants

In [2]:
# Conversion factor: frames per millisecond.
FRAME_MS_RATIO = 0.09380235476687636

# Sub-sampling parameters, all expressed in frames.
offset = int(FRAME_MS_RATIO * 2000)    # offset of subsampling (2 s in this example)
duration = int(FRAME_MS_RATIO * 6000)  # maximum length of the subsampling range
unit = int(FRAME_MS_RATIO * 80)        # length of a single subsample
stride = unit // 2                     # step between consecutive subsamples

## Subsampling from `.ftr` Files

In [3]:
# IO support
def load_from_file(filename):
    '''
    unpickle and return the object stored in filename
    NOTE: pickle can run arbitrary code while loading; only open trusted files
    '''
    with open(filename, 'rb') as stream:
        return pickle.load(stream)

def save_to_file(obj, filename):
    '''
    pickle obj into filename (opened in 'wb' mode, so existing content is replaced)
    '''
    with open(filename, 'wb') as stream:
        pickle.dump(obj, stream)
        
def suffix_filter(files, suffix):
    '''
    return list of files with given suffix
    files: iterable of file names (str)
    suffix: str (or tuple of str) handed to str.endswith
    the result is a real list (not a lazy filter object), so it has a length
    and can be iterated more than once
    '''
    return [name for name in files if name.endswith(suffix)]

def show_shape(obj):
    '''
    return the shape tuple obj has after conversion to a numpy array
    '''
    as_array = np.array(obj)
    return as_array.shape

In [4]:
def subsampling(mfcc: object, offset, duration, unit, stride=None):
    '''
    cut fixed-width windows out of mfcc along its second axis
    mfcc: 2-D array, shape like (40, xxx)
    offset: index of the first frame of the sampled range
    duration: maximum length of the sampled range, in frames
    unit: width of every subsample, in frames (must be >= 1)
    stride: step between window starts, in frames
            (default: unit // 2, but never less than 1)
    return: list of subsamples, each mfcc[:, left : left + unit]
    raise: ValueError if unit or stride is smaller than 1
    '''
    if unit < 1:
        raise ValueError('unit must be at least 1 frame, got %r' % (unit,))
    if stride is None:
        # unit // 2 is 0 for unit == 1, which would never advance the window
        stride = max(1, unit // 2)
    elif stride < 1:
        raise ValueError('stride must be at least 1 frame, got %r' % (stride,))

    # Do not run past the end of the data: slicing beyond it would silently
    # produce narrower (or empty) windows that cannot be stacked later.
    high = min(offset + duration, np.shape(mfcc)[1])

    # A window starting at `left` fits when left + unit <= high.
    return [mfcc[:, left : left + unit]
            for left in range(offset, high - unit + 1, stride)]

def get_batches(subsamples, batch_size):
    '''
    group consecutive subsamples into batches of exactly batch_size items
    batch-size is suggested to be an odd number
    leftover subsamples that do not fill a whole batch are dropped
    return a list of batches (each batch is a list)
    '''
    grouped = []
    pending = []
    for item in subsamples:
        pending.append(item)
        if len(pending) != batch_size:
            continue
        # batch is full: store it and start collecting the next one
        grouped.append(pending)
        pending = []
    return grouped

In [5]:
def scan_single_label_dir_batchly(file_dir, batch_size, label):
    '''
    scan .ftr files in file_dir and return (batches, labels, names)
    file_dir: directory holding pickled feature files ending in '.ftr'
    batch_size: number of subsamples per batch (see get_batches)
    label: class label attached to every batch found in file_dir
    labels are copies of label, names are copies of the source file name;
    all three returned lists have one entry per batch
    the module-level offset, duration, unit and stride are read at call time
    '''
    batches, labels, names = [], [], []

    # Work with explicit paths instead of os.chdir(): the working directory is
    # never touched, so a failing load cannot leave the kernel somewhere else.
    # os.listdir() order is arbitrary, so sort to get a reproducible order.
    files = sorted(suffix_filter(os.listdir(file_dir), '.ftr'))
    for filename in progress(files):
        mfcc = load_from_file(path.join(file_dir, filename))
        subs = subsampling(mfcc, offset, duration, unit, stride)
        new_batches = get_batches(subs, batch_size)
        batches += new_batches
        labels += [label] * len(new_batches)
        names += [filename] * len(new_batches)

    return batches, labels, names

In [6]:
def scan_dir_batchly(wkdir, batch_size, shuffle=False, random_seed=1273):
    '''
    wkdir has '/Positive/' and '/Negative/' directory
    batch_size: number of subsamples per batch
    shuffle: when True, batches, labels and names are shuffled with the same
             permutation, so entries at the same index still belong together
    random_seed: seed of that permutation
    return batches, labels and descriptions (source file name of each batch)
    raise: FileNotFoundError if wkdir lacks 'Positive' or 'Negative'
    '''
    positive_dir = path.join(wkdir, 'Positive')
    negative_dir = path.join(wkdir, 'Negative')
    # Explicit check instead of assert: asserts vanish under `python -O`.
    for required in (positive_dir, negative_dir):
        if not path.exists(required):
            raise FileNotFoundError('missing directory: %s' % required)

    # Paths are joined rather than os.chdir()-ed into, so the working
    # directory is the same before and after the call, even on errors.
    p_batches, p_labels, p_names = scan_single_label_dir_batchly(positive_dir, batch_size, '+')
    n_batches, n_labels, n_names = scan_single_label_dir_batchly(negative_dir, batch_size, '-')

    batches = p_batches + n_batches
    labels = p_labels + n_labels
    names = p_names + n_names

    if shuffle:
        for obj in (batches, labels, names):
            # A fresh generator with the same seed applies the same permutation
            # to each equally long list, without reseeding the global
            # `random` module state.
            random.Random(random_seed).shuffle(obj)

    return batches, labels, names
    

## 训练模型

首先导入训练数据

In [9]:
# Move into the yzc data directory. The path is relative to the current
# working directory, so re-running this cell in the same kernel session would
# resolve it from the new location instead of the original one.
os.chdir('../Data/Sounds/yzc/')

In [10]:
# Training uses large batches: 51 consecutive subsamples per batch.
train_batches, train_labels, train_names = scan_dir_batchly(
    'Train/', batch_size=51, shuffle=True
)
show_shape(train_batches)

33it [00:00, 808.09it/s]
32it [00:00, 786.49it/s]


(195, 51, 40, 7)

`train_batches`的维度依次是：batch, unit, mfcc, timeframe

下面对训练集的每个batch的unit做平均，并压扁频率、时间维度

In [11]:
# Average over axis 1 (the subsamples of each batch), then flatten every
# averaged sample into a single feature vector.
train_units = [averaged.flatten() for averaged in np.mean(train_batches, axis=1)]
show_shape(train_units)

(195, 280)

### 下面开始训练...

In [12]:
# Fit an RBF-kernel SVM on the averaged training vectors and report its
# accuracy on that same training data (fit() returns the estimator itself).
clf = SVC(kernel='rbf', gamma=1e-8, C=1.0).fit(train_units, train_labels)
clf.score(train_units, train_labels)

0.5076923076923077

### 下面，在yzc的测试集和zfs的全集上测试

导入测试数据，用小batch

In [13]:
# Testing uses small batches: 5 subsamples vote on each prediction.
test1_batches, test1_labels, test1_names = scan_dir_batchly(
    'Test/', batch_size=5, shuffle=True, random_seed=1
)
show_shape(test1_batches)

3it [00:00, 713.32it/s]
4it [00:00, 699.69it/s]


(259, 5, 40, 7)

In [14]:
# Step up one directory level; the next cell scans the 'zfs/' directory by a
# path relative to this location.
# NOTE(review): os.chdir() returns None, yet the recorded output of this cell
# looks like a directory listing - possibly a trailing os.listdir() was
# removed; confirm.
os.chdir('..')

['yzc', '.DS_Store', '.vtdata', 'zfs', '.backup', '.nomedia']

In [15]:
# Second test set: everything under zfs/, again in batches of 5 subsamples.
test2_batches, test2_labels, test2_names = scan_dir_batchly(
    'zfs/', batch_size=5, shuffle=True, random_seed=13
)
show_shape(test2_batches)

35it [00:00, 696.05it/s]
35it [00:00, 636.73it/s]


(2590, 5, 40, 7)

定义投票决策机制

In [16]:
def predict_batchly(clf, batch):
    '''
    predict one class label for a whole batch by majority vote
    every sample of the batch is flattened and classified on its own;
    '+' is returned only when it gets strictly more votes than '-',
    so a tie yields '-' (hence batch-size is suggested to be an odd number)
    clf: svm classifier
    batch: one batch, shape like (xxx, 40, 7)
    '''
    votes = clf.predict([sample.flatten() for sample in batch])  # one vote per sample
    positives = (votes == '+').sum()
    negatives = (votes == '-').sum()
    if positives > negatives:
        return '+'
    return '-'

定义评价机制，可以输出错误的情况有哪些

In [35]:
def score_batches(clf, batches, labels, names=None):
    '''
    score a classifier's performance on batches
    clf: classifier handed to predict_batchly
    batches: list of batches
    labels: expected label of each batch
    names: optional description of each batch (e.g. its source file)
    return: acc (float number) when names is None,
            otherwise (acc, list of names of the misclassified batches)
    raise: ValueError if batches is empty (accuracy is undefined)
    '''
    total = len(batches)
    if total == 0:
        raise ValueError('cannot score an empty list of batches')

    # Identity test: `names == None` would compare element-wise (and then fail
    # in a boolean context) if names were a numpy array.
    track_names = names is not None
    name_column = names if track_names else [None] * total

    correct = 0
    incorrect_cases = []
    for batch, label, name in zip(batches, labels, name_column):
        if predict_batchly(clf, batch) == label:
            correct += 1
        elif track_names:
            incorrect_cases.append(name)

    accuracy = correct / total
    return (accuracy, incorrect_cases) if track_names else accuracy

In [18]:
# Accuracy on the yzc test set, plus the file names behind every wrong batch.
score1, incorrect_cases1 = score_batches(
    clf, test1_batches, test1_labels, names=test1_names
)
score1

0.42857142857142855

In [19]:
# Accuracy on the zfs set, plus the file names behind every wrong batch.
score2, incorrect_cases2 = score_batches(
    clf, test2_batches, test2_labels, names=test2_names
)
score2

0.5

## 效果不理想，下面将集中调参

In [25]:
# Go back into 'yzc/'; the cells below use 'Train/', 'Test/' and '../zfs/',
# all relative to this directory. Like every relative chdir, this is only
# correct when run once from the expected location.
os.chdir('yzc/')

下面两个语句块可以多次反复运行

In [105]:
# Sampling parameters, all in frames. scan_single_label_dir_batchly reads these
# module-level names, so they must be assigned before the scans below.
offset = int(FRAME_MS_RATIO * 2000)    # offset of subsampling (2 s in this example)
duration = int(FRAME_MS_RATIO * 6000)  # maximum length of the subsampling range
unit = int(FRAME_MS_RATIO * 100)       # length of a single subsample window
stride = unit // 2                     # step of the sliding window

test1_batches, test1_labels, test1_names = scan_dir_batchly(
    'Test/', batch_size=5, shuffle=True, random_seed=1
)
print('yzc test size:', show_shape(test1_batches))

test2_batches, test2_labels, test2_names = scan_dir_batchly(
    '../zfs/', batch_size=5, shuffle=True, random_seed=13
)
print('zfs test size:', show_shape(test2_batches))

3it [00:00, 1069.88it/s]
4it [00:00, 1827.19it/s]
35it [00:00, 1554.99it/s]
35it [00:00, 1243.86it/s]

yzc test size: (189, 5, 40, 9)
zfs test size: (1890, 5, 40, 9)





In [111]:
train_batch_size = 11  # number of subsamples averaged into one training vector
svm_config = dict(gamma=1e-6, C=1, random_state=10)  # SVM parameters

train_batches, train_labels, train_names = scan_dir_batchly(
    'Train/', train_batch_size, shuffle=True
)
# Mean over the subsamples of each batch, then flatten to a feature vector.
train_units = [averaged.flatten() for averaged in np.mean(train_batches, axis=1)]
print('train size:', show_shape(train_batches), '\n')

clf = SVC(kernel='rbf', **svm_config).fit(train_units, train_labels)
print('train score:    ', clf.score(train_units, train_labels))
print('yzc test score: ', score_batches(clf, test1_batches, test1_labels))
print('zfs test score: ', score_batches(clf, test2_batches, test2_labels))

33it [00:00, 1746.02it/s]
32it [00:00, 1437.93it/s]


train size: (780, 11, 40, 9) 

train score:     0.8179487179487179
yzc test score:  0.7566137566137566
zfs test score:  0.7126984126984127
