# 用SVM对特征进行分类
2019年02月27日

In [71]:
import sklearn
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import os
import os.path as path
import pickle
from tqdm import tqdm as progress

## Some Constants

In [52]:
# Conversion factor between time and feature frames (frames per millisecond).
FRAME_MS_RATIO = 0.09380235476687636

# Sub-sampling geometry; every value below is a whole number of frames.
offset = int(FRAME_MS_RATIO * 2000)    # where sub-sampling starts (2000 ms, i.e. 2 s, in)
duration = int(FRAME_MS_RATIO * 6000)  # maximum length of the sub-sampled range (6000 ms)
unit = int(FRAME_MS_RATIO * 80)        # length of a single subsample (80 ms)
stride = unit // 2                     # step between consecutive subsamples

## Subsampling from `.ftr` Files

In [45]:
def subsampling(mfcc: object, offset, duration, unit, stride=None):
    '''
    Cut a feature matrix into fixed-width, possibly overlapping windows
    along its second (frame) axis.

    mfcc: 2-D array shaped like (40, n_frames); must expose `.shape` and
        support `mfcc[:, a:b]` slicing
    offset: index of the first frame of the first window
    duration: length, in frames, of the range the windows are drawn from
    unit: width of every window, in frames
    stride: step between window starts, in frames; defaults to half a
        window (`unit // 2`), but never less than one frame
    return: list of subsamples, each exactly `unit` frames wide
    raises ValueError: if `unit` or `stride` is not positive
    '''
    if unit <= 0:
        raise ValueError('unit must be a positive number of frames, got %r' % (unit,))
    if stride is None:
        # unit // 2 is 0 for a one-frame window; a zero step would never
        # advance the loop below.
        stride = max(1, unit // 2)
    if stride <= 0:
        raise ValueError('stride must be a positive number of frames, got %r' % (stride,))

    n_frames = mfcc.shape[1]
    high = offset + duration
    subsamples = []
    left = offset
    right = left + unit
    # `right < high` is deliberately strict: a window ending exactly at
    # offset + duration is not emitted, which keeps window counts unchanged.
    # `right <= n_frames` stops at the end of the data, so a short input
    # never yields truncated or empty windows (they would give feature
    # vectors of differing lengths).
    while right < high and right <= n_frames:
        subsamples.append(mfcc[:, left : right])
        left += stride
        right += stride
    return subsamples

In [48]:
def suffix_filter(files, suffix):
    '''
    Select the names in `files` that end with `suffix`.

    files: iterable of file-name strings
    suffix: string (or tuple of strings) handed to `str.endswith`
    return: list of the matching names, in their original order
    '''
    # Build a real list, as documented: callers can take len() of it, index
    # it, or iterate it more than once -- a lazy `filter` object allows none
    # of those.
    return [name for name in files if name.endswith(suffix)]

def scan_dir(file_dir, label):
    '''
    Load every `.ftr` file directly inside file_dir and cut each one into
    subsamples.

    file_dir: directory to scan
    label: label attached to every subsample taken from this directory
    return: (samples, labels) -- two lists of equal length; labels holds
        one copy of `label` per entry in samples

    Windowing uses the module-level `offset`, `duration`, `unit` and
    `stride` settings.
    '''
    samples = []
    labels = []

    # Files are addressed through file_dir instead of os.chdir()-ing into
    # it, so the process working directory is never left changed when a
    # file fails to load.
    # list(...) gives the progress bar a length to report against.
    files = list(suffix_filter(os.listdir(file_dir), '.ftr'))
    for filename in progress(files):
        with open(os.path.join(file_dir, filename), 'rb') as f:
            # NOTE: unpickling can execute arbitrary code; only point this
            # at .ftr files from a trusted source.
            mfcc = pickle.load(f)
        subs = subsampling(mfcc, offset, duration, unit, stride)
        samples.extend(subs)
        labels.extend([label] * len(subs))

    return samples, labels

In [None]:
# Switch the working directory; the relative 'Positive/' and 'Negative/'
# paths used in the next cell are resolved from here.
os.chdir('../Data/Sounds/yzc/')
os.listdir()  # notebook display: contents of the new working directory

In [53]:
# Load the subsamples of each class: label '+' for Positive/, '-' for Negative/.
p_samples, p_labels = scan_dir('Positive/', '+')
n_samples, n_labels = scan_dir('Negative/', '-')

36it [00:00, 1000.89it/s]
36it [00:00, 924.36it/s]


In [56]:
# Sanity check: array shape of the positive subsamples and of their labels.
print(np.array(p_samples).shape)
print(np.array(p_labels).shape)

(6660, 40, 7)
(6660,)


In [59]:
# Merge both classes (positives first); samples and labels stay index-aligned.
samples = p_samples + n_samples # dataset
labels = p_labels + n_labels

## Flatten Features

In [63]:
# Collapse each subsample into a 1-D feature vector.
flattened_samples = [sample.flatten() for sample in samples]
np.array(flattened_samples).shape  # notebook display: (number of samples, features per sample)

(13320, 280)

In [64]:
# Hold out 10% of the shuffled subsamples for testing.
# NOTE(review): windows cut from the same .ftr file overlap (the stride is
# half a window) and are split individually here, so near-duplicate windows
# can land on both sides of the split and inflate the test score -- consider
# splitting by source file instead.
# NOTE(review): no random_state is passed, so the split (and the scores
# below) differ from run to run.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(flattened_samples, labels, shuffle=True, test_size=0.1)

In [70]:
# Number of samples in the training and test splits.
print(len(X_train1))
print(len(X_test1))

11988
1332


In [78]:
# Fit an RBF-kernel SVM on the flattened features and print its score on
# both the training and the held-out split.
clf = SVC(kernel='rbf', gamma=3e-5)
clf.fit(X_train1, Y_train1)
print('train score:', clf.score(X_train1, Y_train1))
print('test  score:', clf.score(X_test1, Y_test1))

train score: 0.9764764764764765
test  score: 0.93993993993994


### Save Model

In [81]:
# Persist the fitted classifier with pickle.
# NOTE(review): the file name presumably encodes the rounded train/test
# scores (98/94) -- confirm before reusing it for a retrained model.
with open('../../../voice/svm98-94.clf', 'wb') as f:
    pickle.dump(clf, f)

### 前两维特征可视化

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Averaged features: each subsample reduced along axis 1, computed the same
# way as `avg_samples_` is for the transfer set. Nothing else in this
# notebook assigns `avg_samples`, so without this line the cell stops with a
# NameError.
avg_samples = [sample.mean(axis=1) for sample in samples]

# Separate the averaged features by class ('+' vs. everything else) so each
# class can be drawn in its own colour.
class_p = np.array([vec for vec, tag in zip(avg_samples, labels) if tag == '+'])
class_n = np.array([vec for vec, tag in zip(avg_samples, labels) if tag != '+'])

In [None]:
# Scatter columns 0 and 1 of each class: positive in red, negative in blue.
plt.scatter(class_p[:, 0], class_p[:, 1], c='red')
plt.scatter(class_n[:, 0], class_n[:, 1], c='blue')
plt.show()

### PCA主成分可视化

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit PCA (no component limit given) on the averaged features and project them.
pca = PCA()
X = pca.fit_transform(avg_samples)

In [None]:
pca.explained_variance_ratio_ # share of the variance explained by each principal component

In [None]:
# Separate the PCA-projected rows by class label: '+' rows go to class_p,
# every other row to class_n.
class_p = np.array([row for row, tag in zip(X, labels) if tag == '+'])
class_n = np.array([row for row, tag in zip(X, labels) if tag != '+'])

In [None]:
# Scatter the first two columns of the PCA projection: positive in red,
# negative in blue.
plt.scatter(class_p[:, 0], class_p[:, 1], c='red')
plt.scatter(class_n[:, 0], class_n[:, 1], c='blue')
plt.show()

## 迁移学习

In [None]:
# Switch to the MP3 data directory (relative to the current working
# directory); the next cell reads its Positive/ and Negative/ sub-folders.
os.chdir('../../MP3')

In [None]:
# Build the transfer set: one sample per .ftr file, labelled by its folder.
# NOTE(review): `select_mid` and `time_len` are not defined anywhere in the
# visible notebook -- they must already exist in the session.
samples_, labels_ = [], []

# The folders are entered by relative chdir, in this order; the working
# directory is left at Negative/ afterwards.
for class_dir, class_label in (('Positive/', '+'), ('../Negative/', '-')):
    os.chdir(class_dir)
    for filename in progress(os.listdir()):
        if not filename.endswith('.ftr'):
            continue
        with open(filename, 'rb') as f:
            sample = pickle.load(f)
        sample = select_mid(sample, time_len)
        samples_.append(sample)
        labels_.append(class_label)

In [None]:
# Train a linear SVM on the averaged features, then score it on its own
# train/test split and on the transfer set.
# NOTE(review): `avg_samples_` is only assigned in a later cell, so this
# cell fails when the notebook is run top to bottom.
# NOTE(review): scikit-learn documents `gamma` as the coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels, so it presumably has no effect with
# kernel='linear' -- confirm and drop it if so.
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(avg_samples, labels, shuffle=True, test_size=0.1)
clf2 = SVC(kernel='linear', gamma=1e-5)
clf2.fit(X_train2, Y_train2)
print(clf2.score(X_train2, Y_train2))
print(clf2.score(X_test2, Y_test2))
print('>>', clf2.score(avg_samples_, labels_))  # score on the transfer set

In [None]:
# Average features: reduce every transfer-set sample along axis 1.
avg_samples_ = [sample.mean(axis=1) for sample in samples_]

In [None]:
# Flatten features: turn every transfer-set sample into a 1-D vector.
# (The "samlpes" spelling is the name the scoring cell below reads.)
flatten_samlpes_ = [sample.flatten() for sample in samples_]

In [None]:
# Score the flattened-feature classifier on the transfer set. `clf` is the
# SVC fitted on the flattened training features; the notebook defines no
# `clf1`, so that name raised NameError.
clf.score(flatten_samlpes_, labels_)

## Todo
- 哪些容易被错分？
- 一个源音频提供多个.ftr
- 50% overlap

In [16]:
# Scratch check: keep only the items that are at least 2, then clean up.
a = [1, 2, 3]
print([x for x in a if x >= 2])
del a

[2, 3]
