# 特征提取

---------------------------------
### 语音流程度特征提取



安装依赖
```
pip install librosa
```

In [2]:
import os
import librosa
import numpy as np

In [3]:

def MaxMinNormalization(x):
    """
    线性归一化，将输入list归一化
    :param x: list类型
    :return: 归一化list
    """
    x = (x - np.min(x)) / (np.max(x) - np.min(x))
    return x


def normalization(list):
    """
    归一化接口，目前只支持线性归一化
    :param list: 矩阵形式
    :return: 归一化矩阵
    """
    out = []
    for x in list:
        out.append(MaxMinNormalization(x))
    return out


def get_max(list):
    """
    提取音频序列中的极大值特征
    :param list，宽度固定为20维，长度不限
    :return:20维数组
    """
    average = []
    arr_temp = np.array(list)
    # arr_temp=np.dot(arr_temp,arr_temp.T)
    for a in arr_temp:
        average.append(max(a))
    # average.append(math.atan(max(a)) * 2 / 3.1415926)
    return average


def load(file):
    """
    输入文件名，加载数据
    :param file:文件名
    :return:浮点型数组
    """
    list = []
    f = open(file, 'r', encoding='UTF-8')
    for line in f:
        line_list = line.replace(',\n', '').split(',')
        for i in range(len(line_list)):
            line_list[i] = float(line_list[i])
        list.append(line_list)
    return list


def get_data():
    """
    获取所有数据，包括音频mfcc特征数据和标签数据，一共三个人的音频数据
    https://blog.csdn.net/u013378306/article/details/65954965  文档
    https://www.cnblogs.com/LXP-Never/p/11602510.html

    x_data:[[20],[20]]
    :return: x_data,y_data
    """
    x_data = []
    y_data = []
    src_path = 'dataset/test_chinese/'
    filename = os.listdir(src_path)
    for item in filename:  # 进入到文件夹内，对每个文件进行循环遍历
        y, sr = librosa.load(src_path + item)
        a = librosa.feature.mfcc(y=y, sr=sr)
        x_data.append(get_max(a))
        y_data.append(0)
    src_path = 'dataset/test_english/'
    filename = os.listdir(src_path)
    for item in filename:  # 进入到文件夹内，对每个文件进行循环遍历
        y, sr = librosa.load(src_path + item)
        a = librosa.feature.mfcc(y=y, sr=sr)
        x_data.append(get_max(a))
        y_data.append(1)
    print("OK")
    return x_data, y_data


def shuffer(x, y):
    """
    打乱数据
    :param x: [[20],[20]]
    :param y: [[0,1,0],[1,0,0]]]onehot数据
    :return: x_out，y_out打乱的数据
    """
    x_out = []
    y_out = []
    all = []
    for i in range(0, len(y)):
        all.append([x[i], y[i]])
    import random
    random.seed(0)
    random.shuffle(all)
    for item in all:
        x_out.append(item[0])
        y_out.append(item[1])
    return x_out, y_out


def data_split(x, y, rate):
    """
    通过设定训练集和验证集的比率，来调节数据
    :param x: 输入矩阵
    :param y: 输出矩阵
    :param rate: 浮点型0-1之间
    :return: train_data，test_data
    """
    num = int(rate * len(y))
    train_data = [x[:num], y[:num]]
    test_data = [x[num:], y[num:]]
    return train_data, test_data




In [4]:
def generate_train_data(input_file_dir):
    """

    :param input_file_dir: 声音文件保存路径
    :return:
    """
    x_data = []
    y_data = []
    label_count = 0
    for f in os.listdir(input_file_dir):
        print('foler: ', f)
        src_path = '{}/{}'.format(input_file_dir, f)
        file_list = os.listdir(src_path)



        for item in file_list:  # 进入到文件夹内，对每个文件进行循环遍历

            file_type = item.rsplit('.')[1]
            if file_type != 'mp3':
                continue


            file_path = '{}/{}'.format(src_path, item)
            print(file_path)
            y, sr = librosa.load(file_path)
            a = librosa.feature.mfcc(y=y, sr=sr)
            x_data.append(get_max(a))
            y_data.append(label_count)

        label_count += 1

    print('Train data size:', len(x_data))
    print(len(y_data))
    return x_data, y_data, label_count



def write_train_file(input_file_dir, generate_file_path):
    """
    生成训练文件
    :param input_file_dir:
    :param generate_file_path:
    :return:
    """

    x, y, label_count= generate_train_data(input_file_dir)

    x, y = shuffer(x, y)
    print('label_count: ', label_count)

    f=open('{}/{}'.format(generate_file_path, 'x.txt'), 'w', encoding='UTF-8')
    for line in x:
        for a in line :
            f.write(str(a)+',')
        f.write('\n')
    f.close()

    f = open('{}/{}'.format(generate_file_path, 'y.txt'), 'w', encoding='UTF-8')

    for line in y:
        a = int(line)
        for i in range(label_count):
            if i == a:
                f.write('1')
            else:
                f.write('0')
            if i < label_count-1:
                f.write(',')

        f.write('\n')
    f.close()
    print("OK")

###   提取特征

声音文件的组织方式

声音文件路径 ./test_data， 请改写

```
test-data
    voice-class-1/
        voice-1-01.mp3
        voice-1-02.mp3
        ...
    voice-class-2/
        ...
    voice-class-3/
        ...
    ...

```

In [7]:
%%time
target_dir = './target'
if not os.path.exists(target_dir): 
    os.mkdir(target_dir)
write_train_file('./test_data', target_dir)

foler:  z
/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/82.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/96.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/69.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/41.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/55.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/7.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/54.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/6.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/40.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/68.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/97.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/83.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/95.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/81.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/4.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/56.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/42.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/43.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/5.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/57.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/80.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/94.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/90.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/84.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/53.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/1.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/47.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/46.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/52.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/0.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/85.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/91.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/87.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/93.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/44.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/2.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/50.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/78.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/79.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/3.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/51.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/45.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/92.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/86.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/22.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/36.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/37.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/23.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/120.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/108.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/35.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/21.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/20.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/34.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/109.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/121.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/119.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/30.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/24.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/18.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/19.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/25.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/31.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/118.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/27.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/33.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/32.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/26.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/102.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/116.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/17.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/16.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/117.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/103.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/115.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/101.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/28.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/14.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/15.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/29.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/100.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/114.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/110.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/104.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/11.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/39.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/38.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/10.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/105.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/111.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/107.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/113.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/12.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/13.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/112.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/106.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/48.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/60.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/74.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/75.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/61.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/49.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/88.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/77.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/63.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/62.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/76.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/89.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/99.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/72.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/66.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/8.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/9.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/67.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/73.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/98.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/65.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/71.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/59.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/58.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/70.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/z/64.mp3




foler:  h
/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/82.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/96.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/69.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/41.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/55.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/7.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/54.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/6.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/40.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/68.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/97.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/83.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/95.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/81.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/4.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/56.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/42.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/43.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/5.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/57.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/80.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/94.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/90.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/84.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/53.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/1.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/47.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/46.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/52.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/0.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/85.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/91.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/87.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/93.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/44.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/2.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/50.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/78.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/79.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/3.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/51.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/45.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/92.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/86.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/22.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/36.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/37.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/23.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/120.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/108.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/35.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/21.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/20.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/34.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/109.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/119.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/30.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/24.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/18.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/19.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/25.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/31.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/118.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/27.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/33.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/32.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/26.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/102.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/116.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/17.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/16.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/117.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/103.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/115.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/101.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/28.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/14.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/15.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/29.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/100.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/114.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/110.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/104.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/11.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/39.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/38.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/10.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/105.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/111.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/107.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/113.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/12.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/13.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/112.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/106.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/48.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/60.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/74.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/75.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/61.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/49.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/88.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/77.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/63.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/62.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/76.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/89.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/99.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/72.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/66.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/8.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/9.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/67.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/73.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/98.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/65.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/71.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/59.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/58.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/70.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/h/64.mp3




foler:  w
/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/82.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/96.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/69.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/41.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/55.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/7.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/54.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/6.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/40.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/68.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/97.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/83.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/95.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/81.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/4.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/56.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/42.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/43.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/5.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/57.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/80.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/94.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/90.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/84.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/53.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/1.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/47.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/46.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/52.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/0.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/85.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/91.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/87.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/93.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/44.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/2.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/50.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/78.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/79.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/3.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/51.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/45.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/92.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/86.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/22.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/36.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/37.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/23.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/120.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/108.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/35.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/21.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/20.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/34.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/109.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/119.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/30.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/24.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/18.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/19.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/25.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/31.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/118.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/27.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/33.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/32.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/26.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/102.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/116.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/17.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/16.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/117.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/103.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/115.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/101.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/28.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/14.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/15.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/29.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/100.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/114.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/110.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/104.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/11.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/39.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/38.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/10.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/105.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/111.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/107.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/113.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/12.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/13.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/112.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/106.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/48.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/60.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/74.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/75.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/61.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/49.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/88.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/77.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/63.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/62.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/76.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/89.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/99.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/72.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/66.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/8.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/9.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/67.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/73.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/98.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/65.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/71.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/59.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/58.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/70.mp3




/Users/dikers/work/nwcd/001客户资料/001蓝拓扑/test_data/w/64.mp3




Train data size: 364
364
label_count:  3
OK
CPU times: user 14min 23s, sys: 13.8 s, total: 14min 37s
Wall time: 6min 3s


-------------------------------------
### 编译word2vec
```
cd word2vec
make
cp word2vec ../target
```



----------------------------------------------
###  同义词训练

生成词向量

raw-data/english_train.txt  包含训练用的英语文章

target/vec.txt 包含生成的词向量

```shell



cd shell
./train.sh   '../raw-data/english_train.txt'   '../target/vec.txt'

```

In [5]:
import numpy as np
import operator


class Synonym:
    def __init__(self, word_vec_file_path, threshold_rate=0.6):
        self.threshold_rate = threshold_rate
        self.word_vec_file_path = word_vec_file_path
        self.item_vec = self._load_item_vec(self.word_vec_file_path)

    @staticmethod
    def _load_item_vec(input_file):
        """
        Args:
            input_file: item vec file
        Return:
            dict key:itemid value:np.array([num1, num2....])
        """
        if not os.path.exists(input_file):
            return {}
        linenum = 0
        item_vec = {}
        fp = open(input_file)
        for line in fp:
            if linenum == 0:
                linenum += 1
                continue
            item = line.strip().split()
            if len(item) < 129:
                continue
            itemid = item[0]
            if itemid == "</s>":
                continue
            item_vec[itemid] = np.array([float(ele) for ele in item[1:]])
        fp.close()
        return item_vec

    def cal_item_sim(self, itemid):
        """
        Args
            item_vec:item embedding vector
            itemid:fixed itemid to clac item sim
            output_file: the file to store result
        """
        if itemid not in self.item_vec:
            return
        score = {}
        topk = 10
        fix_item_vec = self.item_vec[itemid]
        for tmp_itemid in self.item_vec:
            if tmp_itemid == itemid:
                continue
            tmp_itemvec = self.item_vec[tmp_itemid]
            fenmu = np.linalg.norm(fix_item_vec) * np.linalg.norm(tmp_itemvec)
            if fenmu == 0:
                score[tmp_itemid] = 0
            else:
                score[tmp_itemid] =  round(np.dot(fix_item_vec, tmp_itemvec)/fenmu, 3)
        out_str = itemid + "\t"
        # print(out_str)
        synonym_list = []
        for zuhe in sorted(score.items(), key=operator.itemgetter(1), reverse=True)[:topk]:
            if zuhe[1] > self.threshold_rate:
                synonym_list.append(zuhe)
                # print('{} :  {}'.format(zuhe[0], zuhe[1]))

        return synonym_list


In [8]:
synonym = Synonym("./target/vec.txt", 0.45)
synonym.cal_item_sim('answer')

[('encouragement', 0.498), ('unthinkable', 0.462), ('decision', 0.451)]

In [37]:
from __future__ import print_function
import time
import boto3
import json

# 大于threshold_rate 算相似
threshold_rate = 0.45

class FeatureExtract:


    def __init__(self, word_vec_file_path, right_content):
        self.comprehend_client = boto3.client('comprehend')
        self.word_type_name_list = ['NOUN', 'VERB']
        self.score_dict = self.get_score_dict()
        self.right_content = right_content
        self.synonym = Synonym("./target/vec.txt", threshold_rate)

    @staticmethod
    def get_score_dict():
        """
        测试数据， 后期可以从数据库读取
        """
        score_dict = dict()
        score_dict['184190001'] = 4.0
        score_dict['184190003'] = 3.0
        score_dict['184190010'] = 4.0
        score_dict['184190020'] = 4.0
        score_dict['184190045'] = 4.0

        score_dict['184190058'] = 3.0
        score_dict['184190071'] = 3.5
        score_dict['184190081'] = 4.0
        score_dict['184190109'] = 4.0
        score_dict['184190151'] = 3.5

        score_dict['184190170'] = 4.0
        score_dict['184190177'] = 4.0
        score_dict['184190189'] = 4.0
        score_dict['184190199'] = 3.5
        score_dict['184430141'] = 3.5
        return score_dict

    def read_json_file(self, file_path):
        """

        :param file_path:
        :return:
        """
        with open(file_path, "r") as f:
            new_dict = json.load(f)

        return new_dict['results']['transcripts'][0]['transcript']

    def create_word_dict(self, content):

        result = self.comprehend_client.detect_syntax(Text= content, LanguageCode='en')
        result = result['SyntaxTokens']
        word_type_dict = dict()

        for item in result:
            tag_name = item['PartOfSpeech']['Tag']
            if tag_name not in self.word_type_name_list:
                continue

            item_set = word_type_dict.get(tag_name)
            if item_set is None:
                item_set = set()

            item_set.add(item['Text'].lower())
            word_type_dict[tag_name] = item_set

        for item in word_type_dict.items():
            print('\t', item)
        return word_type_dict

    def read_all_file(self, file):
        """
        item（content, word_count ,word_dis_count,  word_type_dict ）
        :param file:
        :return:
        """

        print(self.right_content)
        word_dict = self.create_word_dict(self.right_content)
        _word_dict_list = list()

        count = 0
        for root, dirs, files in os.walk(file):
            for f in files:
                if not f.endswith('json'):
                    continue
                print('\n', os.path.join(root, f))
                content = self.read_json_file(os.path.join(root, f))
#                 if count >10:
#                     continue
#                 count +=1
                word_count = len(content.split(' '))
                word_dis_count = len(set(content.split(' ')))
                print('word_count{}  word_dis_count {}'.format( word_count, word_dis_count))
                word_type_dict = (f.split('.')[0], word_count, word_dis_count,  self.create_word_dict(content), content)
                _word_dict_list.append(word_type_dict)
        return _word_dict_list


    def get_sim_score(self, base_list, new_list):
        """
        获取相似度得分
        :param base_list:
        :param new_list:
        :return:
        """

        total_score = 0
        for j in new_list:
            if j in base_list:
                total_score += 1.0
            else:
                synonym_list = self.synonym.cal_item_sim(j)
                if synonym_list is None:
                    continue
                for syn_word in synonym_list:
                    if syn_word[0] in base_list:
                        total_score += float(syn_word[1])
#                         print('word {} - > syn_word{}  score: {}'.format(j,  syn_word, total_score))
                        break
        return float('%.2f' % (total_score /len(base_list)))

    def run(self):

        word_dict_list = self.read_all_file('./raw-data')
        count_index = 0
        base_item = self.create_word_dict(self.right_content)
        for word_type_name in self.word_type_name_list:
            print('-------------- {}---------------- '.format(word_type_name))
            tmp_item = sorted(list(base_item[word_type_name]))
            score_dict = self.get_score_dict()

            for item in word_dict_list:
                words = sorted(list(item[3][word_type_name]))
                sim_score = self.get_sim_score(tmp_item, words)

                print('学号:{}\t得分:{}\t 单词个数:{}\t不重复:{}\t相似度:{}\t {}个数: {}'.format(item[0], 
                                        score_dict[item[0]],  item[1], item[2], sim_score, word_type_name, len(words)))
                count_index += 1

In [38]:
_right_content = """It was a sunny day during last summer vacation. Ming practiced speaking English the whole morning. After that, he went to take piano lessons. Then his father took him to the art school to learn painting. Ming didn't have a rest until the evening. Unfortunately, he was so stressed out that he felt terrible. His parents sent him to hospital. And the doctor said Ming had a bad fever and should lie down and rest."""
featureExtract = FeatureExtract("./target/vec.txt", _right_content)
featureExtract.run()

print('------------------------------ end')

It was a sunny day during last summer vacation. Ming practiced speaking English the whole morning. After that, he went to take piano lessons. Then his father took him to the art school to learn painting. Ming didn't have a rest until the evening. Unfortunately, he was so stressed out that he felt terrible. His parents sent him to hospital. And the doctor said Ming had a bad fever and should lie down and rest.
	 ('VERB', {'was', 'sent', 'said', 'took', 'take', 'practiced', 'stressed', 'went', 'learn', 'have', 'had', 'lie', 'speaking', 'rest', 'felt'})
	 ('NOUN', {'art', 'evening', 'rest', 'father', 'vacation', 'hospital', 'fever', 'doctor', 'piano', 'morning', 'day', 'summer', 'lessons', 'school', 'parents', 'painting'})

 ./raw-data/184190001.json
word_count116  word_dis_count 74
	 ('VERB', {'was', 'taught', 'rest', 'starting', "'m", 'heip', 'come', 'let', 'practice', 'get', 'knowing', 'have', 'mean', 'had', 'founding', 'feel', 'taking'})
	 ('NOUN', {'location', 'v', 'song', 'hell', 't