In [21]:
# coding: utf-8

import sys
from collections import Counter

import numpy as np
import tensorflow.keras as kr
import os
from tqdm import tqdm


In [10]:

base_dir = 'data/cnews'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')


# 打开文件
def open_file(filename, mode='r'):
    """
    mode: 'r' or 'w' for read or write
    """
    return open(filename, mode, encoding='utf-8', errors='ignore')


def read_file(filename):
    """读取文件数据"""
    contents, labels = [], []
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if content:
                    contents.append(list(content))
                    labels.append(label)
            except:
                pass
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """根据训练集构建词汇表，存储"""
    data_train, _ = read_file(train_dir)

    all_data = []
    for content in data_train:
        all_data.extend(content)

    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # 添加一个 <PAD> 来将所有文本pad为同一长度
    words = ['<PAD>'] + list(words)
    open_file(vocab_dir, mode='w').write('\n'.join(words) + '\n')


if not os.path.exists(vocab_dir):  # 如果不存在词汇表，重建
    build_vocab(train_dir, vocab_dir, 5000)


def read_category():
    """读取分类目录，固定"""
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']

    categories = [x for x in categories]

    cat_to_id = dict(zip(categories, range(len(categories))))

    return categories, cat_to_id


def read_vocab(vocab_dir):
    """读取词汇表"""
    # words = open_file(vocab_dir).read().strip().split('\n')
    with open_file(vocab_dir) as fp:
        # 如果是py2 则每个值都转化为unicode
        words = [_.strip() for _ in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id




生成字典表和label字典表方法

In [18]:
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)


可以看一下label字典表和文章字典表

In [13]:
cat_to_id

{'体育': 0,
 '财经': 1,
 '房产': 2,
 '家居': 3,
 '教育': 4,
 '科技': 5,
 '时尚': 6,
 '时政': 7,
 '游戏': 8,
 '娱乐': 9}

In [19]:
word_to_id

{'<PAD>': 0,
 '，': 1,
 '的': 2,
 '。': 3,
 '一': 4,
 '是': 5,
 '在': 6,
 '0': 7,
 '有': 8,
 '不': 9,
 '了': 10,
 '中': 11,
 '1': 12,
 '人': 13,
 '大': 14,
 '、': 15,
 '国': 16,
 '': 3903,
 '2': 18,
 '这': 19,
 '上': 20,
 '为': 21,
 '个': 22,
 '“': 23,
 '”': 24,
 '年': 25,
 '学': 26,
 '时': 27,
 '我': 28,
 '地': 29,
 '和': 30,
 '以': 31,
 '到': 32,
 '出': 33,
 '来': 34,
 '会': 35,
 '行': 36,
 '发': 37,
 '：': 38,
 '对': 39,
 '们': 40,
 '要': 41,
 '生': 42,
 '家': 43,
 '他': 44,
 '能': 45,
 '也': 46,
 '业': 47,
 '金': 48,
 '3': 49,
 '成': 50,
 '可': 51,
 '分': 52,
 '多': 53,
 '现': 54,
 '5': 55,
 '就': 56,
 '场': 57,
 '新': 58,
 '后': 59,
 '于': 60,
 '下': 61,
 '日': 62,
 '经': 63,
 '市': 64,
 '前': 65,
 '过': 66,
 '方': 67,
 '得': 68,
 '作': 69,
 '月': 70,
 '最': 71,
 '开': 72,
 '房': 73,
 '》': 74,
 '《': 75,
 '高': 76,
 '9': 77,
 '8': 78,
 '.': 79,
 '而': 80,
 '比': 81,
 '公': 82,
 '4': 83,
 '说': 84,
 ')': 85,
 '将': 86,
 '(': 87,
 '都': 88,
 '资': 89,
 'e': 90,
 '6': 91,
 '基': 92,
 '用': 93,
 '面': 94,
 '产': 95,
 '还': 96,
 '自': 97,
 '者': 98,
 '本': 99,
 '之':

In [22]:
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """将文件转换为id表示"""
    contents, labels = read_file(filename)

    data_id, label_id = [], []
    for i in tqdm(range(len(contents))):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])

    # 使用keras提供的pad_sequences来将文本pad为固定长度

    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # 将标签转换为one-hot表示

    return x_pad, y_pad

x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, 600)

100%|█████████████████████████████████████████████████████████████████████████| 50000/50000 [00:04<00:00, 10155.51it/s]


将中文文本、标签转换为训练x集和y集

看一下其中一篇文章

In [24]:
x_train[0]

array([1609,  659,   56,    8,   14, 1190,    1,  108, 1135,  121,  244,
       1564,   20,  951,    2,  977,  851,  194,  165,    8,  264,   32,
        330,  464,  900, 1478,    3,   61,  951,   91,  164,    1,  143,
        157,  244, 1296,  271,  977,  851,   57,   27,    1,   14, 1190,
        167,   63,   61,   10,  385,   22,  122,   27,    1,   80,  505,
       1055, 1342,  165,    8,  886,   61,   34,    2,  215,  730,    3,
       1551,  205,  538,    4,  538,    2,  608,  144,    1,  157,  244,
         72,  404,   10,  143,  125,   61,  951,    2,  644,   36,  977,
        851,    1,   18,   55,   52,  883,   66,  202,   10,    1,  125,
        405,  165,    8,  330,  464,  490,  121,    2, 1278,  554,    1,
         21,   10,  232,  797,  157,  200,   40,    1,   16,  725,  244,
        526,  126,   11,  853,  143,  125,    2,  977,  851,    1,  117,
        244,  371,  534, 1404,  267, 1070,  832,    3,    6, 1190,   11,
        977,  851,   39,  589,  157,  244,   34,   

In [25]:
y_train[0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

都是numpy数组的形式，且转换为数字。
其中y集用onehot形式表示

In [28]:
import tensorflow as tf 

_x = tf.placeholder(shape=[None, 600], dtype=tf.float32, name="x")
_y = tf.placeholder(shape=[None, 600], dtype=tf.float32, name="y")
z = tf.add(_x, _y);

with tf.Session() as sess:
    result = sess.run(z, feed_dict={_x: [x_train[0]], _y: [x_train[1]]})
    print(result)


[[1.611e+03 7.600e+02 7.200e+01 2.520e+02 1.317e+03 1.369e+03 5.360e+02
  1.380e+02 1.138e+03 1.270e+02 2.930e+02 1.626e+03 3.100e+01 1.902e+03
  4.000e+00 1.260e+03 1.227e+03 2.460e+02 1.047e+03 7.680e+02 2.840e+02
  3.300e+01 3.490e+02 5.210e+02 9.110e+02 1.579e+03 4.250e+02 3.130e+02
  9.970e+02 3.210e+02 2.989e+03 1.150e+02 1.760e+02 3.640e+02 8.200e+02
  1.297e+03 3.890e+02 3.169e+03 2.606e+03 9.700e+01 3.100e+01 2.100e+02
  3.660e+02 1.240e+03 4.250e+02 3.820e+02 6.400e+01 1.740e+02 8.560e+02
  8.800e+01 4.140e+02 3.800e+01 2.000e+00 2.790e+02 1.469e+03 1.335e+03
  1.381e+03 1.790e+02 5.100e+01 8.920e+02 1.640e+02 1.350e+02 1.800e+01
  4.590e+02 8.110e+02 2.150e+02 1.562e+03 3.320e+02 5.920e+02 3.700e+01
  5.720e+02 4.000e+00 2.365e+03 1.086e+03 2.160e+02 9.840e+02 2.450e+02
  3.770e+02 1.567e+03 1.120e+02 2.190e+02 5.960e+02 1.960e+02 9.540e+02
  1.110e+02 1.904e+03 3.800e+01 1.249e+03 9.020e+02 3.100e+01 4.420e+02
  1.386e+03 5.300e+01 1.032e+03 7.100e+01 2.080e+02 2.400e+01 4.

通过tensorflow的图，会话等操作实现了一次向量的加法

学习链接https://blog.csdn.net/qq_37791134/article/details/81712772

准确率 和召回率 https://www.jianshu.com/p/4434ea11c16c

roc曲线 http://charleshm.github.io/2016/03/Model-Performance/