# 文件地址管理

In [1]:
news_info_train_path = './News_info_train_filter.txt'
pic_info_train_path = ''
news_pic_label_train_path = './News_pic_label_train.txt'

news_info_unlabel_path = './News_info_unlabel_filter.txt'
pic_info_unlabel_path = ''

news_info_validate_path = './News_info_validate_filter.txt'
pic_info_validate_path = ''
news_pic_label_validate_path = ''

ltp_model_path = './NLP/ltp_data_v3.4.0/cws.model'   # Ltp 3.4 分词模型库

# 读取原始数据（返回的数据是一个列表,其中每一条数据包括id, content, picture list or text）

In [8]:
from instrument import read_text

news_info_train = read_text(news_info_train_path)                     # 已标注训练集
news_pic_label_train = read_text(news_pic_label_train_path)           # 已标注训练集对应的标签
news_info_validate = read_text(news_info_validate_path)               # 未标注验证集（利用训练好的模型进行标注）
news_info_unlabel = read_text(news_info_unlabel_path)                 # 未标注数据集

# 提取每条新闻文本的id, content, pic_list, label, match_pic_list, match_text

## 1、利用data2bunch函数对训练集、验证集、无标签数据的文本数据进行提取

In [10]:
from instrument import data2bunch

train_bunch = data2bunch(news_info_train, label=False)          # 对训练集数据进行提取
validate_bunch = data2bunch(news_info_validate, label=False)    # 对验证集数据进行提取
train_unlabel_bunch = data2bunch(news_info_unlabel,label=False) # 对无标签数据集进行数据提取

## 2、利用data2bunch函数对训练集中文本和图片的标签进行提取

In [12]:
train_label = data2bunch(news_pic_label_train,label=True)

## 3、进行文本与标签的配对

In [21]:
# 判断判断文本的id与标签的id是否相同来判断文本与标签是否匹配
for i in range(len(train_bunch.news_id)):
    if train_bunch.news_id[i] != train_label.news_pic_id[i]:
        print('出现标签不匹配！')
        print(train_bunch.news_id[i],'is not equal to',train_label.news_pic_id[i])
    else:
        pass

# 中文分词

##### 有待改进：1、选择保留哪些标点符号的方法不是很合理；2、某种符号可能出现一串的存在的情况，只要要保存一个即可；3、未取出停止词便统计句子的长度；4、ltp 分词方法中可解决先去除停止词再计算句子长度（先分词 -> 不带停止词的词列表 -> 计算句长 -> ' '.join(…)）

In [130]:
def text2words_v2(need_to_segment_text, method='jieba', ltp_model_path='',postdict={},stop_words=[]):
    '''
        该函数实现对每条新闻文本进行分词（分词方法包括jieba和ltp）,并将繁体字替换为简体字
    1、need_to_segment_text 为需要进行分词的文本列表
    2、method 为中文分词方法，可选 jieba 或 ltp
    3、ltp_model_path 为选择ltp分词方法时，ltp 分词模型在本地的目录地址
    4、postdist 数据格式：
        postdict = {'解 空间':'解空间','深度 优先':'深度优先'}
    5、计算出每条句子中所包含的字、词和某些符号的总数，作为该条句子的总长度，最终返回句子长度的列表
    
    '''
    from langconv import Converter
    import re
    text_words = []
    sentence_len = []            # 记录句子的长度（按分词后句子所含字或词的个数计算）

    if method == 'jieba':
        import jieba
#         jieba.enable_parallel(4)             # 多线程分词，仅支持 Linux
        for sentence in need_to_segment_text:
#             print(sentence)                       # 查看新闻原文
            sentence = Converter('zh-hans').convert(sentence)             # 将繁体中文转换为简体中文
#             content = re.sub('[^\u4e00-\u9fa5a-zA-Z0-9.]',' ', sentence)     # 将中文、大小写字母和数字外的字符全替换为空格
            content = re.sub("[^\u4e00-\u9fa5a-zA-Z0-9|，。？！、；：“”‘’（）【】{}……《》%,.?!;:'()[]-——/&]",' ', sentence)
            content = re.sub('\s+',' ',content)
            content = re.sub('//\s+(//)?','/',content)
            word_list = [word for word in jieba.cut(content) if word not in stop_words]  # 分词，去停止词
            sentence_len.append(len(word_list))             # 记录该句话的长度
            seg_sent = ' '.join(word_list)  # 去除停止词
            seg_sent = re.sub('\s+',' ',seg_sent)
            for key in postdict:
                seg_sent = seg_sent.replace(key,postdict[key])    # 在分词后处理某些被分错的词和词语
            text_words.append(seg_sent)
    elif method == 'ltp' and ltp_model_path != '':
        from pyltp import Segmentor
        #         model_path = 'E:/Desktop/ZhuFei/Competition/NLP/ltp_data_v3.4.0/cws.model'   # Ltp 3.4 分词模型库
        segmentor = Segmentor()   # 实例化分词模块
        segmentor.load(ltp_model_path)  # 加载分词库
        for sentence in need_to_segment_text:
            #             print(sentence)                         # 查看新闻原文
            sentence = Converter('zh-hans').convert(sentence)     # 将繁体中文转换为简体中文
            content = re.sub("[^\u4e00-\u9fa5a-zA-Z0-9|，。？！、；：“”‘’（）【】{}……《》%,.?!;:'()[]-——/&]",' ', sentence)
            content = re.sub('\s+','',content)
            content = re.sub('//\s+(//)?','/',content)
            word_list = [word for word in segmentor.cut(content) if word not in stop_words]
            sentence_len.append(len(word_list))                       # 记录该句话的长度
            seg_sent = ' '.join(word_list)  # 去除停止词
            seg_sent = re.sub('\s+',' ',seg_sent)
            for key in postdict:
                seg_sent = seg_sent.replace(key,postdict[key])    # 在分词后处理某些被分错的词和词语
            text_words.append(seg_sent) # 去除停止词
    else:
        fill_to_length = 130
        print(''.center(fill_to_length, '#'))
        print(' Method or model path is wrong! Please check it!!!! '.center(fill_to_length, '#'))
        print(''.center(fill_to_length, '#'))

    return [text_words,sentence_len]

In [131]:
postdict = {'基姆 霍根':'基姆•霍根',
            '中央 商场':'中央商场',
            '马克 龙':'马克龙',
            ' 惊天 魔 盗团':' 惊天魔盗团',
            '天将 雄狮':'天降雄狮',
            '天降 雄狮':'天降雄狮',
            '盗墓 笔记':'盗墓笔记',
            '功夫 瑜伽':'功夫瑜伽',
            '大闹 天竺':'大闹天竺',
            '人力 资源管理 师':'人力资源管理师',
            '微信 公众 号':'微信公众号',
            '星球大战 之 最后 的 绝地 武士':'星球大战之最后的绝地武士',
            '前任 3   再见 前任':'前任3:再见 前任',
            '二代 妖精':'二代妖精',
            '妖猫 传':' 妖猫传',
            '解忧 杂货店':'解忧杂货店',
            '机器 之血':'机器之血',
            '寻梦 环 游记':'寻梦环游记'
           }

## 1、对训练集中的新闻文本进行分词

In [132]:
train_content_words_jieba, train_content_sentence_length = text2words_v2(train_bunch.news_content)    # jieba 分词

# model_path = './NLP/ltp_data_v3.4.0/cws.model'   # Ltp 3.4 分词模型库
# train_content_words_ltp = text2words(train_bunch.news_content,method='ltp',model_path=model_path) # ltp 分词

In [127]:
print(train_content_words_jieba)
print(train_content_sentence_length)

## 2、对验证集中的新闻文本进行分词 

In [134]:
validate_content_words_jieba, validate_content_sentence_length = text2words_v2(validate_bunch.news_content)   # jieba 分词 

# validate_content_words_ltp = text2words(validate_bunch.news_content,method='ltp',model_path=model_path) # ltp 分词

## 3、对未标注文本进行分词

In [142]:
unlabel_news_words_jieba, unlabel_news_sentence_length = text2words_v2(train_unlabel_bunch.news_content)   # jieba 分词 

## 4、训练集的标签匹配

In [None]:
for i in range(len(train_bunch.news_id)):
    if train_bunch.news_id[i]!=train_label.news_pic_id[i]:
        print(train_bunch.news_id[i],'is not equal with',train_label.news_pic_id[i])
        break
    else:
        pass
#         print('ID is correct!')
dir(train_bunch)
dir(train_label)
print(dir(validate_bunch))

##### 到此为止，上述代码完成了如下任务：将文本数据读入后按行保存为列表(中文文本数据和标签文本数据) -> 将中文文本数据进行分词保存为列表（按特征对数据进行分离；标点符号保留、繁体替换、多余符号仅保留一个 ）

## 5、对标签进行独热编码

In [194]:
import pandas as pd
import numpy as np

In [196]:
label_one_hot = np.array(pd.get_dummies(train_label.news_pic_label)).tolist()

# 5、将分词结果保存以方便后续使用

## 5.1、将训练集与验证集进行Bunch化 

In [206]:
from sklearn.datasets.base import Bunch

# 1、将训练集的分词结果保存为本地的一个 Bunch 对象
TRAIN_BUNCH = Bunch(news_id=[],news_words_jieba=[],news_length=[],news_pic_list=[])
TRAIN_BUNCH.news_id = train_bunch.news_id
TRAIN_BUNCH.news_words_jieba = train_content_words_jieba
TRAIN_BUNCH.news_length = train_content_sentence_length
TRAIN_BUNCH.news_pic_list = train_bunch.news_pic_list

# 2、将训练集新闻和图片对应的标签保存为本地的一个 Bunch 对象
TRAIN_LABEL_BUNCH = Bunch(news_pic_id=[],news_pic_label=[],news_pic_label_one_hot=[],news_pic_pic=[],news_pic_text=[])
TRAIN_LABEL_BUNCH.news_pic_id = train_label.news_pic_id
TRAIN_LABEL_BUNCH.news_pic_label = train_label.news_pic_label
TRAIN_LABEL_BUNCH.news_pic_label_one_hot = label_one_hot
TRAIN_LABEL_BUNCH.news_pic_pic = train_label.news_pic_pic
TRAIN_LABEL_BUNCH.news_pic_text = train_label.news_pic_text

# 3、将未标注数据集的分词结果保存为本地的一个 Bunch 对象
UNLABEL_NEWS_BUNCH = Bunch(news_id=[],news_words_jieba=[],news_length=[],news_pic_list=[])
UNLABEL_NEWS_BUNCH.news_id = train_unlabel_bunch.news_id
UNLABEL_NEWS_BUNCH.news_words_jieba = unlabel_news_words_jieba
UNLABEL_NEWS_BUNCH.news_length = unlabel_news_sentence_length
UNLABEL_NEWS_BUNCH.news_pic_list = train_unlabel_bunch.news_pic_list

# 4、将验证集（contain id, content and pictures list）保存为一个本地的 Bunch 对象
VALIDATE_BUNCH = Bunch(news_id=[],news_words_jieba=[],news_length=[],news_pic_list=[])
VALIDATE_BUNCH.news_id = validate_bunch.news_id
VALIDATE_BUNCH.news_words_jieba = validate_content_words_jieba
VALIDATE_BUNCH.news_length = validate_content_sentence_length
VALIDATE_BUNCH.news_pic_list = validate_bunch.news_pic_list

In [223]:
# import pandas as pd

# train_df = pd.DataFrame.from_dict(VALIDATE_BUNCH).set_index('news_id')
# # train_df = pd.DataFrame.from_dict(TRAIN_LABEL_BUNCH).set_index('news_pic_id')
# train_df[:100]

## 5.2、将 Bunch 对象保存至本地指定的目录

In [216]:
from instrument import save_bunch

train_bunch_path = './data_bunch/cnn_non_stop_words_train_bunch.dat'
save_bunch(train_bunch_path,TRAIN_BUNCH)

train_label_bunch_path = './data_bunch/cnn_train_label_bunch.dat'
save_bunch(train_label_bunch_path,TRAIN_LABEL_BUNCH)

unlabel_news_bunch_path = './data_bunch/cnn_non_stop_words_unlabel_news_bunch.dat'
save_bunch(unlabel_news_bunch_path,UNLABEL_NEWS_BUNCH)

validate_bunch_path = './data_bunch/cnn_non_stop_words_validate_bunch.dat'
save_bunch(validate_bunch_path,VALIDATE_BUNCH)

# TextCnn模型

# 1、前向传播

##### embedding layer -> convolutional layer -> max pooling layer -> full connecton layer -> full connecton layer -> softmax layer

In [224]:
from instrument import read_bunch

train_data = read_bunch(FLAGS.training_data_file)

NameError: name 'FLAGS' is not defined

In [225]:
len(train_content_words_jieba)

48480

In [226]:
from tensorflow.contrib import learn

In [229]:
max_sentence_length = 22502
vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_length)

In [233]:
np.array(list(vocab_processor.fit_transform(train_content_words_jieba[:10]))).shape

(10, 22502)