# 文件地址管理

In [1]:
# TODO: later, set a working directory first and then read the data files from it.
# Training-set files (the picture-info path is left empty here).
news_info_train_path = './News_info_train_filter.txt'
pic_info_train_path = ''
news_pic_label_train_path = './News_pic_label_train.txt'

# Unlabeled-set files.
news_info_unlabel_path = './News_info_unlabel_filter.txt'
pic_info_unlabel_path = ''

# Validation-set files.
news_info_validate_path = './News_info_validate_filter.txt'
pic_info_validate_path = ''
# NOTE(review): "lavel" looks like a typo for "label"; the name is kept unchanged here.
news_pic_lavel_validate_path = ''

# 读取原始的文本数据

In [2]:
from instrument import read_text

# Load the raw text files with the project-local read_text helper.
# NOTE(review): presumably each result is a list of raw lines — confirm against instrument.read_text.
news_info_train = read_text(news_info_train_path)
news_pic_label_train = read_text(news_pic_label_train_path)
news_info_validate = read_text(news_info_validate_path)
news_info_unlabel = read_text(news_info_unlabel_path)

# 提取每条新闻文本的id, content, pic_list, label, match_pic_list, match_text

## 1、利用data2bunch函数对训练集、验证集、无标签数据的文本数据进行提取

In [3]:
from instrument import data2bunch

train_bunch = data2bunch(news_info_train, label=False)          # extract fields from the training set
validate_bunch = data2bunch(news_info_validate, label=False)    # extract fields from the validation set
train_unlabel_bunch = data2bunch(news_info_unlabel,label=False) # extract fields from the unlabeled set

## 2、利用data2bunch函数对训练集中文本和图片的标签进行提取

In [4]:
# Parse the news/picture label file of the training set (label=True, unlike the text files above).
train_label = data2bunch(news_pic_label_train,label=True)

## 3、进行文本与标签的配对

In [5]:
# Check that texts and labels are aligned by comparing their ids pairwise.
# A length mismatch is reported explicitly: indexing both lists by position
# would either raise IndexError or silently ignore the trailing labels.
if len(train_bunch.news_id) != len(train_label.news_pic_id):
    print('出现标签不匹配！')
    print(len(train_bunch.news_id), 'is not equal to', len(train_label.news_pic_id))
for text_id, label_id in zip(train_bunch.news_id, train_label.news_pic_id):
    if text_id != label_id:
        print('出现标签不匹配！')
        print(text_id, 'is not equal to', label_id)

# 中文分词

In [6]:
import instrument
# List the names exposed by the project-local `instrument` module.
dir(instrument)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'data2bunch',
 'perdict_bayes',
 'read_bunch',
 'read_image',
 'read_text',
 'save_bunch',
 'save_image',
 'save_text',
 'search_best_para_bayes',
 'text2words']

In [9]:
# Post-segmentation corrections passed to text2words as `postdict`: each key is a
# wrongly split, space-separated token sequence and each value is the text it is
# replaced with.
# NOTE(review): the '前任 3 ...' key below contains three consecutive spaces. The
# text2words defined in this notebook collapses whitespace runs to one space
# before applying postdict, so under that implementation the key cannot match —
# confirm against instrument.text2words.
# NOTE(review): some keys/values carry a leading space (e.g. the '妖猫 传' entry);
# verify that this is intentional.
postdict = {'基姆 霍根':'基姆•霍根',
            '中央 商场':'中央商场',
            '马克 龙':'马克龙',
            ' 惊天 魔 盗团':' 惊天魔盗团',
            '天将 雄狮':'天降雄狮',
            '天降 雄狮':'天降雄狮',
            '盗墓 笔记':'盗墓笔记',
            '功夫 瑜伽':'功夫瑜伽',
            '大闹 天竺':'大闹天竺',
            '人力 资源管理 师':'人力资源管理师',
            '微信 公众 号':'微信公众号',
            '星球大战 之 最后 的 绝地 武士':'星球大战之最后的绝地武士',
            '前任 3   再见 前任':'前任3:再见 前任',
            '二代 妖精':'二代妖精',
            '妖猫 传':' 妖猫传',
            '解忧 杂货店':'解忧杂货店',
            '机器 之血':'机器之血',
            '寻梦 环 游记':'寻梦环游记'
           }

In [52]:
def text2words(need_to_segment_text, method='jieba', ltp_model_path='', postdict=None, stop_words=None):
    '''
    Segment every news text into a string of space-separated words.

    Each text is converted from Traditional to Simplified Chinese, characters
    outside the kept set are replaced by spaces, the text is segmented, stop
    words are dropped, whitespace runs are collapsed to one space and finally
    the ``postdict`` replacements are applied to repair wrongly split terms.

    Parameters
    ----------
    need_to_segment_text : iterable of str
        Texts to segment.
    method : str
        Segmentation backend, 'jieba' or 'ltp'.
    ltp_model_path : str
        Local path of the LTP segmentation model; required when method='ltp'.
    postdict : dict or None
        Post-segmentation replacements, e.g.
        {'解 空间':'解空间','深度 优先':'深度优先'}. Keys are matched against
        the whitespace-collapsed result, so they should use single spaces.
    stop_words : collection of str or None
        Words removed from the segmentation result.

    Returns
    -------
    list of str
        One space-joined string per input text. An empty list is returned
        (after printing a warning banner) when ``method`` is unsupported or
        when method='ltp' is requested without a model path.
    '''
    import re

    postdict = {} if postdict is None else postdict
    # A set makes the per-word stop-word lookup O(1).
    stop_words = frozenset(() if stop_words is None else stop_words)

    use_jieba = method == 'jieba'
    use_ltp = method == 'ltp' and ltp_model_path != ''
    if not (use_jieba or use_ltp):
        fill_to_length = 130
        print(''.center(fill_to_length, '#'))
        print(' Method or model path is wrong! Please check it!!!! '.center(fill_to_length, '#'))
        print(''.center(fill_to_length, '#'))
        return []

    from langconv import Converter

    segmentor = None
    if use_jieba:
        import jieba
        # jieba.enable_parallel(4) would enable parallel segmentation (Linux only).
        cut = jieba.cut
        # Keep Chinese characters, ASCII letters, digits and '.'.
        unwanted = re.compile('[^\u4e00-\u9fa5a-zA-Z0-9.]')
    else:
        from pyltp import Segmentor
        segmentor = Segmentor()
        segmentor.load(ltp_model_path)
        # Segment with the loaded LTP model (jieba is not imported in this branch).
        cut = segmentor.segment
        # Same as above, but ':' is kept as well.
        unwanted = re.compile('[^\u4e00-\u9fa5a-zA-Z0-9.:]')

    whitespace_run = re.compile(r'\s+')
    text_words = []
    try:
        for sentence in need_to_segment_text:
            sentence = Converter('zh-hans').convert(sentence)   # Traditional -> Simplified Chinese
            content = unwanted.sub(' ', sentence)
            kept_words = [word for word in cut(content) if word not in stop_words]
            seg_sent = whitespace_run.sub(' ', ' '.join(kept_words))
            for key in postdict:
                # Repair terms that the segmenter split incorrectly.
                seg_sent = seg_sent.replace(key, postdict[key])
            text_words.append(seg_sent)
    finally:
        if segmentor is not None:
            segmentor.release()   # free the native LTP model

    return text_words

In [7]:
from instrument import text2words

In [11]:
# text2words(train_bunch.news_content[69:79],postdict=postdict)

## 1、对训练集中的新闻文本进行分词

In [12]:
train_content_words_jieba = text2words(train_bunch.news_content,postdict=postdict)    # jieba segmentation

# model_path = './NLP/ltp_data_v3.4.0/cws.model'   # LTP 3.4 segmentation model
# train_content_words_ltp = text2words(train_bunch.news_content,method='ltp',model_path=model_path) # LTP segmentation
# NOTE(review): the text2words defined in this notebook names this parameter `ltp_model_path`, not `model_path`.

In [13]:
# Number of segmented training texts.
len(train_content_words_jieba)
# train_content_words_jieba[:10]

48480

## 2、对验证集中的新闻文本进行分词 

In [14]:
validate_content_words_jieba = text2words(validate_bunch.news_content,postdict=postdict)   # jieba segmentation

# validate_content_words_ltp = text2words(validate_bunch.news_content,method='ltp',model_path=model_path) # LTP segmentation

In [15]:
# Number of segmented validation texts.
len(validate_content_words_jieba)
# validate_content_words_jieba[:10]

9696

## 3、对未标注文本进行分词

In [16]:
unlabel_news_words_jieba = text2words(train_unlabel_bunch.news_content,postdict=postdict)   # jieba segmentation

In [17]:
# Number of segmented unlabeled texts.
len(unlabel_news_words_jieba)
# unlabel_news_words_jieba[:10]

200000

## 4、训练集的标签匹配

In [18]:
# Re-check the text/label alignment and stop at the first mismatching pair.
# Lengths are compared first because pairwise iteration alone cannot detect
# missing or surplus labels.
if len(train_bunch.news_id) != len(train_label.news_pic_id):
    print(len(train_bunch.news_id),'is not equal with',len(train_label.news_pic_id))
for text_id, label_id in zip(train_bunch.news_id, train_label.news_pic_id):
    if text_id != label_id:
        print(text_id,'is not equal with',label_id)
        break
# Show which fields the validation bunch exposes.
print(dir(validate_bunch))

['news_content', 'news_id', 'news_pic_list']


## 5、将分词结果保存以方便后续使用

## 5.1、将训练集与验证集进行Bunch化 

In [19]:
# List the fields of the parsed training labels.
dir(train_label)

['news_pic_id', 'news_pic_label', 'news_pic_pic', 'news_pic_text']

In [20]:
# `sklearn.datasets.base` is no longer importable in newer scikit-learn
# releases; prefer the public location and fall back for old installations.
try:
    from sklearn.utils import Bunch
except ImportError:
    from sklearn.datasets.base import Bunch

# 1. Segmented training set: ids, jieba-segmented texts and picture lists.
TRAIN_BUNCH = Bunch(news_id=train_bunch.news_id,
                    news_words_jieba=train_content_words_jieba,
                    news_pic_list=train_bunch.news_pic_list)

# 2. Labels of the training news/picture pairs.
TRAIN_LABEL_BUNCH = Bunch(news_pic_id=train_label.news_pic_id,
                          news_pic_label=train_label.news_pic_label,
                          news_pic_pic=train_label.news_pic_pic,
                          news_pic_text=train_label.news_pic_text)

# 3. Segmented unlabeled set.
UNLABEL_NEWS_BUNCH = Bunch(news_id=train_unlabel_bunch.news_id,
                           news_words_jieba=unlabel_news_words_jieba,
                           news_pic_list=train_unlabel_bunch.news_pic_list)

# 4. Segmented validation set (ids, texts and picture lists).
VALIDATE_BUNCH = Bunch(news_id=validate_bunch.news_id,
                       news_words_jieba=validate_content_words_jieba,
                       news_pic_list=validate_bunch.news_pic_list)

In [25]:
import pandas as pd

# NOTE(review): despite its name, train_df is built from VALIDATE_BUNCH (the validation set).
train_df = pd.DataFrame.from_dict(VALIDATE_BUNCH).set_index('news_id')
# Preview the first ten rows.
train_df[:10]

Unnamed: 0_level_0,news_pic_list,news_words_jieba
news_id,Unnamed: 1_level_1,Unnamed: 2_level_1
D0048481,P0351731.JPEG;P0351732.JPEG;P0351733.JPEG;P035...,历代 名家 书法 对联 过年 写 春联 再也 不求人
D0048482,P0351764.JPEG\n,古代 街头 表演 人多 拥挤 怎么办 梅花拳 竟想 出 此法 武术 在 广大 民间 具有 深...
D0048483,NULL\n,风情万种 你 要 哪种 风情万种 你 要 哪种
D0048484,P0351765.JPEG;P0351766.JPEG;P0351767.JPEG;P035...,今天 请 把 这首 从 此刻 起 我要 读 3 遍 今天 把 这首 从 此刻 起 我要 强烈...
D0048485,P0351769.JPEG\n,重磅 江西 这家 省直 单位 正式 挂牌 成立 2 月 8 日 上午 省委 军民 融合 发展...
D0048486,NULL\n,2018 华南 师范大学 网络 教育 音乐学 师范 本科 介绍 华南 师范大学 网络 教育 ...
D0048487,P0351770.JPEG;P0351771.JPEG;P0351772.JPEG;P035...,寒假 集结 号 尚翔 篮球 集训营 火热 招生 中 篮球 不仅 是 一项 体育运动 它 是...
D0048488,P0351780.JPEG;P0351781.JPEG;P0351782.JPEG;P035...,宝 夜读 森林 里 的 陌生 来客 生命 教育 欢迎 大家 进店 选购 乐 乐趣 图书 销...
D0048489,P0351789.JPEG;P0351790.JPEG;P0351791.JPEG;P035...,学会 这 几招 保证 节前 综合症 不再 折磨 你 2018 年 的 春节 就 这样 猝不及...
D0048490,P0351796.JPEG;P0351797.JPEG;P0351798.JPEG;P035...,惊呆 昨天 有 100 位 HR 当众 表白 情人节 中奖 名单 公布 是 时候 揭晓 小 ...


## 5.2、将训练集和验证集保存至本地指定的目录

In [22]:
from instrument import save_bunch

# Persist each Bunch under ./data_bunch/ with the project-local save_bunch helper.
# Segmented training set.
train_bunch_path = './data_bunch/non_stop_words_train_bunch.dat'
save_bunch(train_bunch_path,TRAIN_BUNCH)

# Training labels.
train_label_bunch_path = './data_bunch/train_label_bunch.dat'
save_bunch(train_label_bunch_path,TRAIN_LABEL_BUNCH)

# Segmented unlabeled set.
unlabel_news_bunch_path = './data_bunch/non_stop_words_unlabel_news_bunch.dat'
save_bunch(unlabel_news_bunch_path,UNLABEL_NEWS_BUNCH)

# Segmented validation set.
validate_bunch_path = './data_bunch/non_stop_words_validate_bunch.dat'
save_bunch(validate_bunch_path,VALIDATE_BUNCH)

## 6、分词后结果处理（专业名词替换）

In [None]:
# Demo: segment one sentence with LTP, then repair wrongly split terms with a
# small replacement dictionary.
from pyltp import Segmentor
# NOTE(review): absolute, machine-specific path — point it at the local LTP model before running.
model_path = 'E:/Desktop/ZhuFei/Competition/NLP/ltp_data_v3.4.0/cws.model'   # LTP 3.4 segmentation model
segmentor = Segmentor()
segmentor.load(model_path)
try:
    words = segmentor.segment('在包含问题的所有解的解空间树中，按照深度优先搜索的策略，从根节点出发深度搜索解空间树。')
    seg_sent = ' '.join(words)
finally:
    segmentor.release()   # free the native model once the sentence is segmented
print(seg_sent)
# Use a demo-local name so the notebook-level `postdict` defined earlier is not overwritten.
demo_postdict = {'解 空间':'解空间','深度 优先':'深度优先'}
for key in demo_postdict:
    seg_sent = seg_sent.replace(key, demo_postdict[key])
print(seg_sent)

# 创建词向量空间

# 将储存训练集和验证集分词的Bunch对象读入内存，用以进行模型训练 

In [1]:
# read bunch object
from instrument import read_bunch

# Locations of the Bunch files to load.
train_bunch_path = './data_bunch/non_stop_words_train_bunch.dat'
train_label_bunch_path = './data_bunch/train_label_bunch.dat'
validate_bunch_path = './data_bunch/non_stop_words_validate_bunch.dat'

# Load them back into memory with the project-local read_bunch helper.
train_bunch = read_bunch(train_bunch_path)
train_label_bunch = read_bunch(train_label_bunch_path)
validate_bunch = read_bunch(validate_bunch_path)

In [2]:
# `sklearn.datasets.base` is no longer importable in newer scikit-learn
# releases; prefer the public location and fall back for old installations.
try:
    from sklearn.utils import Bunch
except ImportError:
    from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

## 1、构建训练集的 TF-IDF 词向量空间对象

In [3]:
# train_bunch.news_words_jieba[:10]

In [4]:
# Fit a TF-IDF vectorizer on the jieba-segmented training texts and bundle
# ids, labels, the term-document matrix and the learned vocabulary together.
train_vectorizer = TfidfVectorizer(max_features=350000)
tfidf_train = Bunch(
    Id=train_bunch.news_id,
    Label=train_label_bunch.news_pic_label,
    tdm=train_vectorizer.fit_transform(train_bunch.news_words_jieba),
    vocabulary=train_vectorizer.vocabulary_,
)

In [5]:
# Size of the vocabulary learned from the training set.
len(tfidf_train.vocabulary)

350000

## 2、构建验证集的 TF-IDF 词向量空间对象

In [6]:
# Vectorize the validation texts with the vectorizer fitted on the training set.
# Fitting a second TfidfVectorizer on the validation texts (even with the
# training vocabulary) recomputes the IDF weights from validation data, so the
# features would be weighted differently from those the model is trained on.
validate_vectorizer = train_vectorizer   # old name kept as an alias
tfidf_validate = Bunch(Id=validate_bunch.news_id, tdm=[], vocabulary={})
tfidf_validate.vocabulary = tfidf_train.vocabulary
tfidf_validate.tdm = validate_vectorizer.transform(validate_bunch.news_words_jieba)        # jieba segmentation result

In [7]:
# Vocabulary size (tfidf_validate.vocabulary was assigned this same object).
len(tfidf_train.vocabulary)

350000

# 将训练数据划分为训练集和测试集

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labeled training data as a local test split; the fixed
# random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(tfidf_train.tdm,
                                                    tfidf_train.Label,
                                                    test_size=0.3,
                                                    random_state=33)

# 构建模型

In [9]:
from sklearn.metrics import classification_report

## 1、Naive Bayes

### 1、模型测试

In [10]:
### Multinomial Naive Bayes Classifier
def classifier_naive_bayes(x_data, y_labels, alpha=0.1453):
    """Fit and return a multinomial Naive Bayes classifier.

    Parameters
    ----------
    x_data : feature matrix passed to ``MultinomialNB.fit``.
    y_labels : target labels, one per row of ``x_data``.
    alpha : float
        Smoothing parameter forwarded to ``MultinomialNB``. Defaults to
        0.1453, the value that was previously hard-coded.

    Returns
    -------
    The fitted ``MultinomialNB`` model.
    """
    from sklearn.naive_bayes import MultinomialNB
    model = MultinomialNB(alpha=alpha)
    model.fit(x_data, y_labels)
    return model
# Train on the local training split.
model_naive_bayes = classifier_naive_bayes(x_train, y_train)
# NOTE(review): the message says "training data", but the score is computed on the held-out x_test/y_test split.
print('The accuracy of classifying training data with Naive Bayes is :',
      model_naive_bayes.score(x_test, y_test))
# Per-class precision/recall/F1 on the same held-out split.
print(classification_report(y_test,model_naive_bayes.predict(x_test)))

The accuracy of classifying training data with Naive Bayes is : 0.6613036303630363
             precision    recall  f1-score   support

          0       0.67      0.79      0.73      7192
          1       0.47      0.03      0.06      2357
          2       0.65      0.77      0.71      4995

avg / total       0.63      0.66      0.61     14544



### 2、验证集预测

In [11]:
# Refit on the full labeled training data, then predict labels for the validation set.
model_naive_bayes = classifier_naive_bayes(tfidf_train.tdm,tfidf_train.Label)
predict_naive_bayes = model_naive_bayes.predict(tfidf_validate.tdm)

In [12]:
# 7. Build the submission lines: news id, predicted label and two 'NULL'
#    placeholder columns, separated by tabs.
bayes_text = ['NULL'] * len(validate_bunch.news_id)

label_predict = predict_naive_bayes
bayes_result = [
    # str() keeps the join valid even if the predicted labels are not plain strings.
    '\t'.join((news_id, str(label), filler, filler))
    for news_id, label, filler in zip(validate_bunch.news_id, label_predict, bayes_text)
]

In [13]:
from instrument import save_text

# Write the prediction lines to the submission file with the project-local save_text helper.
save_path = './submittion/result_bayes.txt'
save_text(save_path, bayes_result)

In [None]:
# Load the data
from instrument import read_bunch
# `sklearn.datasets.base` is no longer importable in newer scikit-learn
# releases; prefer the public location and fall back for old installations.
try:
    from sklearn.utils import Bunch
except ImportError:
    from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

# load data from local
train_bunch_path = './data_bunch/train_bunch.dat'
validate_bunch_path = './data_bunch/validate_bunch.dat'
train_bunch = read_bunch(train_bunch_path)
validate_bunch = read_bunch(validate_bunch_path)

# Build the word-vector space
stop_words_list = None
max_df = 0.7

# create TF-IDF words vector space with train data
# NOTE(review): this reads news_pic_label from train_bunch itself — confirm that
# the bunch stored at train_bunch_path carries the labels.
tfidf_train = Bunch(Id=train_bunch.news_id, Label=train_bunch.news_pic_label, tdm=[], vocabulary={})
train_vectorizer = TfidfVectorizer(stop_words=stop_words_list, sublinear_tf=True, max_df=max_df)
tfidf_train.tdm = train_vectorizer.fit_transform(train_bunch.news_words_jieba)                # jieba segmentation result
tfidf_train.vocabulary = train_vectorizer.vocabulary_

# create TF-IDF words vector space with validate data
# The validation texts are transformed with the vectorizer fitted on the
# training set. Fitting a second vectorizer on them would recompute the IDF
# weights from validation data and weight the features differently from those
# the model is trained on.
tfidf_validate = Bunch(Id=validate_bunch.news_id, tdm=[], vocabulary={})
tfidf_validate.vocabulary = tfidf_train.vocabulary
validate_vectorizer = train_vectorizer   # old name kept as an alias
tfidf_validate.tdm = validate_vectorizer.transform(validate_bunch.news_words_jieba)        # jieba segmentation result

# Split the data into a training and a test set
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(tfidf_train.tdm,
                                                    tfidf_train.Label,
                                                    test_size=0.3,
                                                    random_state=33)

# 构建模型
from sklearn.metrics import classification_report