In [3]:
import jieba
import re
from tqdm import tqdm
import numpy as np
import pickle

# Directory prefix for every data file in this notebook. The functions below
# build file names by plain string concatenation (path + name), so the
# trailing slash is required.
path = '../data/'

In [8]:
def process_lines(in_name, prefix):
    """Split a tab-separated QA file into question, answer and label files.

    Each input line is expected to hold ``question<TAB>answer<TAB>label``.
    A line that does not split into exactly three fields (after stripping
    surrounding whitespace) is skipped. From the question and the answer only
    characters in the range U+4E00..U+9FA5 are kept; everything else is
    dropped.

    Files written, all under the module-level ``path``:
      * ``<prefix>q_out.csv`` -- one filtered question per line
      * ``<prefix>a_out.csv`` -- one filtered answer per line
      * ``<prefix>label.npy`` -- array holding 1 where the label field is
        exactly ``"1"`` and 0 otherwise

    Also prints the number of lines that were kept.

    Args:
        in_name: input file name, relative to ``path``.
        prefix: string prepended to each output file name.
    """
    # Compiled once up front: the pattern is the same for every line.
    han_run = re.compile(u"[\u4e00-\u9fa5]+")
    out_label = []

    # Context managers close all three handles even if a read or write
    # raises part-way through the file.
    with open(path + in_name, 'r', encoding='utf-8') as f, \
            open(path + prefix + 'q_out.csv', 'w', encoding='utf-8') as out_question, \
            open(path + prefix + 'a_out.csv', 'w', encoding='utf-8') as out_answer:
        for line in tqdm(f):
            l_ar = line.strip().split("\t")
            if len(l_ar) != 3:
                continue
            question, answer, _label = l_ar

            # findall returns the matching runs; joining them removes
            # everything that lies between the runs.
            out_question.write("".join(han_run.findall(question)) + "\n")
            out_answer.write("".join(han_run.findall(answer)) + "\n")
            out_label.append(1 if _label == "1" else 0)

    np.save(path + prefix + 'label.npy', np.array(out_label))
    # Every kept line appends exactly one label, so this is the kept-line count.
    print('processed', len(out_label), 'lines')

In [10]:
# Build the question/answer/label files for both the training and the
# validation split.
for raw_file, out_prefix in (
    ('train-set.data', 'train_'),
    ('validation-set.data', 'val_'),
):
    process_lines(raw_file, out_prefix)

264416it [00:02, 115343.60it/s]


processed 264416 lines


39997it [00:00, 44996.75it/s]


processed 39997 lines


In [4]:
def segmentation(in_name, out_name):
    """Word-segment a text file line by line with jieba.

    Every input line is stripped of surrounding whitespace, passed to
    ``jieba.cut``, and written out as the resulting tokens joined by single
    spaces, followed by a newline. The output therefore has exactly one line
    per input line.

    Args:
        in_name: input file name, relative to the module-level ``path``.
        out_name: output file name, relative to ``path``.
    """
    # Context managers close both handles even if segmentation or a write
    # raises in the middle of the file.
    with open(path + in_name, 'r', encoding='utf-8') as in_data, \
            open(path + out_name, 'w', encoding='utf-8') as out_data:
        for line in tqdm(in_data):
            seg_content = jieba.cut(line.strip())
            out_data.write(" ".join(seg_content) + "\n")

In [16]:
# Segment the question and answer files of both splits.
for src_file, dst_file in (
    ('train_q_out.csv', 'train_q_seg.csv'),
    ('train_a_out.csv', 'train_a_seg.csv'),
    ('val_q_out.csv', 'val_q_seg.csv'),
    ('val_a_out.csv', 'val_a_seg.csv'),
):
    segmentation(src_file, dst_file)

264416it [00:23, 11098.62it/s]
264416it [00:41, 6379.00it/s]
39997it [00:02, 16354.95it/s]
39997it [00:07, 5501.17it/s]


In [4]:
def train_word2vec():
    """Build the training corpus file and fit a Word2Vec model on it.

    Steps:
      1. Concatenate ``train_q_seg.csv`` and ``train_a_seg.csv`` (both under
         the module-level ``path``) into ``train_seg.csv``. For the question
         file, a line identical to the line directly before it is dropped;
         answer lines are all kept.
      2. Train ``gensim`` Word2Vec on ``train_seg.csv`` using
         ``config.features`` as the vector dimensionality,
         ``config.min_count`` and ``config.num_workers``.
      3. Save the model to ``word2vec.model`` under ``path``.

    Prints ``word seg processed`` once the corpus file has been written.
    """
    # Imported locally so the function works regardless of which notebook
    # cells have been executed before it.
    import config
    import gensim
    from gensim.models import word2vec

    corpus_file = path + 'train_seg.csv'

    # Stream lines straight to the corpus file instead of collecting the whole
    # corpus in a list first. Inputs are opened before the output so a missing
    # input does not truncate an existing corpus file.
    with open(path + 'train_q_seg.csv', 'r', encoding='utf-8') as q_in, \
            open(path + 'train_a_seg.csv', 'r', encoding='utf-8') as a_in, \
            open(corpus_file, 'w', encoding='utf-8') as out_f:
        last_line = ""
        for line in q_in:
            # Skip a question that merely repeats the previous line.
            if line != last_line:
                out_f.write(line)
            last_line = line
        for line in a_in:
            out_f.write(line)

    print("word seg processed")

    sentences = word2vec.LineSentence(corpus_file)

    # gensim 4.0 renamed the dimensionality keyword from `size` to
    # `vector_size`; choose the one the installed version accepts.
    gensim_major = int(gensim.__version__.split('.')[0])
    dim_keyword = 'vector_size' if gensim_major >= 4 else 'size'
    model = word2vec.Word2Vec(
        sentences,
        min_count=config.min_count,
        workers=config.num_workers,
        **{dim_keyword: config.features}
    )

    model.save(path + 'word2vec.model')

In [5]:
# Writes train_seg.csv and word2vec.model under `path`; needs the *_seg.csv
# files produced by segmentation() to exist already.
train_word2vec()

word seg processed


In [4]:
import config

def convert_pkl(name):
    """Truncate each segmented line to ``config.max_length`` tokens and pickle.

    Reads ``<name>_seg.csv`` under the module-level ``path``, splits every
    line on single spaces, keeps at most ``config.max_length`` tokens, re-joins
    them with single spaces and appends a newline. The resulting list of
    strings (one per input line) is pickled to ``<name>.pkl`` under ``path``.

    Args:
        name: file stem, e.g. ``'train_q'``.
    """
    data = []
    with open(path + name + '_seg.csv', 'r', encoding='utf-8') as in_f:
        for line in in_f:
            word_set = line.split(' ')
            # Only the last token can carry the line terminator.
            word_set[-1] = word_set[-1].strip()
            # Slicing is a no-op when the line is already short enough.
            word_set = word_set[:config.max_length]
            data.append(" ".join(word_set) + "\n")

    # Close the pickle file explicitly so the data is flushed to disk when the
    # function returns rather than whenever the handle is garbage collected.
    with open(path + name + '.pkl', 'wb') as out_f:
        pickle.dump(data, out_f)

In [5]:
# Pickle the truncated token lines for every split/side combination.
for stem in ('train_q', 'train_a', 'val_q', 'val_a'):
    convert_pkl(stem)

In [3]:
# Display gensim's FAST_VERSION flag as the cell output.
# NOTE(review): presumably this reports whether gensim's compiled (fast)
# training routines are in use -- confirm against the gensim documentation.
from gensim.models import word2vec
word2vec.FAST_VERSION



1