### 数据清洗

In [1]:
import pandas as pd
from jieba import posseg
import jieba

In [2]:
# Raw CSV inputs, loaded with pd.read_csv further down.
QA_TRAIN_DATA_PATH = '../data/AutoMaster_TrainSet.csv'
QA_TEST_DATA_PATH = '../data/AutoMaster_TestSet.csv'
# Output files written by save_data: one segmented, space-joined sample per line.
QA_TRAIN_CLEAN_X_PATH = '../data/train_set.seg_x.txt'
QA_TRAIN_CLEAN_Y_PATH = '../data/train_set.seg_y.txt'
QA_TEST_CLEAN_X_PATH = '../data/test_set.seg_x.txt'
# Stop-word list, one entry per line, loaded by read_stopwords.
QA_STOPWORDS_PATH = '../data/stop_words.txt'

### 处理训练数据

In [3]:
# Load the raw training CSV, then preview the first rows and the overall shape.
train_df = pd.read_csv(QA_TRAIN_DATA_PATH, encoding='utf-8')
print(train_df.head())
print("Data Shape:", train_df.shape)

  QID Brand     Model                                           Question  \
0  Q1    奔驰     奔驰GL级                                方向机重，助力泵，方向机都换了还是一样   
1  Q2    奔驰      奔驰M级                                   奔驰ML500排气凸轮轴调节错误   
2  Q3    宝马  宝马X1(进口)  2010款宝马X1，2011年出厂，2.0排量，通用6L45变速箱，原地换挡位PRND车辆闯...   
3  Q4  Jeep       牧马人                              3.0V6发动机号在什么位置，有照片最好！   
4  Q5    奔驰      奔驰C级                       2012款奔驰c180怎么样，维修保养，动力，值得拥有吗   

                                            Dialogue  \
0  技师说：[语音]|车主说：新的都换了|车主说：助力泵，方向机|技师说：[语音]|车主说：换了...   
1  技师说：你这个有没有电脑检测故障代码。|车主说：有|技师说：发一下|车主说：发动机之前亮故障...   
2  技师说：你好，4缸自然吸气发动机N46是吧，先挂空档再挂其他档有没有闯动呢，变速箱油液位是否...   
3  技师说：右侧排气管上方，缸体上靠近变速箱|车主说：[图片]|车主说：是不是这个？|车主说：这...   
4  技师说：家庭用车的话，还是可以入手的|技师说：维修保养费用不高|车主说：12年的180市场价...   

                                      Report  
0                                       随时联系  
1                                       随时联系  
2  行驶没有顿挫的感觉，原地换挡有闯动，刹车踩重没有，这是力的限制的作用，应该没有问题  
3                 

In [4]:
# Drop rows whose Report is missing; Report is used as the target (train_y) below.
# inplace=True mutates train_df, so the preview printed earlier no longer
# reflects the current contents of the frame.
train_df.dropna(subset=['Report'], how='any', inplace=True)
print("Data Shape:", train_df.shape)

Data Shape: (82873, 6)


In [5]:
# The remaining columns (Brand, Model, Question, Dialogue) are inputs; any
# missing values in them are simply filled with the empty string.
train_df.fillna('', inplace=True)
# Only Question and Dialogue are used for the input text. str.cat is called
# without a separator, so the two fields are joined directly.
train_x = train_df.Question.str.cat(train_df.Dialogue)
print(train_x.head())
print("Data Shape:", train_x.shape)

0    方向机重，助力泵，方向机都换了还是一样技师说：[语音]|车主说：新的都换了|车主说：助力泵，...
1    奔驰ML500排气凸轮轴调节错误技师说：你这个有没有电脑检测故障代码。|车主说：有|技师说：...
2    2010款宝马X1，2011年出厂，2.0排量，通用6L45变速箱，原地换挡位PRND车辆闯...
3    3.0V6发动机号在什么位置，有照片最好！技师说：右侧排气管上方，缸体上靠近变速箱|车主说：...
4    2012款奔驰c180怎么样，维修保养，动力，值得拥有吗技师说：家庭用车的话，还是可以入手的...
Name: Question, dtype: object
Data Shape: (82873,)


In [6]:
# Target text is the Report column. The column is known to exist at this point
# because the earlier dropna(subset=['Report']) would have raised without it,
# so the former `train_y = []` fallback was unreachable -- and a plain list has
# no .shape, so the print below would have failed had it ever been used.
train_y = train_df['Report']
# Inputs and targets come from the same frame and must stay row-aligned.
assert len(train_x) == len(train_y)
print("Data Shape:", train_y.shape)

Data Shape: (82873,)


### 处理测试数据

In [7]:
# Load the test CSV and build its input text the same way as for training:
# fill missing values with '' and join Question with Dialogue (no separator).
# No rows are dropped here; only the training frame is filtered on Report.
test_df = pd.read_csv(QA_TEST_DATA_PATH, encoding='utf-8')
test_df.fillna('', inplace=True)
test_x = test_df.Question.str.cat(test_df.Dialogue)
print("Data Shape:", test_x.shape)

Data Shape: (20000,)


In [8]:
def parse_data(train_path, test_path):
    """Load the train and test CSVs and build the raw text samples.

    Args:
        train_path: path of the training CSV; it must contain the columns
            Question, Dialogue and Report.
        test_path: path of the test CSV; it must contain Question and Dialogue.

    Returns:
        A 4-tuple ``(train_x, train_y, test_x, test_y)``. ``train_x`` and
        ``test_x`` are Series holding Question directly followed by Dialogue,
        ``train_y`` is the Report column of the filtered training frame, and
        ``test_y`` is always an empty list.
    """
    def _question_plus_dialogue(frame):
        # The model input uses just these two columns, joined with no separator.
        return frame.Question.str.cat(frame.Dialogue)

    train_frame = pd.read_csv(train_path, encoding='utf-8')
    # A row without a Report has no target, so it is removed first.
    train_frame = train_frame.dropna(subset=['Report'], how='any')
    # Whatever is still missing belongs to an input column: blank it out.
    train_frame = train_frame.fillna('')
    train_x = _question_plus_dialogue(train_frame)

    train_y = []
    if 'Report' in train_frame.columns:
        train_y = train_frame.Report
        assert len(train_x) == len(train_y)

    test_frame = pd.read_csv(test_path, encoding='utf-8').fillna('')
    test_x = _question_plus_dialogue(test_frame)
    return train_x, train_y, test_x, []

In [9]:
def segment(sentence, cut_type='word', pos=False):
    """Split ``sentence`` into tokens, optionally with part-of-speech flags.

    Args:
        sentence: the text to split.
        cut_type: ``'word'`` segments with jieba, ``'char'`` splits into single
            characters. Any other value makes the function return ``None``.
        pos: when true, the POS flag of every token is returned as well.

    Returns:
        The token list when ``pos`` is false; a ``(tokens, flags)`` tuple of
        two parallel lists when ``pos`` is true.
    """
    by_word = cut_type == 'word'
    by_char = cut_type == 'char'

    if not pos:
        if by_word:
            return jieba.lcut(sentence)
        if by_char:
            return list(sentence)
        return None

    if by_word:
        tokens, flags = [], []
        for token, flag in posseg.lcut(sentence):
            tokens.append(token)
            flags.append(flag)
        return tokens, flags
    if by_char:
        tokens = list(sentence)
        # Every character is tagged on its own; the flag of the first pair is kept.
        flags = [posseg.lcut(char)[0].flag for char in tokens]
        return tokens, flags
    return None

In [21]:
def save_data(data, path, stopwords=None):
    """Segment every text in ``data`` and write one space-joined sample per line.

    Args:
        data: iterable of samples; items that are not ``str`` are skipped.
        path: output file; it is overwritten and written as UTF-8.
        stopwords: collection of tokens to drop after segmentation. ``None``
            (the default) filters nothing.

    Note:
        A sample that is not a string, or that has no tokens left once the
        stop words are removed, produces no output line, so the file can hold
        fewer lines than ``data`` has items. ``path`` and the number of lines
        actually written are printed when the function finishes.
    """
    # A None sentinel replaces the former ``set()`` default, which was one
    # mutable object shared by every call of the function.
    if stopwords is None:
        stopwords = frozenset()
    count = 0
    with open(path, 'w', encoding='utf-8') as out_file:
        for line in data:
            if not isinstance(line, str):
                continue
            tokens = [token for token in segment(line.strip(), cut_type='word')
                      if token not in stopwords]
            if not tokens:
                continue
            out_file.write(' '.join(tokens) + '\n')
            count += 1
    print(path, count)

In [11]:
def read_stopwords(path):
    """Return the set of whitespace-stripped lines found in the UTF-8 file ``path``.

    Duplicates collapse into one entry; a blank line contributes the empty
    string to the set.
    """
    with open(path, mode='r', encoding='utf-8') as stopword_file:
        return {raw_line.strip() for raw_line in stopword_file}

In [12]:
# Load the stop-word set that is passed to save_data below, and show its size.
stopwords = read_stopwords(QA_STOPWORDS_PATH)
print(len(stopwords))

312


In [13]:
# Rebuild the train/test text through parse_data. Its fourth return value is
# always an empty list and is discarded here.
train_list_src, train_list_trg, test_list_src, _ = parse_data(QA_TRAIN_DATA_PATH, QA_TEST_DATA_PATH)
# Source and target counts should match.
print(len(train_list_src))
print(len(train_list_trg))

82873
82873


In [22]:
# Segment the training inputs and write them to the train X file.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# file on disk may be incomplete -- re-run to completion before using it.
save_data(train_list_src, QA_TRAIN_CLEAN_X_PATH, stopwords)

KeyboardInterrupt: 

In [18]:
# Segment the training targets (Report text) and write them to the train Y file.
# NOTE(review): save_data skips samples left empty after stop-word removal, so
# this file is not guaranteed to stay line-aligned with the train X file --
# compare the printed counts of both calls.
save_data(train_list_trg, QA_TRAIN_CLEAN_Y_PATH, stopwords)

82873


In [None]:
# Segment the test inputs and write them to the test X file.
# NOTE(review): this cell has no recorded execution count or output; confirm
# it has actually been run.
save_data(test_list_src, QA_TEST_CLEAN_X_PATH, stopwords)