In [1]:
import pandas as pd
import numpy as np
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn
warnings.simplefilter('ignore')
import sys
sys.path.extend(['../src/'])
from config import *
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
% matplotlib inline

config = Config()

### 数据读取

In [2]:
data = pd.read_excel('cache.xlsx')

In [3]:
data.columns

Index(['服务单号', '客户名称', '客户类型', '紧急程度', '问题发现日期', '分配工程师', '设备编号', '设备型号',
       '服务工时(小时)', '服务间隔天数', '上次维修时间', '装机日期', '维修服务内容', '问题汇总', '解决方案汇总',
       '装机年份'],
      dtype='object')

### 特征工程

由于每条样本都是一条单独的维修记录，不用把该仪器设备单独提出来做连续的时间线，只需要把每个时间点的仪器当成一台平行时空下的独立的仪器就好。

对于该条样本下，该仪器当前的状态构建以下特征：

1. 该仪器型号的平均使用年限：定义一台仪器淘汰所需要的平均年限，需要找工程师问有哪些仪器是已经报废的，报废时间及装机时间
2. 该仪器型号报废前的平均维修次数：可精确到各个部件各维修的次数
3. 该仪器最近一年的维修次数
4. 该仪器最近一年的维修次数占对应客户最近一年的总维修次数的比
5. 仪器对应客户最近一年的维修总次数
6. 仪器最近一年遭遇严重问题zp1的次数
7. 仪器最近一年遭遇zp1在总次数中的占比
8. 仪器距离上一次维修的时间间隔
9. 仪器距离上一次维护的时间间隔
10. 该仪器的已使用年限
11. 同类仪器对应使用年限时的平均维修次数
。。。

#### 在构造特征之前需要提取的信息

##### 每类型仪器各有多少台

In [4]:
instrument_uniq = data.drop_duplicates(subset = '设备编号', keep = 'last')
instrument_uniq.loc[instrument_uniq['设备型号'].isnull(),:]

Unnamed: 0,服务单号,客户名称,客户类型,紧急程度,问题发现日期,分配工程师,设备编号,设备型号,服务工时(小时),服务间隔天数,上次维修时间,装机日期,维修服务内容,问题汇总,解决方案汇总,装机年份
3830,BS-SHFW-维修部201708290097,浙江博圣生物技术股份有限公司,一般客户,未知,2017-08-27,陈凌云,BS-2016-WTS-TQS-01,,1.0,2.0,2017-08-27,2016-07-01,维修,基因小镇的氮气发生器亮service灯氮气发生器亮service等,1.氮气发生器型号为genius3020，软件版本为v3.0。genius3020有4个空气...,2016
4245,BS-SHFW-维修部201711133460,无锡市妇幼保健院,VIP,未知,2017-11-13,戴明路,BS-2017-NS550-01,,,,2017-11-13,2017-11-15,装机/移机,协助厂家工程师安装一套安诺优达NIPT系统陪同装机,,2017


发现共有两台仪器数据没有

In [5]:
data.loc[data['设备编号'] == 'BS-2016-WTS-TQS-01',:]

Unnamed: 0,服务单号,客户名称,客户类型,紧急程度,问题发现日期,分配工程师,设备编号,设备型号,服务工时(小时),服务间隔天数,上次维修时间,装机日期,维修服务内容,问题汇总,解决方案汇总,装机年份
1502,BS-SHFW-维修部201708290097,浙江博圣生物技术股份有限公司,一般客户,未知,2017-08-27,陈凌云,BS-2016-WTS-TQS-01,,1.0,2.0,2016-07-01,2016-07-01,维修,基因小镇的氮气发生器亮service灯氮气发生器亮service等,1.氮气发生器型号为genius3020，软件版本为v3.0。genius3020有4个空气...,2016
3830,BS-SHFW-维修部201708290097,浙江博圣生物技术股份有限公司,一般客户,未知,2017-08-27,陈凌云,BS-2016-WTS-TQS-01,,1.0,2.0,2017-08-27,2016-07-01,维修,基因小镇的氮气发生器亮service灯氮气发生器亮service等,1.氮气发生器型号为genius3020，软件版本为v3.0。genius3020有4个空气...,2016


In [6]:
data.loc[data['设备编号'].apply(lambda x: 'WTS' in x), ['设备编号','设备型号']]

Unnamed: 0,设备编号,设备型号
0,BS-2016-WTS-TQD-16,Xevo TQD
1,BS-2016-WTS-TQD-01,TQD
78,BS-2014-WTS-TQD-11,TQD
203,BS-2013-WTS-QM-01,Quattro Micro
240,BS-2014-WTS-TQD-08,TQD
248,BS-2015-WTS-TQD-06,TQD
312,OD-2016-WTS-QM-01,Quattro Micro
314,OD-2016-WTS-QM-01,Quattro Micro
319,BS-2014-WTS-TQD-05,TQD
376,BS-2014-WTS-TQD-05,TQD


怀疑是陈凌云填错了，其实是TQD不是TQS

In [7]:
data.loc[data['设备编号'] == 'BS-2016-WTS-TQS-01','设备编号'] = 'BS-2016-WTS-TQD-01'
data.loc[data['设备编号'] == 'BS-2016-WTS-TQD-01','设备型号'] = 'TQD'

In [8]:
data.loc[data['设备编号'].apply(lambda x: 'NS' in x), ['设备编号','设备型号']]

Unnamed: 0,设备编号,设备型号
1917,BS-2017-NS550-01,
4245,BS-2017-NS550-01,
6875,AN-NS550-2018001,Nextseq 550AR


In [9]:
data.loc[data['设备编号'] == 'BS-2017-NS550-01', '设备型号'] = 'Nextseq 550AR'

In [10]:
instrument_uniq = data.drop_duplicates(subset = '设备编号', keep = 'last')

In [11]:
instrument_uniq['设备型号'].value_counts()

1235-514                160
1420-020                148
GSL-120/GSL-10          142
2081-0010               104
CDS-5                    91
1235-5220                72
2021-0010                71
TQD                      68
GCS3000DX2               63
Capillarys 2 FP          40
CaptureStation           33
1235-501                 28
6000-0010                26
Xevo TQD                 25
luminex 200              20
Quattro Micro            19
1296-026                 11
KM1                       7
1296-003                  7
GCS3000Dx2                6
1420-012                  5
GSL-120                   5
00-0335                   4
KM2                       4
1296-0010                 3
UPLC                      3
6000                      2
CaptureStation(FISH)      2
5014-0020                 2
Nextseq 550AR             2
LX200                     2
MB4                       1
TQS                       1
BGI SEQ-500               1
SLAN-96S                  1
MB8/MB4             

#### 文本挖掘，重要问题zq1

1. 定义一个函数可以分词出单仪器型号的所有实体名词
2. 圈出其中的问题部分定义为zq1保存在文件problem.txt中

In [15]:
data['问题汇总'].fillna('', inplace = True)
data['问题汇总_jieba'] = data['问题汇总'].apply(seg_sentence)

data['解决方案汇总'].fillna('', inplace = True)
data['解决方案汇总_jieba'] = data['解决方案汇总'].apply(seg_sentence)

data['jieba'] = data['问题汇总_jieba'] + data['解决方案汇总_jieba']

##### 训练词向量

In [147]:
from tqdm import tqdm_notebook

# 处理低频词
def construct_dict(corput):
    word_dict = {}
    for line in tqdm_notebook(corput):
        for e in line.strip().split():
            word_dict[e] = word_dict.get(e, 0) + 1
    return word_dict
word_dict = construct_dict(data['jieba'])
word_stop_word = [e for e in word_dict if word_dict[e] <=2]
# pickle.dump(set(word_stop_word), open('../../data/word_stopword.pkl', 'wb'))




In [148]:
def filter_low_freq(corput, min_freq = 2):    
    word_seg_mf2 = []
    for w in tqdm_notebook(corput):
        word_seg_mf2.append(' '.join([e for e in w.split() if word_dict[e] > min_freq]))
    return word_seg_mf2
data['jieba_mf'] = filter_low_freq(data['jieba'])




In [150]:
config.savemodel_path = '../source/model/'
os.makedirs(config.savemodel_path, exist_ok=True)

In [165]:
# 训练词向量
def train_w2v_model(corpus, min_freq=5, embed_size=100):
    sentences = []
    for e in tqdm(corpus):
        sentences.append([i for i in e.strip().split() if i])
    print('训练集语料:', len(corpus))
    print('总长度: ', len(sentences))
    model = Word2Vec(sentences,size = embed_size, window= min_freq,min_count=1,negative=3,
                     sg=1,sample=0.001,hs=1,workers=4,iter=15)
    model.itos = {}
    model.stoi = {}
    model.embedding = {}
    print('保存模型...')
    for k in tqdm(model.wv.vocab.keys()):
        model.itos[model.wv.vocab[k].index] = k
        model.stoi[k] = model.wv.vocab[k].index
        model.embedding[model.wv.vocab[k].index] = model.wv[k]
    model.save('{}word2vec.{}d.mfreq{}.model'.format(config.savemodel_path, embed_size, min_freq))
    return model
model = train_w2v_model(data['jieba_mf'])
model.wv.save_word2vec_format('{}laozhu-word-300d'.format(config.savemodel_path), binary=False)

100%|██████████████████████████████████████████████████████████████████████████| 6904/6904 [00:00<00:00, 111653.61it/s]


训练集语料: 6904
总长度:  6904
保存模型...


100%|██████████████████████████████████████████████████████████████████████████| 7088/7088 [00:00<00:00, 444183.87it/s]


In [13]:
import jieba
import jieba.posseg as pseg
jieba.load_userdict('../source/words/problem.txt')
# 创建停用词list  
def stopwordslist(filepath):  
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]  
    return stopwords    
  
# 对句子进行分词  
def seg_sentence(sentence):  
    sentence_seged = jieba.cut(sentence.strip(), HMM=True, cut_all=False)  
#     sentence_seged = jieba.cut_for_search(sentence.strip(), HMM=True)
    stopwords = stopwordslist('../source/words/中文停用词表.txt')  # 这里加载停用词的路径
    outstr = ''  
    for word in sentence_seged:  
        if word not in stopwords:  
            if word != '\t':  
                outstr += word  
                outstr += " "  
    return outstr

In [14]:
import jieba
from jieba import analyse

def keywords_(dataseries, allowPos):
    cache = ""
    for i in list(dataseries):
        try:
            cache = cache + i
        except:
            cache = cache + ""
    content = " ".join(jieba.cut(cache))
    key_words = analyse.extract_tags(content, topK=2000, withWeight=True, allowPOS= allowPos)
    textrank = analyse.textrank(content,topK = 200,withWeight = True, allowPOS = allowPos)
    keywords = dict()
    for i in key_words:
        keywords[i[0]] = i[1]
    return textrank, pd.Series(keywords)

In [42]:
textrank = {}
keywords = {}
for i in ['nb','n','nr', 'ns','nt','nz']:
# allowP = ('nb','n','nr', 'ns','nt','nz')
    allowP = (i)
    textrank[i], keywords[i] = keywords_(data_1235['问题汇总'], allowP)

In [47]:
pd.options.display.max_rows = 400

pd.concat(keywords,axis = 1).sort_values(by = 'nt', ascending = False)

Unnamed: 0,n,nb,nr,ns,nt,nz
妇幼,0.401783,0.395805,0.397598,0.401141,0.391568,0.401691
故障,0.375127,0.369546,0.371220,0.374527,0.365590,0.375041
注射器,0.294067,0.289692,0.291004,0.293597,0.286591,0.294000
仪器,0.261400,0.257511,0.258677,0.260982,0.254754,0.261340
联机,0.258150,0.254309,0.255461,0.257737,0.251586,0.258091
右枪,0.221587,0.218290,0.219279,0.221233,0.215954,0.221537
样本,0.214136,0.210950,0.211906,0.213794,0.208692,0.214087
洗板,0.183288,0.180561,0.181379,0.182995,0.178628,0.183246
过程,0.173033,0.170458,0.171230,0.172756,0.168634,0.172993
试剂,0.153995,0.151704,0.152391,0.153749,0.150080,0.153960


可以看到对nb, n, nr, ns, nt, nz的keywords是有一定区别的，空缺部分不含有我们想要的词语，所以取n就可以了

对于每个词来说，在紧急、重要和正常出现的概率是否有不同呢，可以看一下

In [55]:
data['紧急程度'].value_counts()

未知     3577
正常     1529
紧急      836
无要求     580
重要      382
Name: 紧急程度, dtype: int64

In [76]:
textrank = {}
keywords = {}

for item in list(data['紧急程度'].unique()):
    textrank[item], keywords[item] = keywords_(data.loc[(data['设备型号'] == '1235-514')&(data['维修服务内容'] == '维修')&(data['紧急程度'] == item),'问题汇总'], ('n'))

In [79]:
pd.concat(keywords, axis = 1).sort_values(by = '紧急', ascending = False).fillna('')

Unnamed: 0,无要求,未知,正常,紧急,重要
注射器,0.18161,0.265519,0.281426,0.496189,
联机,0.183493,0.237709,0.0812411,0.472685,
洗板,,0.160211,,0.402247,
右枪,,0.186912,0.319402,0.402247,
故障,,0.383761,0.251533,0.3437,0.670512
过程,,0.1421,0.159354,0.343397,0.0809124
板框,,0.0337142,,0.248863,
样本,,0.192725,0.324697,0.240442,0.395677
皮带,0.156146,0.0838029,,0.219404,
真空,0.13716,0.0837663,,0.214139,


In [176]:
w2v_model = Word2Vec.load(config.w2v_file) 

In [184]:
import pandas as pd
import numpy as np
import re

from gensim.models import Word2Vec

def w2v_split(dataseries, window_size = 5, embed_size = 100): 
    w2v_col = [f'w2v_{i}' for i in range(embed_size)]
    all_texts = []
    dataseries.fillna('',inplace = True)
    for seq in dataseries:
        for shift in range(0,window_size):
            all_texts.append([word for word in re.findall(r'.{'+str(window_size)+'}',seq[shift:])])
    model = Word2Vec.load(config.w2v_file)    
    w2v_feat = []
    i = 0
    while i <= len(all_texts)-window_size:
        sum_w2v = np.zeros(shape=(embed_size,))
        for j in range(i,i+ window_size):
            for word in all_texts[j]:
                try:
                    sum_w2v += model[word]
                except:
                    continue
        w2v_feat.append(sum_w2v)
        i = i+window_size
    w2v_feat = np.vstack(w2v_feat)
    df_w2v = pd.DataFrame(w2v_feat,columns=w2v_col)
    return df_w2v

w2v_train = w2v_split(data['jieba_mf'])

In [170]:
def init_embedding(config, word2vec_model):
    vocab_len = len(word2vec_model.stoi) + 2
    print('Vocabulaty size : ', vocab_len)
    print('create embedding matrix')
    all_embs = np.stack(word2vec_model.embedding.values())
    embed_matrix = np.random.normal(all_embs.mean(), all_embs.std(), size=(vocab_len, config.EMBED_SIZE))
    embed_matrix[-2] = 0  # padding
    for i, val in tqdm(word2vec_model.embedding.items()):
        embed_matrix[i] = val
    return embed_matrix

In [171]:
init_embed = init_embedding(config, word2vec_model)

Vocabulaty size :  7090
create embedding matrix


100%|█████████████████████████████████████████████████████████████████████████| 7088/7088 [00:00<00:00, 1019380.97it/s]


In [49]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

dataseries_train = data.loc[(data['设备型号'] == '1235-514')&(data['维修服务内容'] == '维修')&(data['紧急程度'].isin(['正常','紧急'])),['紧急程度','问题汇总','问题汇总_jieba','jieba']].reset_index(drop = True)
dataseries_test = data.loc[(data['设备型号'] == '1235-514')&(data['维修服务内容'] == '维修')&(data['紧急程度'].isin(['无要求','重要'])),['紧急程度','问题汇总','问题汇总_jieba','jieba']]
w2v_train, w2v_test = w2v_train_test_split(dataseries_train['jieba'], dataseries_test['jieba'])

y_train = data.loc[(data['设备型号'] == '1235-514')&(data['维修服务内容'] == '维修')&(data['紧急程度'].isin(['正常','紧急'])),'紧急程度'].reset_index(drop = True).map({'正常': 0, '紧急': 1})

In [63]:
def lgb_w2v_train(X, Y, validation_size = 0.3):
    X_model, X_pred, Y_model, Y_pred = train_test_split(X, Y, test_size = validation_size,
                                                        random_state= 2019)
    train = lgb.Dataset(X_model, label=Y_model)
    valid = train.create_valid(X_pred, label=Y_pred)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_weight': 3,
        'num_leaves': 63,
        'lambda_l2': 6,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'learning_rate': 0.05,
        'seed': 2019,
        'nthread': -1,
        'bagging_fraction': 0.7,
        'bagging_freq': 100
    }
    num_round = 40000
    gbm = lgb.train(params, train, num_round,
                    verbose_eval=500,valid_sets=[train, valid],early_stopping_rounds= 1000)
    return gbm

lgb_w2v = lgb_w2v_train(w2v_train, y_train)

Training until validation scores don't improve for 1000 rounds.
[500]	training's auc: 1	valid_1's auc: 0.792702
[1000]	training's auc: 1	valid_1's auc: 0.815994
Early stopping, best iteration is:
[421]	training's auc: 1	valid_1's auc: 0.78882


In [64]:
preds_test = lgb_w2v.predict(w2v_test)
dataseries_test[(preds_test< 0.5)]

Unnamed: 0,紧急程度,jieba
1827,重要,周 连续 两次 实验 取放 防蒸发帽 报错 . 检查 历史记录 第一天 实验 显示 自检 时...
1853,重要,全省 质控 分析 发现 鼓楼 医院 质控 整体 偏低 CV 偏大 客户 怀疑 设备 问题 ....
2027,重要,1235 产筛 使用 质控 近期 CV 超出 接受 范围 质控 CV 较大 . 检查 仪器 ...
2039,重要,1235 旧 仪器 1297 读码器 扫 样本 条码 识别 1297 扫 样品 条码 识别率...
2047,重要,1235 电脑主机 经常性 不定期 蓝屏 已经 处理 多次 需要 彻底解决 . 查看 历史记...
2545,重要,近 两周 实验 质控 呈现 高低 不同 状态 第三 板 质控 偏高 近期 产筛实验 HCG ...
2593,重要,整板 铕标 检测 发现 两台 仪器 均 出现 A1 H12 检测值 高 5% 情况 需 上门...
4155,重要,周 连续 两次 实验 取放 防蒸发帽 报错 . 检查 历史记录 第一天 实验 显示 自检 时...
4181,重要,全省 质控 分析 发现 鼓楼 医院 质控 整体 偏低 CV 偏大 客户 怀疑 设备 问题 ....
4355,重要,1235 产筛 使用 质控 近期 CV 超出 接受 范围 质控 CV 较大 . 检查 仪器 ...


##### lgb调参对结果的影响

In [117]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

def lgb_evaluate(num_leaves, subsample, colsample_bytree, min_child_samples, reg_alpha, reg_lambda):

    train = lgb.Dataset(X, label=y_train)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_samples': int(min_child_samples),
        'num_leaves': int(num_leaves),
        'reg_alpha': max(reg_alpha,0),
        'reg_lambda': max(reg_lambda,0),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0) ,
        'learning_rate': 0.05,
        'seed': 2019,
        'nthread': -1,}
    cv_result = lgb.cv(params, train, nfold = 5, seed= 2019, stratified=True, verbose_eval =200, metrics=['auc'])
    return max(cv_result['auc-mean'])

random_state = 2019
num_iter = 25
init_points = 10

X = w2v_train
y = y_train

lgbBO = BayesianOptimization(lgb_evaluate, {'num_leaves': (60, 120), 
                                            'subsample': (0.5, 1),
                                            'colsample_bytree': (0.5, 1),
                                            'min_child_samples': (5,100), 
                                            'reg_alpha': (0,10),
                                            'reg_lambda': (0,10)
                                            })

lgbBO.maximize(init_points=init_points, n_iter=num_iter)

[31mInitialization[0m
[94m----------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   min_child_samples |   num_leaves |   reg_alpha |   reg_lambda |   subsample | 
    1 | 00m01s | [35m   0.82583[0m | [32m            0.8564[0m | [32m            57.2218[0m | [32m     77.9289[0m | [32m     1.7554[0m | [32m      7.7397[0m | [32m     0.9900[0m | 
    2 | 00m00s |    0.79792 |             0.5069 |             87.9956 |     100.4786 |      4.7796 |       4.8014 |      0.6733 | 
    3 | 00m01s | [35m   0.85259[0m | [32m            0.6573[0m | [32m            27.5409[0m | [32m     93.8173[0m | [32m     4.1217[0m | [32m      3.9510[0m | [32m     0.9635[0m | 
    4 | 00m01s |    0.83928 |             0.8118 |             50.6123 |      81.7515 |      0.6991 |       9.2283 |      0.6861 | 
    5 | 00m00s |    0.81593 |             0.5888 |

##### 仪器数据合并会有效果吗

In [136]:
dataseries_train = data.loc[(data['维修服务内容'] == '维修')&(data['紧急程度'].isin(['正常','紧急'])),['紧急程度','问题汇总','问题汇总_jieba','jieba']].reset_index(drop = True)
dataseries_test = data.loc[(data['维修服务内容'] == '维修')&(data['紧急程度'].isin(['无要求','重要'])),['紧急程度','问题汇总','问题汇总_jieba','jieba']]
w2v_train, w2v_test = w2v_train_test_split(dataseries_train['jieba'], dataseries_test['jieba'])

y_train = data.loc[(data['维修服务内容'] == '维修')&(data['紧急程度'].isin(['正常','紧急'])),'紧急程度'].reset_index(drop = True).map({'正常': 0, '紧急': 1})

In [137]:
max_params = lgbBO.res['max']['max_params']

def lgb_w2v_train(X, Y, validation_size = 0.3):
    X_model, X_pred, Y_model, Y_pred = train_test_split(X, Y, test_size = validation_size,
                                                        random_state= 2019)
    train = lgb.Dataset(X_model, label=Y_model)
    valid = train.create_valid(X_pred, label=Y_pred)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_samples': int(max_params['min_child_samples']),
        'num_leaves': int(max_params['num_leaves']),
        'reg_alpha': max_params['reg_alpha'],
        'reg_lambda': max_params['reg_lambda'],
        'subsample': max_params['subsample'],
        'colsample_bytree': max_params['colsample_bytree'],
        'learning_rate': 0.05,
        'seed': 2019,
        'nthread': -1
    }
    num_round = 40000
    gbm = lgb.train(params, train, num_round,
                    verbose_eval=500,valid_sets=[train, valid],early_stopping_rounds= 1000)
    return gbm

lgb_w2v = lgb_w2v_train(w2v_train, y_train)
preds_train = lgb_w2v.predict(w2v_train)
preds_test = lgb_w2v.predict(w2v_test)

Training until validation scores don't improve for 1000 rounds.
[500]	training's auc: 1	valid_1's auc: 0.930852
[1000]	training's auc: 1	valid_1's auc: 0.930852
Early stopping, best iteration is:
[10]	training's auc: 1	valid_1's auc: 0.885333


In [138]:
preds_train = lgb_w2v.predict(w2v_train)
dataseries_train[(y_train == 1)&(preds_train<0.5)]

Unnamed: 0,紧急程度,问题汇总,问题汇总_jieba,jieba
166,紧急,地贫9号马达超出限位，无法启动仪器1，9号马达超出限位，无法启动仪器。2，检查仪器发现有机玻...,地贫 号 马达 超出 限位 无法 启动 仪器 号 马达 超出 限位 无法 启动 仪器 检查 ...,地贫 号 马达 超出 限位 无法 启动 仪器 号 马达 超出 限位 无法 启动 仪器 检查 ...
363,紧急,处理之前温湿度超标的两支笔处理之前两支DX温湿度超标的笔,处理 之前 温湿度 超标 两支 笔 处理 之前 两支 DX 温湿度 超标 笔,处理 之前 温湿度 超标 两支 笔 处理 之前 两支 DX 温湿度 超标 笔 . 使用 福建...
391,紧急,GSL120扫描跳片，模板对焦无效，确认原因并解决&年度维护&客户使用培训机器跳片&客户培训...,GSL120 扫描 跳片 模板 对焦 无效 确认 原因 解决 & 年度维护 & 客户 使用 ...,GSL120 扫描 跳片 模板 对焦 无效 确认 原因 解决 & 年度维护 & 客户 使用 ...
601,紧急,地贫9号马达超出限位，无法启动仪器1，9号马达超出限位，无法启动仪器。2，检查仪器发现有机玻...,地贫 号 马达 超出 限位 无法 启动 仪器 号 马达 超出 限位 无法 启动 仪器 检查 ...,地贫 号 马达 超出 限位 无法 启动 仪器 号 马达 超出 限位 无法 启动 仪器 检查 ...
662,紧急,2号加样针在实验过程中下坠,号 加样针 实验过程 中 下坠,号 加样针 实验过程 中 下坠
834,紧急,120需要更换灯泡，打印模板设置120灯泡损坏，需要更换,120 需要 更换 灯泡 打印 模板 设置 120 灯泡 损坏 需要 更换,120 需要 更换 灯泡 打印 模板 设置 120 灯泡 损坏 需要 更换 更换 显微镜 卤...
974,紧急,bobs需要校准激光红激光CON1CV值偏大，校准和性能验证失败。,bobs 需要 校准 激光 红 激光 CON1CV 值 偏大 校准 性能验证 失败,bobs 需要 校准 激光 红 激光 CON1CV 值 偏大 校准 性能验证 失败 查看 管...


In [None]:
data['紧急指数'] = lgb_w2v.predict

##### problem.txt对结果的影响

In [58]:
jieba.load_userdict('../source/words/problem.txt')

data['问题汇总'].fillna('', inplace = True)
data['问题汇总_jieba'] = data['问题汇总'].apply(seg_sentence)

data['解决方案汇总'].fillna('', inplace = True)
data['解决方案汇总_jieba'] = data['解决方案汇总'].apply(seg_sentence)

data['jieba'] = data['问题汇总_jieba'] + data['解决方案汇总_jieba']

dataseries_train = data.loc[(data['设备型号'] == '1235-514')&(data['维修服务内容'] == '维修')&(data['紧急程度'].isin(['正常','紧急'])),['紧急程度','jieba']].reset_index(drop = True)
dataseries_test = data.loc[(data['设备型号'] == '1235-514')&(data['维修服务内容'] == '维修')&(data['紧急程度'].isin(['无要求','重要'])),['紧急程度','jieba']]

w2v_train, w2v_test = w2v_train_test_split(dataseries_train['jieba'], dataseries_test['jieba'])
y_train = data.loc[(data['设备型号'] == '1235-514')&(data['维修服务内容'] == '维修')&(data['紧急程度'].isin(['正常','紧急'])),'紧急程度'].reset_index(drop = True).map({'正常': 0, '紧急': 1})
lgb_w2v = lgb_w2v_train(w2v_train, y_train)
preds_train = lgb_w2v.predict(w2v_train)
preds_test = lgb_w2v.predict(w2v_test)

Training until validation scores don't improve for 2000 rounds.
[100]	training's auc: 0.945011	valid_1's auc: 0.746118
[200]	training's auc: 0.991071	valid_1's auc: 0.807453
[300]	training's auc: 0.987528	valid_1's auc: 0.801242
[400]	training's auc: 0.998299	valid_1's auc: 0.77795
[500]	training's auc: 1	valid_1's auc: 0.792702
[600]	training's auc: 0.999858	valid_1's auc: 0.815217
[700]	training's auc: 1	valid_1's auc: 0.806677
[800]	training's auc: 1	valid_1's auc: 0.807453
[900]	training's auc: 1	valid_1's auc: 0.804348
[1000]	training's auc: 1	valid_1's auc: 0.815994
[1100]	training's auc: 1	valid_1's auc: 0.809783
[1200]	training's auc: 1	valid_1's auc: 0.80823
[1300]	training's auc: 1	valid_1's auc: 0.813665
[1400]	training's auc: 1	valid_1's auc: 0.826863
[1500]	training's auc: 1	valid_1's auc: 0.829969
[1600]	training's auc: 1	valid_1's auc: 0.83618
[1700]	training's auc: 1	valid_1's auc: 0.832298
[1800]	training's auc: 1	valid_1's auc: 0.832298
[1900]	training's auc: 1	valid_

In [59]:
preds_train = lgb_w2v.predict(w2v_train)
dataseries_train[(y_train == 1)&(preds_train<0.5)]

Unnamed: 0,紧急程度,jieba
66,紧急,1297 加 标准品 报错 1235 维护保养 1297 加 标准品 报错 1297 做 液...
95,紧急,更换 弹簧 测试 lift 取框 正常 更换 弹簧
168,紧急,1297 加 标准品 报错 1235 维护保养 1297 加 标准品 报错 1297 做 液...
225,紧急,号 负压泵 损坏 需要 更换 更换 负压泵 发现 pool 阀 18 号口 松动 旋紧 加扎...


In [61]:
preds_test = lgb_w2v.predict(w2v_test)
dataseries_test[(preds_test< 0.5)]

Unnamed: 0,紧急程度,jieba
1827,重要,周 连续 两次 实验 取放 防蒸发帽 报错 . 检查 历史记录 第一天 实验 显示 自检 时...
1853,重要,全省 质控 分析 发现 鼓楼 医院 质控 整体 偏低 CV 偏大 客户 怀疑 设备 问题 ....
2027,重要,1235 产筛 使用 质控 近期 CV 超出 接受 范围 质控 CV 较大 . 检查 仪器 ...
2039,重要,1235 旧 仪器 1297 读码器 扫 样本 条码 识别 1297 扫 样品 条码 识别率...
2047,重要,1235 电脑主机 经常性 不定期 蓝屏 已经 处理 多次 需要 彻底解决 . 查看 历史记...
2545,重要,近 两周 实验 质控 呈现 高低 不同 状态 第三 板 质控 偏高 近期 产筛实验 HCG ...
2593,重要,整板 铕标 检测 发现 两台 仪器 均 出现 A1 H12 检测值 高 5% 情况 需 上门...
4155,重要,周 连续 两次 实验 取放 防蒸发帽 报错 . 检查 历史记录 第一天 实验 显示 自检 时...
4181,重要,全省 质控 分析 发现 鼓楼 医院 质控 整体 偏低 CV 偏大 客户 怀疑 设备 问题 ....
4355,重要,1235 产筛 使用 质控 近期 CV 超出 接受 范围 质控 CV 较大 . 检查 仪器 ...


#### 特征构造

由于把每一条记录都当做一台单独平行时空的仪器，所以需要将该样本维修时间之前的所有样本提出来当做原始数据  
最终要对每一条记录跑一个大循环来计算每一个特征

In [28]:
data.columns

Index(['服务单号', '客户名称', '客户类型', '紧急程度', '问题发现日期', '分配工程师', '设备编号', '设备型号',
       '服务工时(小时)', '服务间隔天数', '上次维修时间', '装机日期', '维修服务内容', '问题汇总', '解决方案汇总'],
      dtype='object')

In [100]:
data['问题发现日期'] = pd.to_datetime(data['问题发现日期'], errors = 'coerce')
data = data.sort_values(by = '问题发现日期')

##### 该仪器最近一年的维修次数

In [101]:
data = data.reset_index(drop = True)

def service_count_year(data):
    instrument_id = data.iloc[-1]['设备编号']
    cache_ins = data.loc[data['设备编号'] == instrument_id,:]    
    cache_ins['year'] = cache_ins['问题发现日期'].apply(lambda x: x.year)
    last_year = cache_ins.iloc[-1]['year']
    return cache_ins['year'].value_counts()[last_year]

In [102]:
from tqdm import tqdm_notebook

service_count = []
for ix in tqdm_notebook(data.index):
    data_time = data.iloc[:ix+1].copy()
    service_count.append(service_count_year(data_time))




In [103]:
data['最近一年维修次数'] = service_count

##### 该客户最近一年的维修总次数

In [104]:
def client_service_count_year(data):
    client_id = data.iloc[-1]['客户名称']
    cache_ins = data.loc[data['客户名称'] == client_id,:]    
    cache_ins['year'] = cache_ins['问题发现日期'].apply(lambda x: x.year)
    last_year = cache_ins.iloc[-1]['year']
    return cache_ins['year'].value_counts()[last_year]

In [105]:
client_service_count = []
for ix in tqdm_notebook(data.index):
    data_time = data.iloc[:ix+1].copy()
    client_service_count.append(client_service_count_year(data_time))




In [106]:
data['最近一年该客户的维修总次数'] = client_service_count

##### 该仪器最近一年的维修次数占对应客户最近一年的总维修次数的比

In [107]:
data['年修占比'] = data['最近一年维修次数']/data['最近一年该客户的维修总次数']

##### 仪器距离上一次维修维护的时间间隔

In [148]:
data.loc[pd.to_datetime(data['装机日期'])< pd.to_datetime('1999-05-01'), '装机日期'] = '2005-06-28'

In [149]:
data.loc[(data['装机日期'].isnull())&(data['装机年份'].notnull()),'装机日期'] = data.loc[(data['装机日期'].isnull())&(data['装机年份'].notnull()),'装机年份']
data.loc[data['装机日期'] == '20XX','装机日期'] = '2016'
data.loc[data['上次维修时间'].isnull(),'上次维修时间'] = data.loc[data['上次维修时间'].isnull(),'装机日期']
data.loc[data['上次维修时间'] == '20XX','上次维修时间'] = '2016'

##### 该仪器的已使用年限

In [150]:
days_cache = (pd.to_datetime(data['问题发现日期'], errors = 'coerce') - pd.to_datetime(data['装机日期'], errors = 'coerce')).apply(lambda x: x.days)
data['已使用年限'] = round(days_cache/365.25,2)

In [151]:
data.loc[data['已使用年限']<0,['设备编号','问题发现日期','装机日期','维修服务内容']]

Unnamed: 0,设备编号,问题发现日期,装机日期,维修服务内容
0,BS-2016-WTS-TQD-16,2016-06-30,2016-10-08 00:00:00,维修
105,BS-2016-TMN-CDS-10,2016-09-19,2016-09-22 00:00:00,维修
126,BS-2016-TMN-CDS-17,2016-09-26,2016-09-30 00:00:00,维修
136,BS-2016-SBA-CP2-10,2016-10-10,2016-10-13 00:00:00,维修
187,BS-2016-LAK-KM-03,2016-10-31,2016-11-02 00:00:00,维修
197,BS-2016-PKI-AD-08,2016-11-01,2016-12-15 00:00:00,维修
230,BS-2016-TMN-CDS-11,2016-11-11,2016-11-15 00:00:00,维修
264,BS-2016-SBA-CP2-09,2016-11-21,2016-11-25 00:00:00,维修
272,BS-2016-PKI-AD-10,2016-11-21,2016-11-23 00:00:00,维修
282,BS-2016-PKI-AD-08,2016-11-24,2016-12-15 00:00:00,维修


可以看到很多已使用年限信息<0的样本  
一大部分是装机/移机的样本，这部分是可以直接改为0的  
其他的一部分，一部分是填写错误

In [152]:
data.loc[data['已使用年限']<0, '已使用年限'] = 0

##### 同类仪器对应使用年限时的平均维修次数

由于是维修次数，所以需要把维修以外的样本剃了

In [172]:
data_repair = data.loc[data['维修服务内容'] == '维修', :]

In [183]:
data_repair['已使用年限_round'] = data_repair['已使用年限'].apply(lambda x: round(x))
instrument_count = instrument_uniq['设备型号'].value_counts().to_dict()
instrument_average_service = {}
for instrument in instrument_count.keys():
    instrument_average_service[instrument] = (data_repair.loc[data_repair['设备型号'] == instrument, '已使用年限_round'].value_counts()/instrument_count[instrument]).apply(lambda x: round(x, 3))
average_service_stats = pd.concat(instrument_average_service,axis = 1)
# average_service_stats.fillna('', inplace = True)

In [184]:
average_service_stats.loc['仪器台数'] = instrument_uniq['设备型号'].value_counts().to_dict()

In [185]:
pd.options.display.max_columns = 200
average_service_stats = average_service_stats.T.sort_values(by = '仪器台数', ascending = False).T

In [186]:
average_service_stats

Unnamed: 0,1235-514,1420-020,GSL-120/GSL-10,2081-0010,CDS-5,1235-5220,2021-0010,TQD,GCS3000DX2,Capillarys 2 FP,CaptureStation,1235-501,6000-0010,Xevo TQD,luminex 200,Quattro Micro,1296-026,1296-003,KM1,GCS3000Dx2,1420-012,GSL-120,KM2,00-0335,UPLC,1296-0010,LX200,Nextseq 550AR,6000,5014-0020,CaptureStation(FISH),GSL-10,BGI SEQ-500,MB4,MB8/MB4,SLAN-96S,TQS
0,0.119,0.014,0.07,0.327,0.352,1.375,1.239,0.044,0.524,0.425,0.03,,,0.76,0.15,0.421,,,2.714,,0.2,9.2,0.5,,2.667,,2.0,,5.0,,,,,,,,
1,0.319,0.054,0.169,0.558,0.22,1.097,1.873,0.794,0.825,0.675,0.121,0.25,0.5,2.08,0.25,0.368,0.364,0.143,0.571,0.167,0.4,17.2,0.5,2.75,1.333,1.0,1.0,,4.5,1.0,1.0,,,,,,
2,0.781,0.074,0.317,0.644,0.044,0.278,1.507,1.0,0.714,0.45,0.061,0.071,0.231,0.28,0.15,0.211,0.818,0.571,,0.167,,17.2,0.5,2.25,2.0,2.333,3.0,,3.5,,,,,,,,
3,1.031,0.014,0.218,0.567,0.132,0.069,0.408,0.868,0.794,0.7,0.03,1.607,0.115,0.04,0.1,0.053,0.182,,0.429,1.0,,14.0,0.5,0.75,0.333,,0.5,,,,,,,,,,
4,1.144,0.02,0.148,0.346,0.22,,0.127,0.603,0.54,0.65,,1.679,,,0.15,0.368,0.727,,0.571,0.5,,9.4,,,,,0.5,,6.5,,,,,,,,
5,0.988,0.014,0.099,0.106,0.011,,,0.382,0.222,0.35,,0.571,0.115,,0.2,0.526,0.364,,,,,8.6,,,,,,,10.0,,,,,,,,
6,0.65,0.034,0.134,,0.132,,,,0.079,0.025,0.03,0.143,0.077,,,0.474,0.182,,,0.333,,9.6,,,,,,,,,,,,,,,
7,0.588,0.027,0.134,,0.077,,,0.015,0.095,,0.061,0.143,,,,0.421,,,0.714,,,4.4,0.5,,,,,,,,,,,,,,
8,0.331,0.027,0.049,,0.022,,,,0.111,,0.03,,,,,,0.182,,,,,3.2,,,,0.667,,,,,,,,,,,
9,0.338,0.054,0.028,,0.055,,,,0.048,,0.061,,,,,0.105,,0.143,0.286,0.167,,1.2,,,,0.667,,,,,,,,,,,


从目前得到的结果可以看出几点：
1. 1235在相同年限上的维修次数明显要高于1420
2. 平均年限维修次数呈现先增后降的趋势，这可能与到一定年限就优先换新报废而不在维修维护有关
3. 个别仪器的样本较多，部分仪器较少，需要合并统计才能达到效果
4. 对于过于少的仪器，这类特征不采用趋势映射而直接做中位数插补

为了这张表完整可查，这边先进行中位数插补

In [187]:
average_service_stats.drop('仪器台数', axis = 0, inplace = True)

for col in average_service_stats.columns:
    average_service_stats[col] = average_service_stats[col].fillna(average_service_stats[col].median())

average_service_stats.fillna(0, inplace = True)

In [196]:
average_service_map = average_service_stats.to_dict()

average_service_map2 = {}
for k1 in average_service_map.keys():
    for k2 in average_service_map[k1].keys():
        average_service_map2['{}_{}'.format(k1, k2)] = average_service_map[k1][k2]

In [199]:
data['设备型号_已使用年限'] = data['设备型号'] + '_' + data['已使用年限_round'].astype(int).astype(str)

In [205]:
data['设备年平均维修次数'] = pd.to_numeric(data['设备型号_已使用年限'].map(average_service_map2), errors = 'coerce')

In [206]:
data.drop(['设备型号_已使用年限', '已使用年限_round'], axis = 1, inplace = True)