In [1]:
import pandas as pd
import numpy as np
import warnings
import jieba
import jieba.posseg as pseg
jieba.load_userdict('../source/words/problem.txt')
import re
from gensim.models import Word2Vec
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sqlalchemy import create_engine
import sys
sys.path.extend(['../src/'])
from config import *
config = Config()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\KENSHI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.831 seconds.
Prefix dict has been built succesfully.


### 数据读取

In [83]:
def data_sql_load():
    engine = create_engine('mysql+pymysql://%s:%s@%s:3306/%s?charset=utf8' %(config.sql_user, config.sql_password, config.sql_ip, config.sql_database),echo = False)
    sql = "select * from etl_data"
    data = pd.read_sql_query(sql, engine)
    return data

data = data_sql_load()

### 特征工程

由于每条样本都是一条单独的维修记录，不用把该仪器设备单独提出来做连续的时间线，只需要把每个时间点的仪器当成一台平行时空下的独立的仪器就好。

对于该条样本下，该仪器当前的状态构建以下特征：

1. 该仪器型号的平均使用年限：定义一台仪器淘汰所需要的平均年限，需要找工程师问有哪些仪器是已经报废的，报废时间及装机时间
2. 该仪器型号报废前的平均维修次数：可精确到各个部件各维修的次数
3. 该仪器最近一年的维修次数
4. 该仪器最近一年的维修次数占对应客户最近一年的总维修次数的比
5. 仪器对应客户最近一年的维修总次数
6. 仪器最近一年遭遇严重问题zp1的次数
7. 仪器最近一年遭遇zp1在总次数中的占比
8. 仪器距离上一次维修的时间间隔
9. 仪器距离上一次维护的时间间隔
10. 该仪器的已使用年限
11. 同类仪器对应使用年限时的平均维修次数
。。。

#### 在构造特征之前需要提取的信息

##### 每类型仪器各有多少台

In [11]:
instrument_uniq = data.drop_duplicates(subset = '设备编号', keep = 'last')

instrument_uniq['设备型号'].value_counts()

1235-514                160
1420-020                148
GSL-120/GSL-10          142
2081-0010               104
CDS-5                    91
1235-5220                72
2021-0010                71
TQD                      68
GCS3000DX2               63
Capillarys 2 FP          40
CaptureStation           33
1235-501                 28
6000-0010                26
Xevo TQD                 25
luminex 200              20
Quattro Micro            19
1296-026                 11
KM1                       7
1296-003                  7
GCS3000Dx2                6
1420-012                  5
GSL-120                   5
00-0335                   4
KM2                       4
1296-0010                 3
UPLC                      3
Nextseq 550AR             2
LX200                     2
CaptureStation(FISH)      2
6000                      2
5014-0020                 2
MB8/MB4                   1
TQS                       1
BGI SEQ-500               1
GSL-10                    1
MB4                 

#### 特征构造

由于把每一条记录都当做一台单独平行时空的仪器，所以需要将该样本维修时间之前的所有样本提出来当做原始数据  
最终要对每一条记录跑一个大循环来计算每一个特征

In [14]:
data['问题发现日期'] = pd.to_datetime(data['问题发现日期'], errors = 'coerce')
data = data.sort_values(by = '问题发现日期')
data = data.reset_index(drop = True)

##### 该仪器最近一年的维修次数

In [16]:
from tqdm import tqdm_notebook

def service_count_year(data):
    instrument_id = data.iloc[-1]['设备编号']
    cache_ins = data.loc[data['设备编号'] == instrument_id,:]    
    cache_ins['year'] = cache_ins['问题发现日期'].apply(lambda x: x.year)
    last_year = cache_ins.iloc[-1]['year']
    return cache_ins['year'].value_counts()[last_year]
service_count = []
for ix in tqdm_notebook(data.index):
    data_time = data.iloc[:ix+1].copy()
    service_count.append(service_count_year(data_time))
data['最近一年维修次数'] = service_count




##### 该客户最近一年的维修总次数

In [18]:
def client_service_count_year(data):
    client_id = data.iloc[-1]['客户名称']
    cache_ins = data.loc[data['客户名称'] == client_id,:]    
    cache_ins['year'] = cache_ins['问题发现日期'].apply(lambda x: x.year)
    last_year = cache_ins.iloc[-1]['year']
    return cache_ins['year'].value_counts()[last_year]
client_service_count = []
for ix in tqdm_notebook(data.index):
    data_time = data.iloc[:ix+1].copy()
    client_service_count.append(client_service_count_year(data_time))
data['最近一年该客户的维修总次数'] = client_service_count




##### 该仪器最近一年的维修次数占对应客户最近一年的总维修次数的比

In [20]:
data['年修占比'] = data['最近一年维修次数']/data['最近一年该客户的维修总次数']

##### 仪器距离上一次维修维护的时间间隔

In [21]:
data.loc[pd.to_datetime(data['装机日期'])< pd.to_datetime('1999-05-01'), '装机日期'] = '2005-06-28'	
data.loc[(data['装机日期'].isnull())&(data['装机年份'].notnull()),'装机日期'] = data.loc[(data['装机日期'].isnull())&(data['装机年份'].notnull()),'装机年份']
data.loc[data['装机日期'] == '20XX','装机日期'] = '2016'
data.loc[data['上次维修时间'].isnull(),'上次维修时间'] = data.loc[data['上次维修时间'].isnull(),'装机日期']
data.loc[data['上次维修时间'] == '20XX','上次维修时间'] = '2016'

##### 该仪器的已使用年限

In [22]:
days_cache = (pd.to_datetime(data['问题发现日期'], errors = 'coerce') - pd.to_datetime(data['装机日期'], errors = 'coerce')).apply(lambda x: x.days)
data['已使用年限'] = round(days_cache/365.25,2)

In [151]:
data.loc[data['已使用年限']<0,['设备编号','问题发现日期','装机日期','维修服务内容']]

Unnamed: 0,设备编号,问题发现日期,装机日期,维修服务内容
0,BS-2016-WTS-TQD-16,2016-06-30,2016-10-08 00:00:00,维修
105,BS-2016-TMN-CDS-10,2016-09-19,2016-09-22 00:00:00,维修
126,BS-2016-TMN-CDS-17,2016-09-26,2016-09-30 00:00:00,维修
136,BS-2016-SBA-CP2-10,2016-10-10,2016-10-13 00:00:00,维修
187,BS-2016-LAK-KM-03,2016-10-31,2016-11-02 00:00:00,维修
197,BS-2016-PKI-AD-08,2016-11-01,2016-12-15 00:00:00,维修
230,BS-2016-TMN-CDS-11,2016-11-11,2016-11-15 00:00:00,维修
264,BS-2016-SBA-CP2-09,2016-11-21,2016-11-25 00:00:00,维修
272,BS-2016-PKI-AD-10,2016-11-21,2016-11-23 00:00:00,维修
282,BS-2016-PKI-AD-08,2016-11-24,2016-12-15 00:00:00,维修


可以看到很多已使用年限信息<0的样本  
一大部分是装机/移机的样本，这部分是可以直接改为0的  
其他的一部分，一部分是填写错误

In [23]:
data.loc[data['已使用年限']<0, '已使用年限'] = 0

##### 同类仪器对应使用年限时的平均维修次数

由于是维修次数，所以需要把维修以外的样本剃了

In [38]:
engine = create_engine('mysql+pymysql://%s:%s@%s:3306/%s?charset=utf8' %(config.sql_user, config.sql_password, config.sql_ip, config.sql_database),echo = False)

instrument_uniq = data.drop_duplicates(subset = '设备编号', keep = 'last')
data_repair = data.loc[data['维修服务内容'] == '维修', :]
data_repair['已使用年限_round'] = data_repair['已使用年限'].apply(lambda x: round(x))
instrument_count = instrument_uniq['设备型号'].value_counts().to_dict()
instrument_average_service = {}
for instrument in instrument_count.keys():
    instrument_average_service[instrument] = (data_repair.loc[data_repair['设备型号'] == instrument, '已使用年限_round'].value_counts()/instrument_count[instrument]).apply(lambda x: round(x, 3))
average_service_stats = pd.concat(instrument_average_service,axis = 1)
average_service_stats.loc['仪器台数'] = instrument_uniq['设备型号'].value_counts().to_dict()
average_service_stats = average_service_stats.T.sort_values(by = '仪器台数', ascending = False).T
average_service_stats.to_sql('instrument_average_service', engine, if_exists='replace',index= False)

从目前得到的结果可以看出几点：
1. 1235在相同年限上的维修次数明显要高于1420
2. 平均年限维修次数呈现先增后降的趋势，这可能与到一定年限就优先换新报废而不在维修维护有关
3. 个别仪器的样本较多，部分仪器较少，需要合并统计才能达到效果
4. 对于过于少的仪器，这类特征不采用趋势映射而直接做中位数插补

为了这张表完整可查，这边先进行中位数插补

In [39]:
data['已使用年限_round'] = data['已使用年限'].apply(lambda x: round(x))

average_service_stats.drop('仪器台数', axis = 0, inplace = True)
for col in average_service_stats.columns:
    average_service_stats[col] = average_service_stats[col].fillna(average_service_stats[col].median())
average_service_stats.fillna(0, inplace = True)
average_service_map = average_service_stats.to_dict()
average_service_map2 = {}
for k1 in average_service_map.keys():
    for k2 in average_service_map[k1].keys():
        average_service_map2['{}_{}'.format(k1, k2)] = average_service_map[k1][k2]

data['设备型号_已使用年限'] = data['设备型号'] + '_' + data['已使用年限_round'].astype(int).astype(str)
data['设备年平均维修次数'] = pd.to_numeric(data['设备型号_已使用年限'].map(average_service_map2), errors = 'coerce')	
data.drop(['设备型号_已使用年限', '已使用年限_round'], axis = 1, inplace = True)

#### 保存到sql

In [48]:
datetime_columns = ['上次维修时间','装机日期']
for col in datetime_columns:
    data[col] = pd.to_datetime(data[col], errors = 'coerce')
data.to_sql('fe_data1', engine, if_exists='replace',index= False)

#### 文本挖掘，重要问题zq1

1. 定义一个函数可以分词出单仪器型号的所有实体名词
2. 圈出其中的问题部分定义为zq1保存在文件problem.txt中

In [2]:
def data_sql_load():
    engine = create_engine('mysql+pymysql://%s:%s@%s:3306/%s?charset=utf8' %(config.sql_user, config.sql_password, config.sql_ip, config.sql_database),echo = False)
    sql = "select * from fe_data1"
    data = pd.read_sql_query(sql, engine)
    return data

data = data_sql_load()

In [4]:
import jieba
import jieba.posseg as pseg
jieba.load_userdict('../source/words/problem.txt')
# 创建停用词list  
def stopwordslist(filepath):  
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]  
    return stopwords    
  
# 对句子进行分词  
def seg_sentence(sentence):  
    sentence_seged = jieba.cut(sentence.strip(), HMM=True, cut_all=False)  
#     sentence_seged = jieba.cut_for_search(sentence.strip(), HMM=True)
    stopwords = stopwordslist('../source/words/中文停用词表.txt')  # 这里加载停用词的路径
    outstr = ''  
    for word in sentence_seged:  
        if word not in stopwords:  
            if word != '\t':  
                outstr += word  
                outstr += " "  
    return outstr

##### 文本合并

In [95]:
def text_clean(content):
    try:
        result = re.sub(r'[^\u4e00-\u9fa5,A-Za-z0-9]', " ",content)
    except:
        result = ''
    return result

data['问题汇总'].fillna('', inplace = True)
data['问题汇总_jieba'] = data['问题汇总'].apply(seg_sentence)

data['解决方案汇总'].fillna('', inplace = True)
data['解决方案汇总_jieba'] = data['解决方案汇总'].apply(seg_sentence)

data['jieba'] = (data['问题汇总_jieba'] + data['解决方案汇总_jieba']).apply(text_clean)

##### 训练词向量

In [40]:
from gensim.models import word2vec
import gensim
import logging

# 此函数作用是对初始语料进行分词处理后，作为训练模型的语料
def cut_txt(corpus):
    jieba.load_userdict(config.userdict_file)
    content = " ".join(corpus)
    new_text = jieba.cut(content, HMM=True, cut_all=False)
    str_out = ' '.join(new_text).replace('，', '').replace('。', '').replace('？', '').replace('！', '') \
        .replace('“', '').replace('”', '').replace('：', '').replace('…', '').replace('（', '').replace('）', '') \
        .replace('—', '').replace('《', '').replace('》', '').replace('、', '').replace('‘', '') \
        .replace('’', '').replace('１', '').replace('.', '').replace('/', '')     # 去掉标点符号
    fo = open(config.seg_senquence_file, 'w', encoding='utf-8')
    fo.write(str_out)

def w2v_model_train_save(corpus_file, embed_size = config.embed_size, window_size = config.window_size):  # model_file_name为训练语料的路径,save_model为保存模型名
    # 模型训练，生成词向量
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    sentences = word2vec.Text8Corpus(corpus_file)  # 加载语料
    model = Word2Vec(sentences, size=embed_size, window=window_size, min_count=1,negative=3,
                     sg=1,sample=0.001,hs=1,workers=4,iter= 5)  # 训练skip-gram模型; 默认window=5
    model.save('{}word2vec.{}d.wsize{}.model'.format(config.savemodel_path, embed_size, window_size))
    model.wv.save_word2vec_format('{}word2vec_format.bin'.format(config.savemodel_path), binary=True)   # 以二进制类型保存模型以便重用

def word2vec_train(corpus):
    cut_txt(corpus)
    w2v_model_train_save(config.seg_senquence_file)

def word2vec_load():    
    return gensim.models.KeyedVectors.load(config.w2v_file)

def w2v_transform(corpus_series,word2vec_model):
    def transform_word_vector_mean(word_lists):
        re=np.zeros((len(word_lists),config.embed_size))
        for i,word in enumerate(word_lists):
            if word  in word2vec_model.wv.vocab:
                re[i]=word2vec_model[word]
        return re.mean(axis=0)
    idx = np.arange(len(corpus_series))
    w2v_col = [f'w2v_{i}' for i in range(config.embed_size)]
    q1=np.zeros((len(idx),config.embed_size))
    for i,j in enumerate(idx):        
        q1[i]=transform_word_vector_mean(corpus_series[j])
    output = pd.DataFrame(q1)
    output.columns = w2v_col
    return output

##### 特征“仪器问题程度”模型建立  

主要的问题在于标签的设定，即哪些情况可以定义这次问题的紧急程度：  
1. 观察紧急程度和维修服务内容可以看到，由于维修服务内容有很多标记为“待确认”的样本，为了减少标记工作量，最好是将这部分也能使用模型标记，而且维修的样本中有2520条是“未知”，这些都是需要能够分出来使用的，所以需要做半监督。
2. 因为这个特征是为了反映维修问题的严重程度，所以有关巡检，性能验证，装机移机都不在考虑范围内，可全部修改标记为非严重仪器问题

In [84]:
pd.DataFrame(data['紧急程度'].groupby(data['维修服务内容']).value_counts())

Unnamed: 0_level_0,Unnamed: 1_level_0,紧急程度
维修服务内容,紧急程度,Unnamed: 2_level_1
其他,正常,11
其他,未知,10
其他,紧急,4
其他,无要求,3
其他,重要,3
巡检,无要求,113
巡检,正常,94
巡检,紧急,79
巡检,未知,33
巡检,重要,21


In [41]:
cut_txt(data['jieba'])
word2vec_train(data['jieba'])
w2v_model = word2vec_load()

dtrain = data.loc[data['维修服务内容'] == '维修',:].reset_index(drop = True)
dtrain = dtrain.loc[dtrain['紧急程度'] != '未知',:].reset_index(drop = True)
dtrain.loc[dtrain['紧急程度'].isin(['紧急','重要']),'仪器问题程度'] = 1
dtrain.loc[dtrain['紧急程度'].isin(['正常','无要求']),'仪器问题程度'] = 0
w2v_train = w2v_transform(dtrain['jieba'], w2v_model)
y = dtrain['仪器问题程度']

2019-04-16 19:06:44,389 : INFO : collecting all words and their counts
2019-04-16 19:06:44,392 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-04-16 19:06:44,502 : INFO : collected 13129 word types from a corpus of 396785 raw words and 40 sentences
2019-04-16 19:06:44,503 : INFO : Loading a fresh vocabulary
2019-04-16 19:06:44,524 : INFO : effective_min_count=1 retains 13129 unique words (100% of original 13129, drops 0)
2019-04-16 19:06:44,525 : INFO : effective_min_count=1 leaves 396785 word corpus (100% of original 396785, drops 0)
2019-04-16 19:06:44,561 : INFO : deleting the raw counts dictionary of 13129 items
2019-04-16 19:06:44,564 : INFO : sample=0.001 downsamples 49 most-common words
2019-04-16 19:06:44,565 : INFO : downsampling leaves estimated 346401 word corpus (87.3% of prior 396785)
2019-04-16 19:06:44,576 : INFO : constructing a huffman tree from 13129 words
2019-04-16 19:06:44,848 : INFO : built huffman tree with maximum node depth 19
20

##### lgb调参

In [13]:
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

def lgbBO_tuning(X, y, random_state = 2019, num_iter = 25, init_points = 10):    
    def lgb_evaluate(num_leaves, subsample, colsample_bytree, min_child_samples, reg_alpha, reg_lambda):
        train = lgb.Dataset(X, label=y)
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'auc',
            'min_child_samples': int(min_child_samples),
            'num_leaves': int(num_leaves),
            'reg_alpha': max(reg_alpha,0),
            'reg_lambda': max(reg_lambda,0),
            'subsample': max(min(subsample, 1), 0),
            'colsample_bytree': max(min(colsample_bytree, 1), 0) ,
            'learning_rate': 0.05,
            'seed': 2019,
            'nthread': -1,}
        cv_result = lgb.cv(params, train, nfold = 5, seed= 2019, stratified=True, verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])
    lgbBO = BayesianOptimization(lgb_evaluate, {'num_leaves': (30, 120), 
                                                'subsample': (0.5, 1),
                                                'colsample_bytree': (0.5, 1),
                                                'min_child_samples': (1,100), 
                                                'reg_alpha': (0,10),
                                                'reg_lambda': (0,10)
                                                })
    lgbBO.maximize(init_points=init_points, n_iter=num_iter)
    return lgbBO

lgbBO = lgbBO_tuning(w2v_train, y)

[31mInitialization[0m
[94m----------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   min_child_samples |   num_leaves |   reg_alpha |   reg_lambda |   subsample | 
    1 | 00m02s | [35m   0.82932[0m | [32m            0.8168[0m | [32m            35.3300[0m | [32m     71.9720[0m | [32m     8.3887[0m | [32m      6.7613[0m | [32m     0.9824[0m | 
    2 | 00m06s | [35m   0.87693[0m | [32m            0.7386[0m | [32m             9.7745[0m | [32m    101.2738[0m | [32m     3.0819[0m | [32m      8.0824[0m | [32m     0.6801[0m | 
    3 | 00m01s |    0.81608 |             0.7654 |             70.5154 |      68.7731 |      6.4497 |       5.9827 |      0.7845 | 
    4 | 00m01s |    0.78141 |             0.9852 |             92.8865 |     104.6281 |      8.1311 |       9.5898 |      0.9009 | 
    5 | 00m02s |    0.82277 |             0.9024 |

  " state: %s" % convergence_dict)


   23 | 00m18s |    0.81983 |             0.6305 |             99.5408 |     117.3462 |      0.2894 |       0.3152 |      0.7844 | 
   24 | 00m22s |    0.85183 |             0.5727 |             43.4755 |      31.7113 |      0.0436 |       9.9404 |      0.7099 | 
   25 | 00m21s |    0.85763 |             0.5739 |              3.0591 |      30.9581 |      9.9053 |       0.2397 |      0.5399 | 


  " state: %s" % convergence_dict)


   26 | 00m22s |    0.87350 |             0.6112 |             45.8532 |     119.6562 |      0.0386 |       0.8531 |      0.5813 | 
   27 | 00m28s |    0.90552 |             0.6207 |             18.2559 |     109.8768 |      0.1426 |       0.0888 |      0.6220 | 
   28 | 00m29s |    0.90123 |             0.5388 |              8.7927 |      43.7234 |      0.1191 |       2.3421 |      0.6025 | 
   29 | 00m31s |    0.91349 |             0.5559 |             10.8342 |     100.5293 |      0.0689 |       0.7375 |      0.9512 | 


  " state: %s" % convergence_dict)


   30 | 00m36s | [35m   0.91916[0m | [32m            0.5000[0m | [32m             7.3480[0m | [32m     96.0936[0m | [32m     0.0000[0m | [32m      0.0000[0m | [32m     0.5000[0m | 
   31 | 00m30s |    0.90536 |             0.5092 |             15.7566 |      99.3862 |      0.3115 |       1.0999 |      0.6487 | 


  " state: %s" % convergence_dict)


   32 | 00m29s |    0.84465 |             0.5000 |             74.9302 |      90.4488 |      0.0000 |       0.0000 |      0.9557 | 


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


   33 | 00m26s |    0.78542 |             0.6838 |             99.1004 |      42.7729 |      9.7255 |       0.2141 |      0.6859 | 
   34 | 00m29s |    0.87994 |             0.6121 |             37.5346 |      49.3567 |      0.0148 |       0.8221 |      0.7801 | 
   35 | 00m31s |    0.87114 |             0.7917 |             28.1715 |     119.9222 |      0.0341 |       9.8474 |      0.7968 | 


  " state: %s" % convergence_dict)


##### 模型训练

In [44]:
max_params = lgbBO.res['max']['max_params']

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

def evalMetric(preds,dtrain):
    label = dtrain.get_label()
    pre = pd.DataFrame({'preds':preds,'label':label})
    auc = roc_auc_score(pre.label,pre.preds)
    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)
    f1 = f1_score(pre.label,pre.preds)
    res = 0.6*auc +0.4*f1
    return 'res',res,True

def lgb_w2v_train(X, Y, validation_size = 0.3):
    X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size = validation_size, random_state= 2019, stratify = y)
    train = lgb.Dataset(X_train, label=Y_train)
    valid = train.create_valid(X_valid, label=Y_valid)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'logloss',
        'min_child_samples': int(max_params['min_child_samples']),
        'num_leaves': int(max_params['num_leaves']),
        'reg_alpha': max_params['reg_alpha'],
        'reg_lambda': max_params['reg_lambda'],
        'subsample': max_params['subsample'],
        'colsample_bytree': max_params['colsample_bytree'],
        'learning_rate': 0.02,
        'seed': 2019,
        'nthread': -1
    }
    
    num_round = 20000
    gbm = lgb.train(params, train, num_round, verbose_eval=100,valid_sets=[valid], early_stopping_rounds = 1000, feval=evalMetric)
    return gbm

lgb_model = lgb_w2v_train(w2v_train, y)

Training until validation scores don't improve for 1000 rounds.
[100]	valid_0's res: 0.875786
[200]	valid_0's res: 0.875366
[300]	valid_0's res: 0.87779
[400]	valid_0's res: 0.881807
[500]	valid_0's res: 0.882756
[600]	valid_0's res: 0.883368
[700]	valid_0's res: 0.884234
[800]	valid_0's res: 0.884036
[900]	valid_0's res: 0.883876
[1000]	valid_0's res: 0.88332
[1100]	valid_0's res: 0.885563
[1200]	valid_0's res: 0.885396
[1300]	valid_0's res: 0.885734
[1400]	valid_0's res: 0.884334
[1500]	valid_0's res: 0.884008
[1600]	valid_0's res: 0.883438
[1700]	valid_0's res: 0.883425
[1800]	valid_0's res: 0.883232
[1900]	valid_0's res: 0.882307
[2000]	valid_0's res: 0.882191
[2100]	valid_0's res: 0.882555
[2200]	valid_0's res: 0.88271
[2300]	valid_0's res: 0.883127
Early stopping, best iteration is:
[1358]	valid_0's res: 0.885863


##### 错义分析

##### 半监督过程

In [69]:
data_semi = data.loc[(data['维修服务内容'] == '维修')&(data['紧急程度'] == '未知'),:].reset_index(drop = True)
w2v_semi = w2v_transform(data_semi['jieba'], w2v_model)
data_semi['preds'] = lgb_model.predict(w2v_semi)

semi_concat = data_semi.loc[(data_semi['preds'] > 0.9)|(data_semi['preds'] < 0.1),:].reset_index(drop = True)
data_semi = data_semi.loc[(data_semi['preds'] <= 0.9)&(data_semi['preds'] >= 0.1),:].reset_index(drop = True)

dtrain_ = pd.concat((dtrain, semi_concat), axis = 0).reset_index(drop = True)

dtrain_.loc[dtrain_['preds']<0.1, '仪器问题程度'] = 0
dtrain_.loc[dtrain_['preds']>0.9, '仪器问题程度'] = 1

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [70]:
w2v_train = w2v_transform(dtrain_['jieba'], w2v_model)
y = dtrain_['仪器问题程度']
lgb_model = lgb_w2v_train(w2v_train, y)



Training until validation scores don't improve for 1000 rounds.
[100]	valid_0's res: 0.928001
[200]	valid_0's res: 0.935073
[300]	valid_0's res: 0.937561
[400]	valid_0's res: 0.937919
[500]	valid_0's res: 0.938478
[600]	valid_0's res: 0.938989
[700]	valid_0's res: 0.939783
[800]	valid_0's res: 0.939601
[900]	valid_0's res: 0.939616
[1000]	valid_0's res: 0.939451
[1100]	valid_0's res: 0.939272
[1200]	valid_0's res: 0.939177
[1300]	valid_0's res: 0.938768
[1400]	valid_0's res: 0.938412
[1500]	valid_0's res: 0.938286
[1600]	valid_0's res: 0.938242
[1700]	valid_0's res: 0.937737
Early stopping, best iteration is:
[721]	valid_0's res: 0.939929


In [71]:
w2v_semi = w2v_transform(data_semi['jieba'], w2v_model)
data_semi['preds'] = lgb_model.predict(w2v_semi)



In [76]:
semi_concat = data_semi.loc[(data_semi['preds'] > 0.9)|(data_semi['preds'] < 0.1),:].reset_index(drop = True)
data_semi = data_semi.loc[(data_semi['preds'] <= 0.9)&(data_semi['preds'] > 0.1),:].reset_index(drop = True)

dtrain_ = pd.concat((dtrain_, semi_concat), axis = 0).reset_index(drop = True)
dtrain_.loc[dtrain_['preds']<0.1, '仪器问题程度'] = 0
dtrain_.loc[dtrain_['preds']>0.9, '仪器问题程度'] = 1

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [77]:
w2v_train = w2v_transform(dtrain_['jieba'], w2v_model)
y = dtrain_['仪器问题程度']
lgb_model = lgb_w2v_train(w2v_train, y)



Training until validation scores don't improve for 1000 rounds.
[100]	valid_0's res: 0.932966
[200]	valid_0's res: 0.937345
[300]	valid_0's res: 0.937405
[400]	valid_0's res: 0.937329
[500]	valid_0's res: 0.938228
[600]	valid_0's res: 0.939289
[700]	valid_0's res: 0.939218
[800]	valid_0's res: 0.939166
[900]	valid_0's res: 0.939241
[1000]	valid_0's res: 0.938729
[1100]	valid_0's res: 0.93854
[1200]	valid_0's res: 0.93856
[1300]	valid_0's res: 0.9385
[1400]	valid_0's res: 0.938249
[1500]	valid_0's res: 0.938364
[1600]	valid_0's res: 0.938412
[1700]	valid_0's res: 0.938256
Early stopping, best iteration is:
[720]	valid_0's res: 0.93951


In [102]:
max_params

{'colsample_bytree': 0.5,
 'min_child_samples': 7.347950100676255,
 'num_leaves': 96.09357631764188,
 'reg_alpha': 1.5571088120843582e-06,
 'reg_lambda': 1.2860202034063392e-07,
 'subsample': 0.5}

In [101]:
lgb_model.save_model('{}lgb_jinji_w2v.model'.format(config.savemodel_path))

In [96]:
w2v_data = w2v_transform(data['jieba'], w2v_model)
data['紧急程度指数'] = lgb_model.predict(w2v_data)



In [97]:
import matplotlib.pyplot as plt
import seaborn as sns

data['紧急分层'] = pd.cut(data['紧急程度指数'], 5, labels=['1','2', '3', '4', '5'])

In [98]:
data['紧急分层'].value_counts()

5    4931
1    1674
4     116
2      99
3      84
Name: 紧急分层, dtype: int64

In [100]:
engine = create_engine('mysql+pymysql://%s:%s@%s:3306/%s?charset=utf8' %(config.sql_user, config.sql_password, config.sql_ip, config.sql_database),echo = False)
data.to_sql('fe_data2', engine, if_exists='replace',index= False)