In [1]:
import warnings
warnings.simplefilter('ignore')
import gc
import re
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_rows = 500
pd.options.display.max_colwidth = 200
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from gensim.models.word2vec import Word2Vec
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from catboost import CatBoostClassifier

from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# loading data

In [2]:
# Log message

sel_data = pd.read_csv('../data/preliminary_sel_log_all.csv') #x训练的日志文件
sel_data2 = pd.read_csv('../tcdata/final_sel_log_dataset_b.csv')  # 提交的日志文件
sel_data = pd.concat([sel_data, sel_data2])
sel_data['time'] = pd.to_datetime(sel_data['time'])
sel_data.sort_values(by=['sn', 'time'], inplace=True)
sel_data.reset_index(drop=True, inplace=True)

In [3]:
# train set

train_data = pd.read_csv("../data/preliminary_train_label_dataset_all.csv")

In [4]:
#additional training set

train_data_a = pd.read_csv("../data/preliminary_crashdump_dataset.csv")
train_data_a = pd.concat((train_data_a, pd.read_csv('../tcdata/final_crashdump_dataset_b.csv')))
train_data_a['label'] = train_data_a['fault_code'].apply(lambda x:0 if x.split('.')[0] == 'cpu0' else 1)
train_data = pd.concat((train_data, train_data_a.loc[:, ['sn', 'fault_time', 'label']]))
train_data = train_data.drop_duplicates()
train_data['fault_time'] = pd.to_datetime(train_data['fault_time'])
train_data.sort_values(by=['sn', 'fault_time'], inplace=True)
train_data.reset_index(drop=True, inplace=True)

In [5]:
test_data = pd.read_csv('../tcdata/final_submit_dataset_b.csv').loc[:, ['sn', 'fault_time']]
test_data['fault_time'] = pd.to_datetime(test_data['fault_time'])

In [47]:
# 20分钟
sel_data = sel_data.copy()    # 制作特征词向量
for name,row in tqdm(sel_data.iterrows()):
    s = row["msg"]
    msg_lsit =s.split()  
    if "OEM record ef" in s:  # 删除含有 OME record efd 列
        sel_data.drop(index=[name],inplace=True)   
    if msg_lsit[0].lower() == "unknown" and re.search("#0.+",msg_lsit[1].lower()):
        sel_data.drop(index=[name],inplace=True)

1697746it [28:08, 1005.72it/s] 


# w2v model

In [6]:
def FirstFilter(ele):
    s = re.sub('(#0x..\s)|(/\s)','',ele)
    s = re.sub('cpu.*_','cpu_',s)
    s = re.sub('(\|\s)','',s)
    s = re.sub('(\d+)','',s)
    s = re.sub('(_{2,})','_',s)
    s = re.sub('锛�','', s)
    s = re.sub('锟絋',' ', s)
    s = re.sub('锟�','', s)
    # s = re.sub('\|',',', s)
    # s = re.sub('s4/s5:', 'ss_one_state', s)
    # s = re.sub('s0/g0:', 'sg_one_state', s)
    # s = re.sub('s5/g2:', 'sg_two_state', s)
    s = re.sub('s4/s5:', 'ss', s)
    s = re.sub('s0/g0:', 'sg', s)
    s = re.sub('s5/g2:', 'sg', s)
    s = re.sub('aa17.{22}','', s)
    # s = re.sub('000000','a Special tags', s)
    s = re.sub('\d{4}\w\d\w\d{5}','Asserted oem record', s)
    return s

In [7]:
tmp = sel_data.groupby(['sn'], as_index=False)['msg'].agg(list)
tmp['text'] = tmp['msg'].apply(lambda x: ("\n".join([i for i in x])).lower())

In [8]:
tmp['text_1'] = tmp['text'].apply(FirstFilter)

In [9]:
sentences_list = tmp['text_1'].values.tolist()
sentences = list()
for s in sentences_list:
    sentences.append([w for w in s.split()])

In [15]:
w2v_model = Word2Vec(sentences, vector_size=32, window=3, min_count=5, sg=0, hs=1, seed=2022)

In [16]:
def get_w2v_mean(sentences):
    emb_matrix = list()
    vec = list()
    for w in sentences.split():
        if w in w2v_model.wv:
            vec.append(w2v_model.wv[w])
    if len(vec) > 0:
        emb_matrix.append(np.mean(vec, axis=0))
    else:
        emb_matrix.append([0] * w2v_model.vector_size)
    return emb_matrix

# tf-idf model

In [17]:
X = list(tmp['text'].values)
tfv = TfidfVectorizer(ngram_range=(1,3), min_df=5, max_features=50000)
tfv.fit(X)

TfidfVectorizer(max_features=50000, min_df=5, ngram_range=(1, 3))

In [18]:
X_tfidf = tfv.transform(X)
svd = TruncatedSVD(n_components=16)
svd.fit(X_tfidf)

TruncatedSVD(n_components=16)

In [56]:
def get_tfidf_svd(sentences, n_components=16):
    X_tfidf = tfv.transform(sentences)
    X_svd = svd.transform(X_tfidf)
    return np.mean(X_svd, axis=0)

# other features

In [57]:
sel_data['time_ts'] = sel_data["time"].values.astype(np.int64) // 10 ** 9
train_data['fault_time_ts'] = train_data["fault_time"].values.astype(np.int64) // 10 ** 9

In [58]:
def safe_split(strs, n, sep='|'):
    str_li = strs.split(sep)
    if len(str_li) >= n + 1:
        return str_li[n]
    else:
        return ''

sel_data['msg_split_0'] = sel_data['msg'].apply(lambda x: safe_split(x, 0))
sel_data['msg_split_1'] = sel_data['msg'].apply(lambda x: safe_split(x, 1))
sel_data['msg_split_2'] = sel_data['msg'].apply(lambda x: safe_split(x, 2))
sel_data['category'] = sel_data['msg'].apply(lambda x: x.split()[0])

In [59]:
cate_map = {
    'Memory': 0,
    'System': 1,
    'Processor': 2,
    'Temperature': 3,
    'Drive': 4,
    'Power': 5,
    'Unknown': 6,
    'Microcontroller': 7,
    'OS': 8,
    'Watchdog2': 9,
    'OEM': 10,
    'Button': 11,
    'Slot/Connector': 12,
    'Microcontroller/Coprocessor': 13,
    'Management': 14,
    'Event': 15,
    'Watchdog': 16,
    'Slot': 17,
    'Fan': 18,
    'Critical': 19,
    'device': 20,
    'LAN': 21,
    'Version': 22,
    'Add-in': 23,
    'Terminator': 24,
    'Chassis': 25,
    'reserved': 26,
    'Physical': 27,
    'Session': 28,
    'Reserved': 29,
    'Cable/Interconnect': 30,
    'Cable': 31,
    'Chip': 32,
    'Battery': 33
}

In [60]:
for i in tqdm(cate_map):
    sel_data[f'{i}_counts'] = sel_data['msg'].apply(lambda x: len(re.findall(i, x)))

100%|██████████████████████████████████████████████████████████████████████████████████| 34/34 [00:39<00:00,  1.17s/it]


# make dataset

In [61]:
def make_dataset(dataset, data_type='train'):
    ret = list()

    for idx, row in tqdm(dataset.iterrows()):
        sn = row['sn']
        fault_time = row['fault_time']
        
        # ts = row['fault_time_ts']
        # 选取数据
        days = 1  # 一天
        hours = 1
        tail_nums = 40   #数据量
        LimitedTime = fault_time- pd.Timedelta(days=days)  # 计算前一天的日期
        LimitedTime2 = fault_time + pd.Timedelta(hours=hours)  # 计算后一小时的日期
        df = sel_data[(sel_data['sn'] == sn)&(sel_data["time"] > LimitedTime)&(sel_data["time"] < LimitedTime2)].copy()

        if data_type == 'train':
            label = row['label']

        # df = sel_data[sel_data['sn'] == sn].copy()
        # df = df[df['time_ts'] <= ts].copy()
        df = df.sort_values(by='time_ts').reset_index(drop=True)
        df = df.tail(tail_nums).copy()        # TODO: could change last 40 logs here

        # make some features

        logs_count = len(df)

        if logs_count > 0:
            msg_nunique = df['msg'].nunique()   # 统计不同的量
            msg_category_nunique = df['category'].nunique()
            msg_split_0_nunique = df['msg_split_0'].nunique()
            msg_split_1_nunique = df['msg_split_1'].nunique()
            msg_split_2_nunique = df['msg_split_2'].nunique()
            last_category = df['category'].value_counts().index[0]
            last_category = cate_map[last_category] if last_category in cate_map else len(cate_map)
            s = df['time_ts'].values
            if len(s) > 0:
                seconds_span = s[-1] - s[0] 
            else:
                seconds_span = 0

            df['time_ts_shift_1'] = df['time_ts'].shift(1)
            df['time_ts_diffs_1'] = df['time_ts'] - df['time_ts_shift_1']
            s = df['time_ts_diffs_1'].values
            if len(s) > 1:
                log_time_diffs_avg = np.mean(s[1:])
                log_time_diffs_max = np.max(s[1:])
                log_time_diffs_min = np.min(s[1:])
                log_time_diffs_std = np.std(s[1:])
            else:
                try:
                    log_time_diffs_avg = log_time_diffs_max = log_time_diffs_min = s[0]
                    log_time_diffs_std = 0
                except:
                    log_time_diffs_avg = log_time_diffs_max = log_time_diffs_min = log_time_diffs_std = 0

            all_msg = "\n".join(df['msg'].values.tolist()).lower()
            new_msg = FirstFilter(all_msg)
            w2v_emb = get_w2v_mean(new_msg)[0]
            tfv_emb = get_tfidf_svd([s.lower() for s in df['msg'].values.tolist()])

        else:
#             continue
            logs_count = 0
            msg_nunique = 0
            msg_category_nunique = 0
            msg_split_0_nunique = 0
            msg_split_1_nunique = 0
            msg_split_2_nunique = 0
            last_category = 0
            seconds_span = 0
            log_time_diffs_avg = 0
            log_time_diffs_max = 0
            log_time_diffs_min = 0
            log_time_diffs_std = 0
            w2v_emb = [0] * 32
            tfv_emb = [0] * 16


        # format dataset
        data = {
            'sn': sn,
            'fault_time': fault_time,
            'logs_count': logs_count,
            'msg_nunique': msg_nunique,
            'msg_category_nunique': msg_category_nunique,
            'msg_split_0_nunique': msg_split_0_nunique,
            'msg_split_1_nunique': msg_split_1_nunique,
            'msg_split_2_nunique': msg_split_2_nunique,
            'last_category': last_category,
            'seconds_span': seconds_span,
            'log_time_diffs_avg': log_time_diffs_avg,
            'log_time_diffs_max': log_time_diffs_max,
            'log_time_diffs_min': log_time_diffs_min,
            'log_time_diffs_std': log_time_diffs_std,
        }

        for i in range(32):
            data[f'msg_w2v_{i}'] = w2v_emb[i]
        for i in range(16):
            data[f'msg_tfv_{i}'] = tfv_emb[i]
        for i in cate_map:
            data[f'{i}_counts'] = df[f'{i}_counts'].sum()
            
        if data_type == 'train':
            data['label'] = label
        ret.append(data)
        
    return ret

In [62]:
# 大概需要半个小时
train = make_dataset(train_data, data_type='train')
df_train_0 = pd.DataFrame(train)

16869it [21:42, 12.96it/s]


In [63]:
# 制作测试集的特征

test_data['fault_time_ts'] = test_data["fault_time"].values.astype(np.int64) // 10 ** 9

test = make_dataset(test_data, data_type='test')

df_test_0 = pd.DataFrame(test)

12260it [15:52, 12.88it/s]


In [135]:
print(df_train_0.shape)
print(df_test_0.shape)

(16669, 97)
(12260, 96)


# srever model

In [158]:
temp = pd.read_csv("../tcdata/final_sel_log_dataset_b.csv").set_index('sn')
def func(x):
    if type(temp.loc[x, 'server_model']) == str:  
        return temp.loc[x, 'server_model']     
    else:
        return temp.loc[x, 'server_model'][0]

test_servel_model = pd.read_csv("../tcdata/final_submit_dataset_b.csv")
test_servel_model['0'] = test_servel_model['sn'].apply(func)

In [159]:
test_servel_model = test_servel_model.loc[:, ['sn', '0']]

In [160]:
train_servel_model = pd.read_csv("../data/preliminary_train_ServerModel.csv")
# test_servel_model = pd.read_csv("../data/preliminary_submit_dataset_b_ServerModel.csv")
print(train_servel_model.shape)
print(test_servel_model.shape)

(16669, 2)
(12260, 2)


In [161]:
# 添加servermodel 作为特征
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(train_servel_model["0"].unique())
NewTrainServerModel = np.array(le.transform(train_servel_model["0"]))
train_servel_model_all =pd.concat([train_servel_model,pd.DataFrame(NewTrainServerModel,columns=["sm_num"])],axis=1)
NewTestServerModel = np.array(le.transform(test_servel_model["0"]))
test_servel_model_all =pd.concat([test_servel_model,pd.DataFrame(NewTestServerModel,columns=["sm_num"])],axis=1)
# NewTestServerModel = np.array(le.transform(test_servel_model["0"])).reshape(-1,1)

In [162]:
train_servel_model_all.drop_duplicates(subset=['sn'],inplace=True)
test_servel_model_all.drop_duplicates(subset=['sn'],inplace=True)

In [163]:
df_train_1 = df_train_0.merge(train_servel_model_all[['sn','sm_num']],on="sn",how='inner')
df_test_1 = df_test_0.merge(test_servel_model_all[['sn','sm_num']],on="sn")

print(df_train_1.shape)
print(df_test_1.shape)

(16869, 98)
(12260, 97)


# 设置训练集和测试集

In [164]:
df_train = df_train_1.copy()
df_test = df_test_1.copy()
# df_train = df_train_0.copy()
# df_test = df_test_0.copy()

# 建模调参

In [165]:
df_train['label1'] = df_train['label'].apply(lambda x: x-1 if x in [2, 3] else 0)
weights = compute_class_weight(class_weight='balanced', classes=[0, 1, 2], y=df_train['label1'])
class_weights1 = dict(zip([0, 1, 2], weights))
weights = compute_class_weight(class_weight='balanced', classes=[0, 1], y=df_train['label'][(df_train['label'] == 0) | (df_train['label'] == 1)])
class_weights2 = dict(zip([0, 1], weights))

In [166]:
def macro_f1(target_df, submit_df):
    """
    计算得分
    :param target_df: [sn,fault_time,label]
    :param submit_df: [sn,fault_time,label]
    :return:
    """
    weights =  [3/7,  2/7,  1/7,  1/7]
    macro_F1 =  0.
    for i in  range(len(weights)):
        TP =  sum((target_df == i) & (submit_df == i))
        FP =  sum((target_df != i) & (submit_df == i))
        FN =  sum((target_df == i) & (submit_df != i))
        precision = TP /  (TP + FP)  if  (TP + FP)  >  0  else  0
        recall = TP /  (TP + FN)  if  (TP + FN)  >  0  else  0
        F1 =  2  * precision * recall /  (precision + recall)  if  (precision + recall)  >  0  else  0
        macro_F1 += weights[i]  * F1
        print("类别"+str(i)+"    precision:"+str(precision)+"     recall:"+str(recall)+"         F1:"+str(F1))
    return macro_F1

In [167]:
def creat_catboost(class_weights, cat_features, cls_kind, NUM_CLASSES=None):
    params = { 
            'task_type': 'CPU', 
            'bootstrap_type': 'Bernoulli',
            'learning_rate': 0.05, 
            'eval_metric': cls_kind, 
            'loss_function': cls_kind, 
            'classes_count': NUM_CLASSES, 
            'iterations': 2000, # 生成多少颗对称树
            'random_seed': 2022, 
            'depth': 8, 
            'subsample': 0.8, 
            'leaf_estimation_iterations': 8,
            'reg_lambda': 0.5,
            'class_weights': class_weights,
            'early_stopping_rounds': 100 ,
            'cat_features':cat_features,  # 设置无序离散型特征
            'one_hot_max_size':5   # 设置one-hot最大编码
        }
    model = CatBoostClassifier(**params)
    return model

In [168]:
def creat_xgb(class_weights, cat_features, cls_kind, NUM_CLASSES=None):
    params = { 
            'task_type': 'CPU', 
            'bootstrap_type': 'Bernoulli',
            'learning_rate': 0.05, 
            'classes_count': NUM_CLASSES, 
            'iterations': 2000, # 生成多少颗对称树
            'random_seed': 2022, 
            'depth': 8, 
            'subsample': 0.8, 
            'leaf_estimation_iterations': 8,
            'reg_lambda': 0.5,
            'class_weights': class_weights,
            'early_stopping_rounds': 100 ,
            'cat_features':cat_features,  # 设置无序离散型特征
            'one_hot_max_size':5   # 设置one-hot最大编码
        }
    model = XGBClassifier(**params)
    return model

In [169]:
def creat_lgb(class_weights, cat_features, cls_kind, NUM_CLASSES=None):
    model = LGBMClassifier(
    # **params
        class_weights = class_weights,
        classes_count = NUM_CLASSES, 
        cat_features = cat_features,
        loss_function = cls_kind, 
        random_state = 626, 
        n_estimators = 800,
        learning_rate = 0.1,
        max_depth = -1,
        num_leaves = 127,
        colsample_bytree = 0.8, 
        subsample = 0.8,
        lambda_l1 = 0.1,   # 0.1
        lambda_l2 = 0.2,  # 0.2
        device='cpu'
    )
    return model

In [170]:
FOLDS = 10
TARGET = 'label'
TARGET2 = 'label1'
a = ['sn', 'fault_time', TARGET, TARGET2,
    'Physical_counts',
    'Reserved_counts',
    'Cable/Interconnect_counts',
    'Battery_counts',
    'Slot/Connector_counts',
    'Fan_counts',
    'LAN_counts',
    'Add-in_counts',
    'Session_counts',
    'Chip_counts',
    'Cable_counts',
    'reserved_counts',
    'Version_counts',
    'Terminator_counts',
    'Slot_counts',
    'Drive_counts',
    'Watchdog2_counts',
    'device_counts',
    'Event_counts',
    'Management_counts',
    'OEM_counts',
    'Microcontroller/Coprocessor_counts',
    'Chassis_counts']

# a = ['sn', 'fault_time', TARGET, TARGET2]
use_features = [col for col in df_train.columns if col not in a ]

cat_features = [use_features.index("sm_num"), use_features.index("last_category")]  # 分类特征
model1_useless, model2_useless = {i:0 for i in df_train.columns}, {i:0 for i in df_train.columns}

In [171]:
def softmax(x):
    return np.exp(x) / np.exp(x).sum(axis=1).reshape(-1, 1)

In [173]:
for model_name, model_builder in zip(['catboost', 'xgb', 'lgb'], [creat_catboost, creat_xgb, creat_lgb]):
    y_pred = np.zeros((len(df_test), 4))
    y_pred_res = np.zeros((len(df_test), 4))
    folds = GroupKFold(n_splits=FOLDS)
    oof_pred = np.zeros(len(df_train))
    for fold, (tr_ind, val_ind) in enumerate(folds.split(df_train, df_train[TARGET], df_train['sn'])):
        print(f'Fold {fold + 1}') 
        # model1: three classification
        target = 'label1'
        NUM_CLASSES = df_train[target].nunique()
        x_train, x_val = df_train[use_features].iloc[tr_ind], df_train[use_features].iloc[val_ind] 
        y_train, y_val = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]
        model1 = model_builder(class_weights1, cat_features, 'MultiClass', NUM_CLASSES)  
        if model_name == 'catboost':
            model1.fit(x_train, 
              y_train, 
              eval_set = (x_val, y_val),
              verbose=1000000)  
        else:
            model1.fit(x_train.values, 
              y_train.values, 
              eval_set = [(x_val.values, y_val.values)],
              verbose=1000000)    
        feat_imp = pd.DataFrame({'imp': model1.feature_importances_, 'feature': use_features})
        feat_imp = feat_imp.sort_values(by='imp').reset_index(drop=True)
        for i in feat_imp.index:
            if feat_imp.loc[i, 'imp'] < 0.1:
                model1_useless[feat_imp.loc[i, 'feature']] += 1

        # model2: binary classification
        target = 'label'
        x_train, x_val = df_train[use_features].iloc[tr_ind], df_train[use_features].iloc[val_ind] 
        y_train, y_val = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]
        x_train_2 = x_train[(y_train == 0) | (y_train == 1)]
        x_val_2 = x_val[(y_val == 0) | (y_val == 1)]
        y_train_2 = y_train[(y_train == 0) | (y_train == 1)]
        y_val_2 = y_val[(y_val == 0) | (y_val == 1)]
        NUM_CLASSES = 2
        model2 = model_builder(class_weights2, cat_features, 'Logloss')  
        if model_name == 'catboost':
            model2.fit(x_train_2, 
                      y_train_2, 
                      eval_set=(x_val_2, y_val_2), 
                      verbose=1000000)
        else:
            model2.fit(x_train_2.values, 
                      y_train_2.values, 
                      eval_set=[(x_val_2.values, y_val_2.values)], 
                      verbose=1000000)

        predicted = softmax(model1.predict_proba(x_val)).argmax(axis=1) + 1
        predicted[predicted == 1] = softmax(model2.predict_proba(x_val[predicted == 1])).argmax(axis=1)
        oof_pred[val_ind] = predicted # 输出概率
        score = macro_f1(df_train['label'].iloc[val_ind], predicted)
        # 测试集，
        y_pred1 = softmax(model1.predict_proba(df_test[use_features]))
        y_pred[(y_pred1.argmax(axis=1) != 0), 0:2] = 0
        y_pred[(y_pred1.argmax(axis=1) != 0), 2:] = y_pred1[(y_pred1.argmax(axis=1) != 0)][:, 1:] / y_pred1[(y_pred1.argmax(axis=1) != 0)][:, 1:].sum(axis=1).reshape(-1, 1)
        y_pred[(y_pred1.argmax(axis=1) == 0), 2:] = 0
        y_pred[(y_pred1.argmax(axis=1) == 0), 0:2] = softmax(model2.predict_proba(df_test[use_features][y_pred1.argmax(axis=1) == 0]))
        y_pred_res +=  y_pred / folds.n_splits

        # score = macro_f1(y_val, oof_pred[val_ind].argmax(axis=1), average='macro')
        print(f'F1 score: {score}')       
        feat_imp = pd.DataFrame({'imp': model2.feature_importances_, 'feature': use_features})
        feat_imp = feat_imp.sort_values(by='imp').reset_index(drop=True)
        for i in feat_imp.index:
            if feat_imp.loc[i, 'imp'] < 0.1:
                model2_useless[feat_imp.loc[i, 'feature']] += 1

    #     print("Features importance...")
    #     

        del x_train, x_val, y_train, y_val, x_train_2, x_val_2, y_train_2, y_val_2,
        gc.collect()
    print(model_name, macro_f1(df_train['label'], oof_pred))

Fold 1
0:	learn: 0.9886388	test: 0.9893206	best: 0.9893206 (0)	total: 134ms	remaining: 4m 27s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2426227979
bestIteration = 171

Shrink model to first 172 iterations.
0:	learn: 0.6876646	test: 0.6891104	best: 0.6891104 (0)	total: 122ms	remaining: 4m 4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.65178558
bestIteration = 20

Shrink model to first 21 iterations.
类别0    precision:0.32075471698113206     recall:0.34         F1:0.3300970873786408
类别1    precision:0.6927536231884058     recall:0.6967930029154519         F1:0.694767441860465
类别2    precision:0.9546925566343042     recall:0.913312693498452         F1:0.9335443037974683
类别3    precision:0.83203125     recall:0.9466666666666667         F1:0.8856548856548856
F1 score: 0.5998607621870294
Fold 2
0:	learn: 0.9921688	test: 0.9909137	best: 0.9909137 (0)	total: 182ms	remaining: 6m 3s
Stopped by overfitting detector  (100 iterations wait)

bestTes

TypeError: Unknown type of parameter:class_weights, got:dict

In [None]:
model1 = creat_xgb(class_weights1, cat_features, 'MultiClass', NUM_CLASSES)  

model1.fit(x_train.values, 
          y_train.values, 
          eval_set = [(x_val.values, y_val.values)],
          verbose=1000000)

In [None]:
y_train.values.reshape(-1)

In [None]:
for i, (valid_X, valid_y) in enumerate(eval_set):
    print(valid_X, valid_y)
    break

In [None]:
eval_set[0]

In [None]:
sub = df_test[['sn', 'fault_time']].copy()
sub['label'] = y_pred_res.argmax(axis=1)
# display(sub.head())
sub['label'].value_counts()

In [None]:
import time
sub.to_csv(f'predictions.csv', index=False)

In [None]:
a = []
for i in model1_useless:
    if model1_useless[i] >= 7:
        a.append(i)
b = []
for i in model2_useless:
    if model2_useless[i] >= 7:
        b.append(i)

In [None]:
a.extend(['sn', 'fault_time', TARGET, TARGET2])
use_features1 = [col for col in df_train.columns if col not in a ]
b.extend(['sn', 'fault_time', TARGET, TARGET2])
use_features2 = [col for col in df_train.columns if col not in b ]
cat_features1 = [use_features1.index("sm_num"), use_features1.index("last_category")]  # 分类特征
cat_features2 = [use_features2.index("sm_num")]  # 分类特征


In [None]:
y_pred = np.zeros((len(df_test), 4))
folds = GroupKFold(n_splits=FOLDS)
oof_pred = np.zeros(len(df_train))
for fold, (tr_ind, val_ind) in enumerate(folds.split(df_train, df_train[TARGET], df_train['sn'])):
    print(f'Fold {fold + 1}') 
    # model1: three classification
    target = 'label1'
    NUM_CLASSES = df_train[target].nunique()
    x_train_1, x_val_1 = df_train[use_features1].iloc[tr_ind], df_train[use_features1].iloc[val_ind] 
    y_train_1, y_val_1 = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]
    model1 = creat_model(class_weights1, cat_features1, 'MultiClass', NUM_CLASSES)  
    model1.fit(x_train_1, 
              y_train_1, 
              eval_set=(x_val_1, y_val_1), 
              verbose=100)    

    # model2: binary classification
    target = 'label'
    x_train, x_val = df_train[use_features2].iloc[tr_ind], df_train[use_features2].iloc[val_ind] 
    y_train, y_val = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]
    x_train_2 = x_train[(y_train == 0) | (y_train == 1)]
    x_val_2 = x_val[(y_val == 0) | (y_val == 1)]
    y_train_2 = y_train[(y_train == 0) | (y_train == 1)]
    y_val_2 = y_val[(y_val == 0) | (y_val == 1)]
    NUM_CLASSES = 2
    model2 = creat_model(class_weights2, cat_features2, 'Logloss')  
    model2.fit(x_train_2, 
              y_train_2, 
              eval_set=(x_val_2, y_val_2), 
              verbose=100)

    predicted = softmax(model1.predict_proba(x_val_1)).argmax(axis=1) + 1
    predicted[predicted == 1] = softmax(model2.predict_proba(df_train[use_features2].iloc[val_ind][predicted == 1])).argmax(axis=1)
    oof_pred[val_ind] = predicted # 输出概率
    score = macro_f1(df_train['label'].iloc[val_ind], predicted)
    # 测试集，
    y_pred1 = softmax(model1.predict_proba(df_test[use_features1]))
    y_pred[(y_pred1.argmax(axis=1) != 0), 0:2] = 0
    y_pred[(y_pred1.argmax(axis=1) != 0), 2:] = y_pred1[(y_pred1.argmax(axis=1) != 0)][:, 1:] / y_pred1[(y_pred1.argmax(axis=1) != 0)][:, 1:].sum(axis=1).reshape(-1, 1)
    y_pred[(y_pred1.argmax(axis=1) == 0), 2:] = 0
    y_pred[(y_pred1.argmax(axis=1) == 0), 0:2] = softmax(model2.predict_proba(df_test[use_features2][y_pred1.argmax(axis=1) == 0]))
    y_pred +=  y_pred / folds.n_splits

    # score = macro_f1(y_val, oof_pred[val_ind].argmax(axis=1), average='macro')
    print(f'F1 score: {score}')       
#     feat_imp = pd.DataFrame({'imp': model2.feature_importances_, 'feature': use_features})
#     feat_imp = feat_imp.sort_values(by='imp').reset_index(drop=True)
#     for i in feat_imp.index:
#         if feat_imp.loc[i, 'imp'] < 0.1:
#             model2_useless[feat_imp.loc[i, 'feature']] += 1
    
#     print("Features importance...")
#     

    del x_train, x_val, y_train, y_val, x_train_2, x_val_2, y_train_2, y_val_2,
    gc.collect()

In [None]:
macro_f1(df_train['label'], oof_pred)

In [72]:
sub = df_test[['sn', 'fault_time']].copy()
sub['label'] = y_pred_res.argmax(axis=1)
# display(sub.head())
sub['label'].value_counts()

2    1601
3     625
1     582
0     222
Name: label, dtype: int64

In [73]:
import time
sub.to_csv(f'{time.strftime("%m-%d %H-%M-%S", time.localtime())}.csv', index=False)