# 360初赛规则模型

In [None]:
from config import *
from data import get_test_final_data, get_train_final_data
from jieba.analyse import tfidf,textrank
from joblib import Parallel,delayed
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm as lgb
import re

In [None]:
# Load the final train/test frames, then replace every missing value with an
# empty string (in place) so the text columns can be handled as plain strings.
train_data = get_train_final_data()
test_data = get_test_final_data()
for _frame in (train_data, test_data):
    _frame.fillna("", inplace=True)

In [None]:
def applyParallel(dfgroup, func, n_thread):
    """Run ``func`` over every value of ``dfgroup`` in parallel and stack the results.

    Args:
        dfgroup: Iterable of ``(key, value)`` pairs; the key is ignored.
        func: Callable applied to each value; its results are handed to
            ``pd.concat``.
        n_thread: Forwarded to ``Parallel`` as ``n_jobs``.

    Returns:
        The concatenation along ``axis=0`` of all results.
    """
    with Parallel(n_jobs=n_thread) as pool:
        tasks = (delayed(func)(item) for _, item in dfgroup)
        pieces = pool(tasks)
    return pd.concat(pieces, axis=0)

In [None]:
# Store the final character of each document ("" when the content is empty)
# in a ``content_last_str`` column on both frames.
for _frame in (train_data, test_data):
    _frame["content_last_str"] = _frame.content.map(
        lambda text: "" if len(text) == 0 else text[-1]
    )

In [None]:
# Count, over train and test together, in how many documents each sentence
# appears. A "sentence" is a fragment between the punctuation marks below
# that is at least 8 characters long.
sent_delimiters = '[' + ''.join([u'，', u'、', u'。', ',', '.', '!', '?', u'：', ':']) + ']'
train_test_sent_count = dict()
for contents in (train_data.content, test_data.content):
    for content in contents:
        # set(): a sentence repeated inside one document is counted once.
        for sent in set(re.split(sent_delimiters, content)):
            if len(sent) >= 8:
                train_test_sent_count[sent] = train_test_sent_count.get(sent, 0) + 1

In [None]:
def get_feature(row):
    """Add sentence-repetition and trailing-punctuation features to one sample.

    The content is split on a fixed set of Chinese/ASCII punctuation marks;
    only fragments of at least 8 characters are treated as sentences.

    Args:
        row: pandas Series providing ``content`` and ``content_last_str``.
            The new feature fields are written into it in place.

    Returns:
        A one-row DataFrame (``row.to_frame().T``) with the original fields
        followed by the feature columns.

    Note:
        Reads the module-level ``train_test_sent_count`` mapping; a sentence
        that is missing from it raises ``KeyError``.
    """
    delimiters = '[' + ''.join([u'，', u'、', u'。', ',', '.', '!', '?', u'：', ':']) + ']'

    # How often each long-enough sentence occurs inside this document.
    occurrences = dict()
    for fragment in re.split(delimiters, row.content):
        if len(fragment) >= 8:
            occurrences[fragment] = occurrences.get(fragment, 0) + 1

    if not occurrences:
        # No usable sentence: every count-based feature is zero.
        features = {
            "common_keywords": 0,
            'sen_in_others_count_max': 0,
            'sen_in_others_count_q8': 0,
            'sen_in_others_duplicated_num': 0,
            'in_content_duplicated_num': 0,
            "max_len": 0,
            "sen_num": 0,
            'q8_sen_num': 0,
        }
    else:
        sentences = list(occurrences)
        repeat_counts = list(occurrences.values())
        # Corpus-wide document counts of this document's sentences.
        corpus_counts = [train_test_sent_count[sentence] for sentence in sentences]

        # Sentences seen only here vs. sentences shared with other documents.
        unique_text = "".join(
            sentence for sentence, seen in zip(sentences, corpus_counts) if seen <= 1
        )
        shared_text = "".join(
            sentence for sentence, seen in zip(sentences, corpus_counts) if seen > 1
        )
        unique_keywords = set(tfidf(unique_text))
        shared_keywords = set(tfidf(shared_text))

        features = {
            # Number of keywords present in both keyword sets.
            "common_keywords": len(unique_keywords & shared_keywords),
            'sen_in_others_count_max': np.max(corpus_counts),
            'sen_in_others_count_q8': pd.Series(corpus_counts).quantile(0.8),
            'sen_in_others_duplicated_num': np.sum(np.array(corpus_counts) > 1),
            'in_content_duplicated_num': np.sum(np.array(repeat_counts) > 1),
            # Despite its name this is the highest in-document repeat count.
            "max_len": np.max(repeat_counts),
            "sen_num": len(repeat_counts),
            'q8_sen_num': pd.Series(repeat_counts).quantile(0.8),
        }

    for column, value in features.items():
        row[column] = value

    # One boolean flag per punctuation mark the document may end with.
    last_char = row.content_last_str
    ending_flags = (
        ('if_，', u'，'),
        ('if_。', u'。'),
        ('if_、', u'、'),
        ('if_!', u'!'),
        ('if_?', u'?'),
        ('if_;', u';'),
    )
    for column, mark in ending_flags:
        row[column] = last_char == mark
    return row.to_frame().T

In [None]:
# Build the per-row feature tables for both splits using 10 parallel jobs
# (train is processed first, then test).
train, test = (
    applyParallel(frame.iterrows(), get_feature, 10)
    for frame in (train_data, test_data)
)

In [None]:
# Columns that are excluded from the model inputs.
filterfeats = [
    "id",
    "title",
    "content",
    "label",
    "content_last_str",
]
# Every remaining column of the feature table is a predictor.
predictors = [column for column in train.columns if column not in filterfeats]
training_data = train[predictors]
# Integer-encoded targets, taken from the raw training frame.
training_label = train_data.label.map(label2int).values

In [None]:
# Hold out 10% of the training samples (fixed seed) as a validation split and
# wrap both parts as LightGBM datasets.
X_train, X_test, y_train, y_test = train_test_split(
    training_data.astype(float),
    training_label,
    test_size=0.1,
    random_state=0,
)
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

In [None]:
# LightGBM settings for a binary classifier evaluated by AUC.
params = dict(
    boosting_type='gbrt',
    objective='binary',
    num_leaves=15,
    metric=["auc"],
    learning_rate=0.06,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=5,
    verbose=1,
    num_threads=20,
    min_data_in_leaf=100,
)
# Train for at most 1000 rounds, evaluating on both the training and the
# hold-out set, with early stopping after 13 rounds.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=True,
    early_stopping_rounds=13,
)

In [None]:
# Score the hold-out split at a 0.5 decision threshold.
pred = gbm.predict(X_test)
# f1_score's signature is (y_true, y_pred): ground truth goes first.
print(f1_score(y_test, (pred > 0.5).astype(int)))

### Bad case analysis

In [None]:
# Score the whole training table, threshold at 0.5, and collect the rows whose
# prediction disagrees with the integer-encoded label.
train_scores_input = train[predictors].astype(float)
train["proba"] = gbm.predict(train_scores_input)
train["pred"] = (train["proba"] > 0.5).astype("int")
train["label"] = train_data.label.map(label2int)
error = train.loc[train["pred"] != train["label"]]

In [None]:
# Shapes of the misclassified rows, split by true label (0 first, then 1).
tuple(error[error.label == true_label].shape for true_label in (0, 1))

In [None]:
# Print up to two randomly chosen misclassified samples with their key features.
# The sample size is capped because DataFrame.sample raises ValueError when
# asked for more rows than the frame holds (fewer than two errors).
n = error.sample(min(2, len(error)))
for _, bad_row in n.iterrows():
    print(bad_row.title)
    print(bad_row.content)
    print(
        int2label[bad_row.label],
        bad_row.proba,
        bad_row.sen_num,
        bad_row.sen_in_others_count_max,
        bad_row.in_content_duplicated_num,
        bad_row.sen_in_others_duplicated_num,
        bad_row.max_len,
        bad_row.common_keywords,
    )
    print("=" * 10)

In [None]:
# Persist the trained booster to ``gbm.txt`` in the current working directory.
gbm.save_model("./gbm.txt")

In [None]:
# List every feature with its importance, least important first.
feature_scores = zip(gbm.feature_name(), gbm.feature_importance())
ranked_scores = sorted(feature_scores, key=lambda pair: pair[1])
names, importances = zip(*ranked_scores)
for name, importance in zip(names, importances):
    print(name, importance)

In [None]:
# Predict the test set with the best iteration, turn probabilities above 0.5
# into label names, and write ``id,label`` rows without index or header.
test['label'] = gbm.predict(
    test[predictors].astype(float),
    num_iteration=gbm.best_iteration,
)
test['label'] = (test['label'] > 0.5).astype(int).map(int2label)
test_submit = test.loc[:, ['id', 'label']]
test_submit.to_csv(
    Config.data_dir + '/submission.csv',
    index=None,
    header=None,
    encoding='utf8',
)