# Toxic ngram tfidf and multi-models stacking

## Import packages

In [1]:
import warnings
warnings.filterwarnings("ignore")

from datetime import datetime as dt
import numpy as np
import pandas as pd

# feature extraction and model selection tools.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_union
from sklearn import decomposition, preprocessing
from sklearn.model_selection import cross_val_score, train_test_split

# models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgb

import os
import h5py
import pickle
import scipy.sparse as scs

In [2]:
## basic paramaters definition
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

## Load data

In [3]:
df_trn = pd.read_csv('../input/train.csv').fillna(' ')
df_tst = pd.read_csv('../input/test.csv').fillna(' ')

print('df_trn', df_trn.shape)
print('df_tst', df_tst.shape)

df_trn (159571, 8)
df_tst (153164, 2)


## Develop at local should sampling

In [None]:
is_dev = False
if is_dev:
    samples = []
    for label in class_names:
        df_sample = df_trn.groupby(
            label,
            group_keys=False).apply(lambda x: x.sample(min(len(x), 10000)))
        samples.append(df_sample)

    df_trn = pd.concat(samples, axis=0)
    df_trn = df_trn.drop_duplicates()
    df_tst = df_tst.sample(10000)
    print("Sample size is ", df_trn.shape, df_tst.shape)

In [None]:
print('dump dataframe start', dt.now())
pickle.dump(df_trn, open('df_trn.pkl', 'wb'))
pickle.dump(df_tst, open('df_tst.pkl', 'wb'))
print('dump dataframe end', dt.now())

dump dataframe start 2018-12-12 14:36:32.467692
dump dataframe end 2018-12-12 14:36:33.279240


## TF-IDF feature extracting

In [None]:
# extract word level ngram and char level ngram feature
trn_text = df_trn['comment_text']
tst_text = df_tst['comment_text']
all_text = pd.concat([trn_text, tst_text])

# 提取单词级别的tfidf文本特征，ngram设为(1,2) 表示1到2个单词的分割都会提取出来
word_vec = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=10000)

# 提取字符级别的tfidf特征，ngram设为(2,6) 这个地方，可以调参，不过50K的特征，其实覆盖度也够了
char_vec = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    max_features=50000)

# 把两个向量化特征提取工具做成一个pipeline
all_vec = make_union(word_vec, char_vec, n_jobs=10)

# 训练的时候需要用全量的数据来让tfidf覆盖到你所有的单词，词组，字符，字符串，能与计算出这部分的tfidf
print('TF-IDF start', dt.now())
all_vec.fit(all_text)
pickle.dump(all_vec, open('tfidf.vec', 'wb'))

print('TF-IDF 1/3', dt.now())
X_trn = all_vec.transform(trn_text)
print('TF-IDF 2/3', dt.now())
X_tst = all_vec.transform(tst_text)
print('TF-IDF 3/3', dt.now())
print('TF-IDF end', dt.now(), X_trn.shape, X_tst.shape)

print('save_npz start', dt.now())
scs.save_npz('X_trn_tfidf.npz', X_trn)
scs.save_npz('X_tst_tfidf.npz', X_tst)
print('save_npz end', dt.now())

TF-IDF start 2018-12-12 14:36:33.304008
TF-IDF 1/3 2018-12-12 14:41:15.827396
TF-IDF 2/3 2018-12-12 14:44:16.510873
TF-IDF 3/3 2018-12-12 14:46:54.036578
TF-IDF end 2018-12-12 14:46:54.037655 (159571, 60000) (153164, 60000)
save_npz start 2018-12-12 14:46:54.037911
save_npz end 2018-12-12 14:52:53.095288


## Reducing features dimensions with PCA and StandardScaler

In [None]:
# 使用SVD进行降维，components设为120，SVD的components的合适调整区间一般为120~200
# 因为tfidf提取的特征唯独太大，线性模型还可以快速训练处一个model，但是对于treebase的模型就差很多了，特征维度多了，简直就是灾难
# 面对特征灾难的解法有：1.降维、2. 使用model-base的特征选择

# 使用TruncatedSVD来降维，是因为它计算非常快，不像NMF这种计算量就非常大，耗时比较长
svd = decomposition.TruncatedSVD(n_components=120)
print('TruncatedSVD start', dt.now())
svd.fit(X_trn)
print('TruncatedSVD 1/3', dt.now())
X_trn_svd = svd.transform(X_trn)
print('TruncatedSVD 2/3', dt.now())
X_tst_svd = svd.transform(X_tst)
print('TruncatedSVD 3/3', dt.now())
print('TruncatedSVD end', dt.now(), X_trn_svd.shape, X_tst_svd.shape)

np.save('X_trn_svd.npy', X_trn_svd)
np.save('X_tst_svd.npy', X_tst_svd)

# 对从SVD获得的数据进行标准化处理，这样在线性模型或者神经网络这类模型中对特征范围比较敏感的非常有用
scl = preprocessing.StandardScaler()
print('StandardScaler start', dt.now())
scl.fit(X_trn_svd)
print('StandardScaler 1/3', dt.now())
X_trn_svd_scl = scl.transform(X_trn_svd)
print('StandardScaler 2/3', dt.now())
X_tst_svd_scl = scl.transform(X_tst_svd)
print('StandardScaler 3/3', dt.now())
print('StandardScaler end', dt.now(), X_trn_svd_scl.shape, X_tst_svd_scl.shape)

np.save('X_trn_svd_scl.npy', X_trn_svd_scl)
np.save('X_tst_svd_scl.npy', X_tst_svd_scl)

TruncatedSVD start 2018-12-12 14:52:53.111774
TruncatedSVD 1/3 2018-12-12 14:56:29.596487
TruncatedSVD 2/3 2018-12-12 14:56:45.288917
TruncatedSVD 3/3 2018-12-12 14:56:59.203840
TruncatedSVD end 2018-12-12 14:56:59.204074 (159571, 120) (153164, 120)
StandardScaler start 2018-12-12 14:56:59.407111
StandardScaler 1/3 2018-12-12 14:56:59.999576
StandardScaler 2/3 2018-12-12 14:57:00.210949
StandardScaler 3/3 2018-12-12 14:57:00.414812
StandardScaler end 2018-12-12 14:57:00.415168 (159571, 120) (153164, 120)


## Checkpoint 1 load data

In [None]:
print('Load cache start', dt.now())

df_trn = pickle.load(open('df_trn.pkl', 'rb'))
df_tst = pickle.load(open('df_tst.pkl', 'rb'))

X_trn = scs.load_npz('X_trn_tfidf.npz')
X_tst = scs.load_npz('X_tst_tfidf.npz')

X_trn_svd = np.load('X_trn_svd.npy')
X_tst_svd = np.load('X_tst_svd.npy')

X_trn_svd_scl = np.load('X_trn_svd_scl.npy')
X_tst_svd_scl = np.load('X_tst_svd_scl.npy')

print('Load cache end', dt.now())

Load cache start 2018-12-12 14:57:00.626193
Load cache end 2018-12-12 14:57:40.354163


## Define a general function to try different models

In [None]:
def skl_train_and_preds(clf, name, X_trn, X_tst):
    print("Fit {} {}".format(name, type(clf)))
    print('X_trn', X_trn.shape)
    print('X_tst', X_tst.shape)
    scores = []  # 记录每一个类别的交叉验证训练的auc得分

    trn_preds = pd.DataFrame.from_dict({'id': df_trn['id']})
    tst_preds = pd.DataFrame.from_dict({'id': df_tst['id']})

    # 这里迭代每一个类别来，将一个多label问题，转换为多个二分类问题
    for class_name in class_names:
        y_trn = df_trn[class_name]
        # use cross-validataion to train model
        cv_score = np.mean(
            cross_val_score(clf, X_trn, y_trn, cv=3, scoring='roc_auc'))
        scores.append(cv_score)
        print('CV score for class {} is {}'.format(class_name, cv_score))
        # fit new model with full of train data
        clf.fit(X_trn, y_trn)
        # used for stacking
        trn_preds[class_name] = clf.predict_proba(X_trn)[:, 1]
        # prediction the test for single submission or bledding
        tst_preds[class_name] = clf.predict_proba(X_tst)[:, 1]

    print('Total CV score is {}'.format(np.mean(scores)))
    # save the model train and test predicted result in to local file.
    trn_preds.to_csv('0-trn-ngram-{}.csv'.format(name), index=False)
    tst_preds.to_csv('0-tst-ngram-{}.csv'.format(name), index=False)
    print('=' * 80)

## LogisticRegression

In [None]:
# 先参数最简单的逻辑回归模型，如果LR可以解决这个问题，说明我们的建模是没有问题的
# 参数solver，使用sag计算非常快，其他solver你还可以选择比较牛逼的libnear，但是计算量稍大
clf = LogisticRegression(solver='sag')
skl_train_and_preds(clf, 'lr', X_trn, X_tst)

Fit lr <class 'sklearn.linear_model.logistic.LogisticRegression'>
X_trn (159571, 60000)
X_tst (153164, 60000)
CV score for class toxic is 0.9785644059424624
CV score for class severe_toxic is 0.9886992060449481
CV score for class obscene is 0.9902203632571126
CV score for class threat is 0.9891529445940939
CV score for class insult is 0.9825933972596332
CV score for class identity_hate is 0.9828002905348693
Total CV score is 0.98533843460552


## Feature selection before tree-base modeling

In [None]:
from sklearn.feature_selection import SelectFromModel

# 定义一个通用的特征选择函数，用来为treebase的模型做特征降维
def select_features(class_name, X_trn, X_tst, y_trn):
    if os.path.exists('X_trn_sfm_{}.npz'.format(class_name)):
        X_trn = scs.load_npz('X_trn_sfm_{}.npz'.format(class_name))
        X_tst = scs.load_npz('X_tst_sfm_{}.npz'.format(class_name))
    else:
        # 使用LR做base model，这里就用默认参数吧，其实也能用pipeline的方式来做参数优化，比较耗时，没必要
        lr = LogisticRegression(solver='sag')
        # 这里的参数主要看threshold，这里是0.2是说我的特征权重大于0.2的才会留下来
        sfm = SelectFromModel(lr, threshold=0.2)
        print(X_trn.shape)
        print('Feature selection 1/3')
        sfm.fit(X_trn, y_trn)
        print('Feature selection 2/3')
        X_trn = sfm.transform(X_trn)
        print('Feature selection 3/3')
        X_tst = sfm.transform(X_tst)
        print('feature selection end', type(X_trn), X_trn.shape, X_tst.shape)
        scs.save_npz('X_trn_sfm_{}.npz'.format(class_name), X_trn)
        scs.save_npz('X_tst_sfm_{}.npz'.format(class_name), X_tst)

    return X_trn, X_tst

## XGBClassifier

In [None]:
def xgb_train_and_preds(params,
                        name,
                        X_trn,
                        X_tst,
                        use_feature_selection=False):
    print("Fit {} XGBClassifier".format(name))
    print('X_trn', X_trn.shape)
    print('X_tst', X_tst.shape)

    trn_preds = pd.DataFrame.from_dict({'id': df_trn['id']})
    tst_preds = pd.DataFrame.from_dict({'id': df_tst['id']})

    for class_name in class_names:
        print('Train for class {}'.format(class_name))
        y_trn = df_trn[class_name]
        if use_feature_selection:
            X_trn_sfm, X_tst_sfm = select_features(class_name, X_trn, X_tst,
                                                   y_trn)
        else:
            X_trn_sfm, X_tst_sfm = X_trn, X_tst
        trn_X, val_X, trn_y, val_y = train_test_split(
            X_trn_sfm, y_trn, test_size=0.33, random_state=42, stratify=y_trn)

        trn_ds, val_ds = xgb.DMatrix(trn_X, trn_y), xgb.DMatrix(val_X, val_y)
        eval_list = [(val_ds, 'val')]

        evals_result = {}
        gbm = xgb.train(
            params,
            trn_ds,
            3000,
            eval_list,
            early_stopping_rounds=30,
            evals_result=evals_result,
            verbose_eval=20,
        )

        best_iteration_ = np.argmax(evals_result['val']['auc']) + 1
        print('best_iteration_', best_iteration_)
        # fit new model with full of train data
        gbm = xgb.train(params, xgb.DMatrix(X_trn_sfm, y_trn), best_iteration_)

        # used for stacking
        trn_preds[class_name] = gbm.predict(
            xgb.DMatrix(X_trn_sfm), ntree_limit=gbm.best_ntree_limit)
        # prediction the test for single submission or bledding
        tst_preds[class_name] = gbm.predict(
            xgb.DMatrix(X_tst_sfm), ntree_limit=gbm.best_ntree_limit)
        print()

    # save the model train and test predicted result in to local file.
    trn_preds.to_csv('0-trn-ngram-{}.csv'.format(name), index=False)
    tst_preds.to_csv('0-tst-ngram-{}.csv'.format(name), index=False)
    print('=' * 80)

In [None]:
params = {
    'max_depth': 13,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
    'subsample': 0.88,
    'colsample_bytree': 0.88,
    'colsample_bylevel': 0.7,
    'nthread': 10,
    'learning_rate': 1e-1,
    'eval_metric': 'auc'
}
xgb_train_and_preds(params, 'xgb', X_trn, X_tst, True)

Fit xgb XGBClassifier
X_trn (159571, 60000)
X_tst (153164, 60000)
Train for class toxic
(159571, 60000)
Feature selection 1/3
Feature selection 2/3
Feature selection 3/3
feature selection end <class 'scipy.sparse.csr.csr_matrix'> (159571, 16169) (153164, 16169)
[0]	val-auc:0.821204
Will train until val-auc hasn't improved in 30 rounds.
[20]	val-auc:0.942259
[40]	val-auc:0.95574
[60]	val-auc:0.960463
[80]	val-auc:0.963728
[100]	val-auc:0.96617
[120]	val-auc:0.968016
[140]	val-auc:0.969164
[160]	val-auc:0.969944
[180]	val-auc:0.97063
[200]	val-auc:0.971172
[220]	val-auc:0.971662
[240]	val-auc:0.971983
[260]	val-auc:0.972326
[280]	val-auc:0.972655
[300]	val-auc:0.9728
[320]	val-auc:0.973063
[340]	val-auc:0.973382
[360]	val-auc:0.973622
[380]	val-auc:0.973752
[400]	val-auc:0.973951
[420]	val-auc:0.974026
[440]	val-auc:0.974237
[460]	val-auc:0.974275
[480]	val-auc:0.974348
[500]	val-auc:0.974519
[520]	val-auc:0.974707
[540]	val-auc:0.974818
[560]	val-auc:0.974967
[580]	val-auc:0.975086
[600

## LightGBM

In [20]:
def lgb_train_and_preds(params,
                        name,
                        X_trn,
                        X_tst,
                        use_feature_selection=False):
    print("Fit {} LightGBM".format(name))
    print('X_trn', X_trn.shape)
    print('X_tst', X_tst.shape)

    trn_preds = pd.DataFrame.from_dict({'id': df_trn['id']})
    tst_preds = pd.DataFrame.from_dict({'id': df_tst['id']})
    
    for class_name in class_names:
        print('Train for class {}'.format(class_name))
        y_trn = df_trn[class_name]

        if use_feature_selection:
            X_trn_sfm, X_tst_sfm = select_features(class_name, X_trn, X_tst,
                                                   y_trn)
        else:
            X_trn_sfm, X_tst_sfm = X_trn, X_tst

        trn_X, val_X, trn_y, val_y = train_test_split(
            X_trn_sfm, y_trn, test_size=0.33, random_state=42, stratify=y_trn)
        trn_ds, val_ds = lgb.Dataset(trn_X, trn_y), lgb.Dataset(val_X, val_y)
        evals_result = dict()
        print('train with val...')
        gbm = lgb.train(
            params=params,
            train_set=trn_ds,
            valid_sets=val_ds,
            valid_names=['val'],
            early_stopping_rounds=30,
            num_boost_round=3000,
            verbose_eval=20,
            evals_result=evals_result)

        best_iteration_ = np.argmax(evals_result['val']['auc']) + 1
        print('best_iteration_', best_iteration_)
        # fit new model with full of train data
        print('train for prediction...', dt.now())
        gbm = lgb.train(
            params=params,
            train_set=lgb.Dataset(X_trn_sfm, y_trn),
            num_boost_round=best_iteration_)
        
        # used for stacking
        print('predict for train')
        trn_preds[class_name] = gbm.predict(X_trn_sfm)
        # prediction the test for single submission or bledding
        print('predict for test')
        tst_preds[class_name] = gbm.predict(X_tst_sfm)
        print()

    # save the model train and test predicted result in to local file.
    trn_preds.to_csv('0-trn-ngram-{}.csv'.format(name), index=False)
    tst_preds.to_csv('0-tst-ngram-{}.csv'.format(name), index=False)
    print('=' * 80)

In [21]:
params = {
    'learning_rate': 1e-1,
    'metric': 'auc',
    'max_depth': 13,
    'objective': 'binary',
    'subsample': 0.88,
    'colsample_bytree': 0.88,
    'random_state': 128
}
lgb_train_and_preds(params, 'lgb', X_trn, X_tst, True)
lgb_train_and_preds(params, 'lgb-svd', X_trn_svd, X_tst_svd)

Fit lgb LightGBM
X_trn (159571, 60000)
X_tst (153164, 60000)
Train for class toxic
train with val...
Training until validation scores don't improve for 30 rounds.
[20]	val's auc: 0.946243
[40]	val's auc: 0.958637
[60]	val's auc: 0.964901
[80]	val's auc: 0.968683
[100]	val's auc: 0.970932
[120]	val's auc: 0.972323
[140]	val's auc: 0.973072
[160]	val's auc: 0.97398
[180]	val's auc: 0.974428
[200]	val's auc: 0.97472
[220]	val's auc: 0.975139
[240]	val's auc: 0.975425
[260]	val's auc: 0.975628
[280]	val's auc: 0.975816
[300]	val's auc: 0.975967
[320]	val's auc: 0.976061
[340]	val's auc: 0.976164
[360]	val's auc: 0.976378
[380]	val's auc: 0.976353
[400]	val's auc: 0.976491
[420]	val's auc: 0.97656
[440]	val's auc: 0.976663
[460]	val's auc: 0.97676
[480]	val's auc: 0.976931
[500]	val's auc: 0.976939
[520]	val's auc: 0.977005
[540]	val's auc: 0.977117
[560]	val's auc: 0.977038
Early stopping, best iteration is:
[540]	val's auc: 0.977117
best_iteration_ 540
train for prediction... 2018-12-13 0

## Stacking

In [22]:
csv_files = os.listdir('.')

In [23]:
trn_pred_dfs = []

for trn_file in list(filter(lambda x: x.find('trn') >= 0 and x.find('csv')>=0, csv_files)):
    print(trn_file)
    column_names = [class_name + ' ' + trn_file for class_name in class_names]
    df_pred = pd.read_csv(trn_file, index_col='id')
    df_pred.columns = column_names
    trn_pred_dfs.append(df_pred)

trn_pred_df = pd.concat(trn_pred_dfs, axis=1)

tst_pred_dfs = []
for tst_file in list(filter(lambda x: x.find('tst') >= 0 and x.find('csv')>=0, csv_files)):
    print(tst_file)
    column_names = [class_name + ' ' + tst_file for class_name in class_names]
    df_pred = pd.read_csv(tst_file, index_col='id')
    df_pred.columns = column_names
    tst_pred_dfs.append(df_pred)

tst_pred_df = pd.concat(tst_pred_dfs, axis=1)

0-trn-ngram-lr.csv
0-trn-ngram-lgb-svd.csv
0-trn-ngram-stacking-sgd.csv
0-trn-ngram-lgb.csv
0-trn-ngram-xgb.csv
0-tst-ngram-xgb.csv
0-tst-ngram-lgb.csv
0-tst-ngram-stacking-sgd.csv
0-tst-ngram-lr.csv
0-tst-ngram-lgb-svd.csv


In [24]:
print(trn_pred_df.shape, tst_pred_df.shape)

(159571, 30) (153164, 30)


In [25]:
clf = SGDClassifier(
    loss='log',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=1000,
    tol=2e-5)
skl_train_and_preds(clf, 'stacking-sgd', trn_pred_df.values, tst_pred_df.values)

Fit stacking-sgd <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>
X_trn (159571, 30)
X_tst (153164, 30)
CV score for class toxic is 0.9999913748814668
CV score for class severe_toxic is 0.9999093414313679
CV score for class obscene is 0.9999741251589508
CV score for class threat is 0.9969840188951228
CV score for class insult is 0.9999049058470703
CV score for class identity_hate is 0.9997387363852877
Total CV score is 0.9994170837665443
