In [44]:
import sys
import os
import pandas as pd
import numpy as np
import itertools
# NOTE(review): newer scikit-learn releases no longer ship
# `sklearn.externals.joblib`; switch to `import joblib` when upgrading.
from sklearn.externals import joblib

# Extend the import path *before* importing from the project package `src`.
# Previously this ran after the `src` import, so on a freshly started kernel
# the project import was attempted before the path had been extended.
# assumes the notebook runs one directory below the project root — TODO
# confirm, since the data paths used later are relative to the working directory.
sys.path.insert(1, '..')

from src.models.evaluation import multi_roc_auc

In [3]:
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

In [4]:
from xgboost import XGBClassifier



In [10]:
data_dir = 'data/external/'

# Read the three input tables from `data_dir` (path is relative to the
# current working directory).
train_df, test_df, sample_submission = [
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
]

In [11]:
# Every training column other than the row id and the raw text is treated
# as a label column.
target_cols = [col for col in train_df.columns if col not in ('id', 'comment_text')]
targets = train_df[target_cols]

# Test-row ids, kept for building the submission later.
test_ids = test_df['id'].values

In [12]:
# Load the pre-built feature matrix for the whole corpus from disk.
# presumably TF-IDF features with the training rows first and the test rows
# after them (it is split at len(train_df) right after) — TODO confirm
# against whatever wrote this file.
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced.
tfidf_corpus = joblib.load('data/processed/tfidf_corpus.pickle')

In [13]:
# The first `n_train` rows of the corpus matrix go to training, the rest to test.
n_train = train_df.shape[0]
train_text, test_text = tfidf_corpus[:n_train], tfidf_corpus[n_train:]

In [16]:
# For each label, fit a cross-validated L1-penalised logistic regression
# (scored by ROC AUC) and record which features SelectFromModel keeps.
selected_features = []

for label in target_cols:
    l1_logit = LogisticRegressionCV(
        penalty='l1',
        scoring='roc_auc',
        solver='liblinear',
        n_jobs=-1,
    )
    l1_logit.fit(train_text, targets[label][:])

    # prefit=True: reuse the already fitted model instead of refitting it.
    support_mask = SelectFromModel(l1_logit, prefit=True).get_support()
    selected_features.append(support_mask)

    print('Label %s done' % label)

Label toxic done


Label severe_toxic done


Label obscene done


Label threat done


Label insult done


Label identity_hate done


In [30]:
# Persist the per-label feature masks to the working directory (e.g. so the
# selection step does not have to be repeated).
joblib.dump(selected_features, 'selected_features.pickle')

['selected_features.pickle']

In [31]:
# Combine the per-label masks into one array, one entry per label in the
# order they were appended (the order of target_cols).
sf = np.array(selected_features)

In [34]:
# Union of the per-label masks: keep a feature if any label selected it.
final_feat = np.any(sf, axis=0)

In [38]:
# Restrict the training matrix to the columns flagged in the union mask.
X_train = train_text[:, final_feat]

In [41]:
# Base learner: gradient-boosted trees with 1000 estimators, wrapped in a
# classifier chain that fits one copy of the base learner per label.
base_xgb = XGBClassifier(n_estimators=1000)
xgb = ClassifierChain(base_xgb)

In [42]:
# Fit the chain on the selected training features against every label column.
xgb.fit(X_train, targets)

ClassifierChain(base_estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
        cv=None, order=None, random_state=None)

In [43]:
# Save the fitted chain to the working directory.
joblib.dump(xgb, 'xgbmodel.pickle')

['xgbmodel.pickle']

In [45]:
# Apply the same feature mask to the test rows, then score them.
X_test = test_text[:, final_feat]
preds = xgb.predict_proba(X_test)

In [48]:
# Wrap the prediction array in a frame with one column per label.
target_probs = pd.DataFrame(preds, columns=target_cols)

In [50]:
# Add the test-row ids as an extra column (modifies target_probs in place).
target_probs['id'] = test_ids

In [51]:
# Select the columns in the same order as the sample submission file.
subm = target_probs[sample_submission.columns]

In [52]:
# Write the XGBoost-only predictions to the working directory, without the index.
subm.to_csv('xgb_preds.csv', index=False)

In [54]:
# Keep every predicted probability strictly inside (0, 1).
# The bounds used to be written as `0+1e12` / `1-1e12` (roughly +/-1e12),
# which cannot confine values to the unit interval; a tiny epsilon is intended.
PROB_EPS = 1e-12
for c in target_cols:
    target_probs[c] = target_probs[c].clip(PROB_EPS, 1 - PROB_EPS)

In [55]:
# Load the predictions to blend with from data_dir.
# presumably the output of an earlier ensemble, with an 'id' column and one
# column per label — TODO confirm.
ensemble_preds = pd.read_csv(data_dir + 'submission_ensemble.csv')

In [56]:
# Suffix the XGBoost probability columns with '_' so they can sit next to the
# identically named ensemble columns after the merge. A renamed copy is used
# instead of assigning to `target_probs.columns`, so re-running this cell does
# not keep appending underscores and `target_probs` keeps its label names.
xgb_probs = target_probs.rename(columns=lambda x: x if x in ['id'] else x + '_')

# Left join on 'id': one row per XGBoost prediction; ids absent from
# ensemble_preds end up with NaN in the ensemble columns.
blend = pd.merge(xgb_probs, ensemble_preds, on='id', how='left')

In [57]:
# Weighted blend per label: 0.8 x the unsuffixed column (from ensemble_preds)
# plus 0.2 x the '_'-suffixed column (the XGBoost prediction).
# The clip bounds used to be written as `0+1e12` / `1-1e12` (roughly +/-1e12),
# which cannot confine probabilities to (0, 1); a tiny epsilon is intended.
PROB_EPS = 1e-12
for c in target_cols:
    mixed = 0.8 * blend[c] + 0.2 * blend[c + '_']
    blend[c] = mixed.clip(PROB_EPS, 1 - PROB_EPS)

# Drop the helper columns and order the rest like the sample submission.
blend = blend[sample_submission.columns]

In [58]:
# Write the blended submission to the working directory, without the index.
blend.to_csv('ensemble_xgb_submission.csv', index=False)