In [1]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.externals import joblib

sys.path.insert(1, '..')

In [2]:
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain


In [27]:
from xgboost import XGBClassifier



In [3]:
data_dir = 'data/external/'

In [4]:
train_df = pd.read_csv(data_dir + 'train.csv')
test_df = pd.read_csv(data_dir + 'test.csv')
sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')

In [5]:
test_ids = test_df['id'].values
target_cols = [y for y in train_df.columns if y not in ('id', 'comment_text')]
targets = train_df[target_cols]

text_data = pd.concat([train_df['comment_text'], test_df['comment_text']], axis=0)
text_data.fillna('unknown', inplace=True)
# train_text = text_data[:train_df.shape[0]]



In [20]:
vectorizer = CountVectorizer(stop_words='english', max_features=10000)
corpus = vectorizer.fit_transform(text_data)

In [21]:
lda = LatentDirichletAllocation(n_components=100, learning_method='online', verbose=1)


In [22]:
lda.fit(corpus[:train_df.shape[0]])

iteration: 1 of max_iter: 10


iteration: 2 of max_iter: 10


iteration: 3 of max_iter: 10


iteration: 4 of max_iter: 10


iteration: 5 of max_iter: 10


iteration: 6 of max_iter: 10


iteration: 7 of max_iter: 10


iteration: 8 of max_iter: 10


iteration: 9 of max_iter: 10


iteration: 10 of max_iter: 10


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=100, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=1)

In [23]:
joblib.dump(lda, 'lda_vocab10k.pickle')

['lda_vocab10k.pickle']

In [24]:
train_topics = lda.transform(corpus[:train_df.shape[0]])

In [26]:
test_topics = lda.transform(corpus[train_df.shape[0]:])

In [28]:
xgb_chain = ClassifierChain(XGBClassifier(n_estimators=100))

In [29]:
xgb_chain.fit(train_topics, targets)

ClassifierChain(base_estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
        cv=None, order=None, random_state=None)

In [38]:
preds = xgb_chain.predict_proba(test_topics)

In [39]:
preds = pd.DataFrame(preds)

In [41]:
preds.columns = target_cols

In [42]:
preds['id'] = test_ids

In [44]:
for c in target_cols:
    preds[c] = preds[c].clip(0+1e12, 1-1e12)

In [45]:
preds.to_csv('lda_xgb_chain.csv', index=False)

In [46]:
ensemble_preds = pd.read_csv(data_dir + 'submission_ensemble.csv')

In [47]:
preds.columns = [x+'_' if x not in ['id'] else x for x in preds.columns]

In [51]:
blend = pd.merge(preds, ensemble_preds, on='id', how='left')

In [52]:
for c in target_cols:
    blend[c] = 0.8 * blend[c] + 0.2 * blend[c+'_']
    blend[c] = blend[c].clip(0+1e12, 1-1e12)
    
blend = blend[sample_submission.columns]

In [53]:
blend.to_csv('ensemble_ldaxgb_submission.csv', index=False)