In [1]:
import os
import sys

import numpy as np
import pandas as pd

try:
    import joblib
except ImportError:
    # Fallback for old scikit-learn releases that still vendor joblib
    # (sklearn.externals.joblib was deprecated in 0.21 and removed in 0.23).
    from sklearn.externals import joblib

# Add the parent directory to the module search path, directly after the first
# entry. Presumably this makes project-level packages importable from the
# notebook's folder -- TODO confirm, no such import is visible in this notebook.
sys.path.insert(1, os.pardir)

In [2]:
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain


In [3]:
# Directory holding the input CSV files, relative to the current working
# directory. The trailing slash matters: file paths are built below by plain
# string concatenation (data_dir + 'train.csv').
data_dir = 'data/external/'

In [4]:
# Read the three input tables from data_dir: the training set, the test set,
# and the sample submission file.
train_df, test_df, sample_submission = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [5]:
# Ids of the test rows as a plain array.
test_ids = test_df['id'].values

# Every training column other than the id and the raw comment text is used as
# a target column.
target_cols = [
    column
    for column in train_df.columns
    if column not in ('id', 'comment_text')
]
targets = train_df[target_cols]

# Stack train comments followed by test comments into one Series (row labels
# are kept as-is, no index reset) and replace missing comments with 'unknown'.
text_data = pd.concat(
    [train_df['comment_text'], test_df['comment_text']],
    axis=0,
).fillna('unknown')



In [6]:
# Bag-of-words counts over the combined train+test text, with scikit-learn's
# built-in English stop-word list removed and the vocabulary capped at
# 800,000 terms.
vectorizer = CountVectorizer(max_features=800000, stop_words='english')

# Learn the vocabulary and build the document-term count matrix in one step.
corpus = vectorizer.fit_transform(text_data)

In [7]:
# Fit a 100-component LDA topic model on the count matrix using all available
# CPU cores (n_jobs=-1). `topics` has one row per document in `corpus` and one
# column per topic.
#
# random_state is pinned because LDA is randomly initialised: without a seed,
# every Restart & Run All yields different topic features, so anything trained
# on them (and the pickle written from them) is not reproducible.
lda = LatentDirichletAllocation(n_components=100, n_jobs=-1, random_state=42)
topics = lda.fit_transform(corpus)



In [None]:
# Write the document-topic matrix to 'lda_topics.pickle' in the current working
# directory (presumably so the LDA fit does not have to be repeated -- nothing
# in this notebook reads the file back).
joblib.dump(value=topics, filename='lda_topics.pickle')

In [None]:
# `topics` was computed on the train comments followed by the test comments,
# so a positional split at the number of training rows recovers each part.
n_train = train_df.shape[0]
train_topics = topics[:n_train]
test_topics = topics[n_train:]

# The original cell called the test-row slice `text_topics`, which looks like a
# typo for `test_topics`. The old name is kept as an alias so that any code
# still referring to it continues to work.
text_topics = test_topics

# Sanity check: the remainder after the training rows must match the test set.
assert test_topics.shape[0] == test_df.shape[0], "train/test topic split is misaligned"

In [None]:
# Chain of gradient-boosted classifiers, one per target column, each with 500
# boosting stages. In a ClassifierChain every model after the first also
# receives the earlier targets in the chain as extra input features.
#
# random_state is pinned on the base estimator so that repeated runs on the
# same topic features build the same models.
gbm_chain = ClassifierChain(
    GradientBoostingClassifier(n_estimators=500, random_state=42)
)
gbm_chain.fit(train_topics, targets)