In [1]:
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout
from  keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
import copy

import sklearn.model_selection
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Label columns predicted by every model below, in output-column order.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Cleaned train / test frames (file names suggest stop words were already removed).
train = pd.read_csv("./train_cleaned_no-stopwords.csv")
test = pd.read_csv("./test_cleaned_no-stopwords.csv")

# Label matrix: one column per entry of list_classes.
y = train[list_classes].values

# Comment strings; missing comments are replaced by the "CVxTz" placeholder token.
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

In [3]:
# Sanity check: (rows, columns) of the training frame.
train.shape

(159571, 8)

In [4]:
def count_words(corpus):
    """Return the document frequency of every word in ``corpus``.

    Args:
        corpus: iterable of strings; each string is split on whitespace.

    Returns:
        dict mapping each word to the number of strings it occurs in
        (a word repeated inside one string is counted once for that string).
    """
    doc_freq = {}
    for sentence in corpus:
        # set() collapses repeats so each sentence contributes at most 1 per word.
        for token in set(sentence.split()):
            doc_freq[token] = doc_freq.get(token, 0) + 1
    return doc_freq

def remove_low_freq_words(text, val_w):
    """Drop every word of ``text`` that is not in the allowed vocabulary.

    The surviving words keep their original order and multiplicity.  The
    previous implementation went through ``set()``, which discarded repeats
    and returned the words in an arbitrary order; that scrambles the
    word-bigram TF-IDF features and the token sequences built from this
    output further down the notebook.

    Args:
        text: whitespace-separated string.
        val_w: container of allowed words (a ``set`` gives O(1) lookups).

    Returns:
        The filtered words joined by single spaces, or the placeholder
        ``"CVxTz"`` when no word survives.
    """
    kept = [word for word in text.split() if word in val_w]
    if not kept:
        # Never return an empty document; use the same placeholder as for NaNs.
        kept = ["CVxTz"]
    return " ".join(kept)
        
# Build the vocabulary over train + test and drop words seen in only one comment.
all_text = list(list_sentences_train) + list(list_sentences_test)
print(len(all_text))
word_counts = count_words(all_text)
print(len(word_counts))

# Keep only words that occur in more than one comment.
val_words = {word for word, n_docs in word_counts.items() if n_docs > 1}
print(len(val_words))

# Filtered copies of the comments; kept as plain lists because later cells
# concatenate them with `+`.
list_sentences_f_train = [remove_low_freq_words(sentence, val_words)
                          for sentence in tqdm(list_sentences_train)]
list_sentences_f_test = [remove_low_freq_words(sentence, val_words)
                         for sentence in tqdm(list_sentences_test)]

312735
203883


  6%|▌         | 9048/159571 [00:00<00:01, 90433.08it/s]

101897


100%|██████████| 159571/159571 [00:01<00:00, 90783.46it/s]
100%|██████████| 153164/153164 [00:01<00:00, 97668.36it/s] 


In [None]:
def tokenize(s):
    """Split ``s`` on single space characters (consecutive spaces yield empty tokens)."""
    tokens = s.split(" ")
    return tokens

# Word uni- and bi-gram TF-IDF features.  The three switches are boolean
# options; they are passed as real booleans (not 1) because recent
# scikit-learn releases validate parameter types.
vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize,
                      min_df=3, max_df=0.9, strip_accents='unicode',
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

# The vocabulary and IDF weights are learned on the training comments only,
# then applied unchanged to the test comments.
train_tfidf = vec.fit_transform(list_sentences_f_train)
test_tfidf = vec.transform(list_sentences_f_test)

In [None]:
def batch_generator(X, y, batch_size, shuffle):
    """Endlessly yield dense mini-batches from a sparse feature matrix.

    Args:
        X: row-indexable sparse matrix exposing ``.shape`` and whose row
           slices expose ``.toarray()``.
        y: array of targets, indexable by an integer index array.
        batch_size: maximum number of rows per batch.
        shuffle: if true, the row order is reshuffled before every pass.

    Yields:
        ``(X_batch, y_batch)`` where ``X_batch`` is a dense array.  One pass
        consists of ``ceil(n_rows / batch_size)`` batches; the last batch of a
        pass may be smaller than ``batch_size``.
    """
    n_samples = X.shape[0]
    # Integer ceiling division.  A plain `/` floors under Python 2 (dropping
    # the tail and giving 0 batches for tiny inputs) and is a float under
    # Python 3 (so the wrap-around test below would never fire).
    number_of_batches = max(1, (n_samples + batch_size - 1) // batch_size)
    sample_index = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(sample_index)
    counter = 0
    while True:
        start = batch_size * counter
        batch_index = sample_index[start:start + batch_size]
        X_batch = X[batch_index, :].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if counter >= number_of_batches:
            # End of a pass: start over, optionally in a new order.
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

In [None]:
class nn_multiclass_clf:
    """Feed-forward network that predicts every label column with one model."""

    def __init__(self):
        # Set by fit(); None until the model has been trained.
        self.model = None

    def get_mdl(self, X, y):
        """Build, compile and train the network, then return it.

        Args:
            X: sparse feature matrix; batches are densified by ``batch_generator``.
            y: 1-D (single label) or 2-D (one column per label) target array.

        Returns:
            The trained Keras ``Sequential`` model.
        """
        adam = Adam(lr=0.001)
        model = Sequential()
        model.add(BatchNormalization(input_shape=(X.shape[1],)))
        model.add(Dense(64, activation='sigmoid'))
        model.add(Dropout(rate=0.6))
        model.add(Dense(2048, activation='sigmoid'))
        model.add(Dropout(rate=0.6))

        n_outputs = y.shape[1] if len(y.shape) > 1 else 1
        model.add(Dense(n_outputs, activation='sigmoid'))
        # Every output unit is an independent sigmoid, so each one gets its own
        # binary cross-entropy term.  categorical_crossentropy assumes a single
        # distribution over the outputs and does not fit sigmoid outputs.
        model.compile(loss="binary_crossentropy", optimizer=adam, metrics=["accuracy"])

        batch_size = 512
        # Integer number of batches per epoch (ceiling), at least one.
        steps = max(1, (X.shape[0] + batch_size - 1) // batch_size)
        # Keras 2 argument names, matching the rest of the notebook.
        model.fit_generator(generator=batch_generator(X, y, batch_size, True),
                            epochs=1,
                            steps_per_epoch=steps)
        return model

    def fit(self, X, y):
        """Train a fresh model on ``(X, y)`` and keep it in ``self.model``."""
        self.model = self.get_mdl(X, y)

    def predict(self, X):
        """Return the trained model's predictions for ``X``."""
        return self.model.predict(X)

In [None]:
class nn_1vsall_clf:
    """One feed-forward network per label column (one-vs-rest)."""

    def __init__(self):
        # Trained networks, one per label column (a single one for 1-D targets).
        self.models = []

    def get_mdl(self, X, y):
        """Build, compile and train one network, then return it.

        Args:
            X: sparse feature matrix; batches are densified by ``batch_generator``.
            y: target array; ``fit`` always passes a 1-D column, a 2-D array
               produces one sigmoid output per column.

        Returns:
            The trained Keras ``Sequential`` model.
        """
        adam = Adam(lr=0.001)
        model = Sequential()
        model.add(BatchNormalization(input_shape=(X.shape[1],)))
        model.add(Dense(64, activation='sigmoid'))
        model.add(Dropout(rate=0.6))
        model.add(Dense(2048, activation='sigmoid'))
        model.add(Dropout(rate=0.6))

        n_outputs = y.shape[1] if len(y.shape) > 1 else 1
        model.add(Dense(n_outputs, activation='sigmoid'))
        # Independent sigmoid outputs -> binary cross-entropy in both cases.
        model.compile(loss="binary_crossentropy", optimizer=adam, metrics=["accuracy"])

        batch_size = 512
        # Integer number of batches per epoch (ceiling), at least one.
        steps = max(1, (X.shape[0] + batch_size - 1) // batch_size)
        # Keras 2 argument names, matching the rest of the notebook.
        model.fit_generator(generator=batch_generator(X, y, batch_size, True),
                            epochs=10,
                            steps_per_epoch=steps)
        return model

    def fit(self, X, y):
        """Train one network per column of ``y`` (or one network if ``y`` is 1-D)."""
        # Start from scratch so that refitting does not append extra columns.
        self.models = []
        if len(y.shape) > 1:
            for col in range(y.shape[1]):
                self.models.append(self.get_mdl(X, y[:, col]))
        else:
            self.models.append(self.get_mdl(X, y))

    def predict(self, X):
        """Return an ``(n_rows, n_models)`` array, one prediction column per network."""
        preds = np.zeros((X.shape[0], len(self.models)))
        for col, m in enumerate(self.models):
            preds[:, col] = m.predict(X).flatten()
        return preds

In [None]:
class NB_LR:
    """Logistic regression on features scaled by a naive-Bayes log-count ratio.

    One binary model is trained per label column.
    """

    def __init__(self):
        # (fitted model, log-ratio vector) pairs, one per label column.
        self.models = []
        # Unused by this class; kept so existing attribute access keeps working.
        self.r = []

    def pr(self, x, y_i, y):
        """Smoothed per-feature sum over the rows whose label equals ``y_i``.

        Returns ``(column sums of x[y == y_i] + 1) / (number of such rows + 1)``.
        """
        p = x[y == y_i].sum(0)
        # Float constants force true division even for integer count matrices
        # under Python 2.
        return (p + 1.0) / ((y == y_i).sum() + 1.0)

    def get_mdl(self, X, y):
        """Fit one binary model; return ``(fitted_model, log_ratio)``."""
        r = np.log(self.pr(X, 1, y) / self.pr(X, 0, y))
        # dual=True is only supported by the liblinear solver.  Naming the
        # solver keeps this working on scikit-learn releases whose default
        # solver is lbfgs.
        m = LogisticRegression(C=10, dual=True, solver='liblinear')
        x_nb = X.multiply(r)
        return m.fit(x_nb, y), r

    def fit(self, X, y):
        """Train one model per column of ``y`` (or one model if ``y`` is 1-D)."""
        # Start from scratch so that refitting does not append extra columns.
        self.models = []
        if len(y.shape) > 1:
            for col in range(y.shape[1]):
                self.models.append(self.get_mdl(X, y[:, col]))
        else:
            self.models.append(self.get_mdl(X, y))

    def predict(self, X):
        """Return an ``(n_rows, n_models)`` array of positive-class probabilities."""
        preds = np.zeros((X.shape[0], len(self.models)))
        for col, (m, r) in enumerate(self.models):
            preds[:, col] = m.predict_proba(X.multiply(r))[:, 1]
        return preds


In [5]:
class stepwise_clf:
    """Train the same model class twice: on all labels and on an "any label" target."""

    def __init__(self, y):
        """Store the label matrix and derive the row-wise maximum target.

        Args:
            y: 2-D label array.  ``y.max(axis=1)`` is used as the second
               target (1 when any label is set, assuming 0/1 labels).
        """
        self.y_all = y
        self.y_toxic_or_not = y.max(axis=1)

    def fit(self, model, X):
        """Instantiate ``model`` twice and fit both instances on ``X``.

        Args:
            model: zero-argument callable returning an object with
                   ``fit(X, y)`` and ``predict(X)``.
            X: feature matrix passed through to both models.
        """
        self.model_all = model()
        self.model_toxic_or_not = model()
        # print() with a single argument behaves the same on Python 2 and 3.
        print("fitting all")
        self.model_all.fit(X, self.y_all)
        print("fitting toxic or not")
        self.model_toxic_or_not.fit(X, self.y_toxic_or_not)

    def predict(self, X):
        """Return ``(predictions_all_labels, predictions_any_label)`` for ``X``."""
        return (self.model_all.predict(X), self.model_toxic_or_not.predict(X))

In [None]:
class LR:
    """One-vs-rest logistic regression: one binary model per label column."""

    def __init__(self):
        # Fitted estimators, one per label column (a single one for 1-D targets).
        self.models = []

    def get_mdl(self, X, y):
        """Fit and return one binary ``LogisticRegression`` on ``(X, y)``."""
        # dual=True is only supported by the liblinear solver.  Naming the
        # solver keeps this working on scikit-learn releases whose default
        # solver is lbfgs.
        m = LogisticRegression(C=10, dual=True, solver='liblinear')
        return m.fit(X, y)

    def fit(self, X, y):
        """Train one model per column of ``y`` (or one model if ``y`` is 1-D)."""
        # Start from scratch so that refitting does not append extra columns.
        self.models = []
        if len(y.shape) > 1:
            for col in range(y.shape[1]):
                self.models.append(self.get_mdl(X, y[:, col]))
        else:
            self.models.append(self.get_mdl(X, y))

    def predict(self, X):
        """Return an ``(n_rows, n_models)`` array of positive-class probabilities."""
        preds = np.zeros((X.shape[0], len(self.models)))
        for col, m in enumerate(self.models):
            preds[:, col] = m.predict_proba(X)[:, 1]
        return preds


In [None]:
import xgboost as xgb

class xgboost:
    """One gradient-boosted tree model per label column, via the native ``xgb.train`` API."""

    def __init__(self):
        # Trained boosters, one per label column (a single one for 1-D targets).
        self.models = []
        # Booster parameters.  The number of boosting rounds is NOT set here:
        # with xgb.train it comes from num_boost_round (see num_steps), and
        # 'n_estimators' only has meaning for the scikit-learn wrapper, so the
        # misleading entry was removed.
        self.xgb_params = {'eta': 0.3,
                           'max_depth': 5,
                           'subsample': 0.8,
                           'colsample_bytree': 0.8,
                           'objective': 'binary:logistic',
                           'eval_metric': 'auc',
                           'seed': 23,
                           'nthread': 4
                           }
        # Number of boosting rounds per model.
        self.num_steps = 200

    def get_mdl(self, X, y):
        """Train and return one booster on ``(X, y)``."""
        dtrain = xgb.DMatrix(X, label=y)
        return xgb.train(self.xgb_params, dtrain, num_boost_round=self.num_steps)

    def fit(self, X, y):
        """Train one booster per column of ``y`` (or one booster if ``y`` is 1-D)."""
        # Start from scratch so that refitting does not append extra columns.
        self.models = []
        if len(y.shape) > 1:
            for col in range(y.shape[1]):
                self.models.append(self.get_mdl(X, y[:, col]))
        else:
            self.models.append(self.get_mdl(X, y))

    def predict(self, X):
        """Return an ``(n_rows, n_models)`` array, one prediction column per booster."""
        preds = np.zeros((X.shape[0], len(self.models)))
        # Build the DMatrix once instead of once per model.
        dmat = xgb.DMatrix(X)
        for col, m in enumerate(self.models):
            preds[:, col] = m.predict(dmat)
        return preds
    

In [None]:
from sklearn.ensemble import RandomForestClassifier

class randomforest:
    """One random-forest classifier per label column (one-vs-rest)."""

    def __init__(self):
        # Fitted forests, one per label column (a single one for 1-D targets).
        self.models = []

    def get_mdl(self, X, y):
        """Fit and return one ``RandomForestClassifier`` on ``(X, y)``."""
        m = RandomForestClassifier(n_estimators=100, n_jobs=4)
        return m.fit(X, y)

    def fit(self, X, y):
        """Train one forest per column of ``y`` (or one forest if ``y`` is 1-D)."""
        # Start from scratch so that refitting does not append extra columns.
        self.models = []
        if len(y.shape) > 1:
            for col in range(y.shape[1]):
                # Progress indicator: index of the label column being fitted.
                print(col)
                self.models.append(self.get_mdl(X, y[:, col]))
        else:
            self.models.append(self.get_mdl(X, y))

    def predict(self, X):
        """Return an ``(n_rows, n_models)`` array of positive-class probabilities."""
        preds = np.zeros((X.shape[0], len(self.models)))
        for col, m in enumerate(self.models):
            preds[:, col] = m.predict_proba(X)[:, 1]
        return preds


In [None]:
# Not used in this cell; hoisted out of the inner loop and kept so that the
# name `sparse` stays defined for anything that relied on it.
from scipy import sparse

# Hold out a third of the training rows for validation.  The split is seeded
# so the reported AUCs are reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(train_tfidf, y, test_size=0.33,
                                                  random_state=23)

all_models = {"nb_lr" : NB_LR, "gboost" :xgboost }

for name, m in all_models.items():
    print(name)
    setwiser = stepwise_clf(y_train)
    setwiser.fit(m, X_train)

    # Level-1 features: (per-label predictions, any-label prediction).
    # NOTE(review): the training features are predictions on the same rows the
    # level-1 models were fitted on, so the stacker sees in-sample scores --
    # consider out-of-fold predictions.
    X_val_lvl1 = list(setwiser.predict(X_val))
    X_train_lvl1 = list(setwiser.predict(X_train))

    # Each selection is a 0/1 mask over the two level-1 outputs above.
    for sel in [[1,0],[0,1],[1,1]]:
        picked = np.where(np.array(sel) == 1)[0]
        X_train_stacked = np.column_stack([X_train_lvl1[i] for i in picked])
        X_val_stacked = np.column_stack([X_val_lvl1[i] for i in picked])

        print(X_train_stacked.shape)
        print(X_val_stacked.shape)

        # Level-2 model: per-label logistic regression on the stacked predictions.
        lr = LR()
        lr.fit(X_train_stacked, y_train)
        pred = lr.predict(X_val_stacked)
        print(roc_auc_score(y_val, pred))

In [None]:
all_models = {"nb_lr" : NB_LR, "xgboost" :xgboost, "rnd_forest" : randomforest}

# NOTE(review): absolute, machine-specific output directory -- adjust before
# running on another machine.
out_dir = "/home/ubuntu/kaggle/toxicity/team_ensembl/Florian/out"

# 0/1 masks over the two level-1 outputs (per-label predictions, any-label prediction).
selections = {"normal" : [1,0], "toxicOnly" : [0,1], "combined" : [1,1]}

for name, m in all_models.items():
    print(name)
    # Fit on the full training set, then score both test and train rows.
    setwiser = stepwise_clf(y)
    setwiser.fit(m, train_tfidf)

    X_test_lvl1 = list(setwiser.predict(test_tfidf))
    X_train_lvl1 = list(setwiser.predict(train_tfidf))

    for sel_name, sel in selections.items():
        picked = np.where(np.array(sel) == 1)[0]
        X_train_stacked = np.column_stack([X_train_lvl1[i] for i in picked])
        X_test_stacked = np.column_stack([X_test_lvl1[i] for i in picked])
        # One headerless CSV per (model, selection): id column followed by the predictions.
        pd.DataFrame(np.column_stack([train["id"].values, X_train_stacked])).to_csv(
            "%s/train_1703_FG_%s_%s.csv" % (out_dir, name, sel_name), header=False, index=False)
        pd.DataFrame(np.column_stack([test["id"].values, X_test_stacked])).to_csv(
            "%s/test_1703_FG_%s_%s.csv" % (out_dir, name, sel_name), header=False, index=False)
        print(X_train_stacked.shape)
        print(X_test_stacked.shape)

In [None]:
all_models = {"nn" : nn_1vsall_clf}

# NOTE(review): absolute, machine-specific output directory -- adjust before
# running on another machine.
out_dir = "/home/ubuntu/kaggle/toxicity/team_ensembl/Florian/out"

# 0/1 masks over the two level-1 outputs (per-label predictions, any-label prediction).
selections = {"normal" : [1,0], "toxicOnly" : [0,1], "combined" : [1,1]}

for name, m in all_models.items():
    print(name)
    # Fit on the full training set, then score both test and train rows.
    setwiser = stepwise_clf(y)
    setwiser.fit(m, train_tfidf)

    X_test_lvl1 = list(setwiser.predict(test_tfidf))
    X_train_lvl1 = list(setwiser.predict(train_tfidf))

    for sel_name, sel in selections.items():
        picked = np.where(np.array(sel) == 1)[0]
        X_train_stacked = np.column_stack([X_train_lvl1[i] for i in picked])
        X_test_stacked = np.column_stack([X_test_lvl1[i] for i in picked])
        # One headerless CSV per (model, selection): id column followed by the predictions.
        pd.DataFrame(np.column_stack([train["id"].values, X_train_stacked])).to_csv(
            "%s/train_1703_FG_%s_%s.csv" % (out_dir, name, sel_name), header=False, index=False)
        pd.DataFrame(np.column_stack([test["id"].values, X_test_stacked])).to_csv(
            "%s/test_1703_FG_%s_%s.csv" % (out_dir, name, sel_name), header=False, index=False)
        print(X_train_stacked.shape)
        print(X_test_stacked.shape)

In [6]:
from keras.preprocessing import text, sequence
import gensim.models.word2vec as w2v
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Generate common embeddings: a Keras tokenizer turns the filtered comments
# into padded index sequences, and a word2vec model trained on the same
# comments supplies the initial embedding weights.
tokenizer = text.Tokenizer(num_words=len(val_words))
tokenizer.fit_on_texts(list(list_sentences_f_train) + list(list_sentences_f_test))
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

maxlen = 150
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_f_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_f_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

# word2vec expects each sentence as a list of tokens.
list_senteces_all = [sentence.split(" ")
                     for sentence in tqdm(list_sentences_f_train + list_sentences_f_test)]

# hyper parameters of the word2vec model
num_features = 300 # dimensions of each word embedding
min_word_count = 1 # this is not advisable but since we need to extract
# feature vector for each word we need to do this
context_size = 7 # context window length
downsampling = 1e-3 # downsampling for very frequent words
seed = 1 # seed for random number generator to make results reproducible

word2vec_ = w2v.Word2Vec(
    sg = 1, seed = seed,
    workers = 4,
    size = num_features,
    min_count = min_word_count,
    window = context_size,
    sample = downsampling
)

word2vec_.build_vocab(list_senteces_all)
word2vec_.train(list_senteces_all, total_examples = word2vec_.corpus_count, epochs = 10)

# One row per tokenizer index plus row 0; the width follows num_features so
# the matrix cannot drift from the word2vec dimensionality.
embedding_matrix = np.zeros((len(word_index) + 1, num_features))
for word, i in word_index.items():
    # Debug check: report any word mapped to index 0.
    if i == 0:
        print(word)
    if word in word2vec_.wv:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = word2vec_.wv[word]

Found 98821 unique tokens


100%|██████████| 312735/312735 [00:01<00:00, 183754.64it/s]


In [7]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

# Dimensions shared with the embedding cell.  They are derived from the
# objects built there instead of being hard-coded (previously 98822 / 300 /
# 150), so they stay correct if the vocabulary, embedding width or padding
# length changes.
max_features = embedding_matrix.shape[0]   # len(word_index) + 1
embed_size = embedding_matrix.shape[1]     # word2vec vector width
maxlen = X_t.shape[1]                      # padded sequence length


class lstm:
    """One bidirectional-LSTM binary classifier per label column (one-vs-rest)."""

    def __init__(self):
        # Trained networks, one per label column (a single one for 1-D targets).
        self.models = []

    def get_mdl(self, X, y):
        """Build, compile and train one network, then return it.

        Args:
            X: 2-D array of padded token-index sequences.
            y: 1-D array of binary targets.

        Returns:
            The trained Keras ``Model``.
        """
        # Take the sizes from the actual data instead of hard-coded numbers:
        # the Embedding layer must match the shape of ``embedding_matrix``
        # exactly and the input length must match the padded width of ``X``.
        max_features, embed_size = embedding_matrix.shape
        maxlen = X.shape[1]

        inp = Input(shape=(maxlen,))
        x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
        x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.25, recurrent_dropout=0.1))(x)
        x = GlobalMaxPool1D()(x)
        x = Dense(50, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=inp, outputs=x)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(X, y, batch_size=512, epochs=20)
        return model

    def fit(self, X, y):
        """Train one network per column of ``y`` (or one network if ``y`` is 1-D)."""
        # Start from scratch so that refitting does not append extra columns.
        self.models = []
        if len(y.shape) > 1:
            for col in range(y.shape[1]):
                self.models.append(self.get_mdl(X, y[:, col]))
        else:
            self.models.append(self.get_mdl(X, y))

    def predict(self, X):
        """Return an ``(n_rows, n_models)`` array, one prediction column per network."""
        preds = np.zeros((X.shape[0], len(self.models)))
        for col, m in enumerate(self.models):
            preds[:, col] = m.predict(X).flatten()
        return preds
    

In [8]:
all_models = {"lstm" : lstm}

# NOTE(review): absolute, machine-specific output directory -- adjust before
# running on another machine.
out_dir = "/home/ubuntu/kaggle/toxicity/team_ensembl/Florian/out"

# 0/1 masks over the two level-1 outputs (per-label predictions, any-label prediction).
selections = {"normal" : [1,0], "toxicOnly" : [0,1], "combined" : [1,1]}

for name, m in all_models.items():
    print(name)
    # Fit on the full padded training sequences, then score both test and train rows.
    setwiser = stepwise_clf(y)
    setwiser.fit(m, X_t)

    X_test_lvl1 = list(setwiser.predict(X_te))
    X_train_lvl1 = list(setwiser.predict(X_t))

    for sel_name, sel in selections.items():
        picked = np.where(np.array(sel) == 1)[0]
        X_train_stacked = np.column_stack([X_train_lvl1[i] for i in picked])
        X_test_stacked = np.column_stack([X_test_lvl1[i] for i in picked])
        # One headerless CSV per (model, selection): id column followed by the predictions.
        pd.DataFrame(np.column_stack([train["id"].values, X_train_stacked])).to_csv(
            "%s/train_1703_FG_%s_%s.csv" % (out_dir, name, sel_name), header=False, index=False)
        pd.DataFrame(np.column_stack([test["id"].values, X_test_stacked])).to_csv(
            "%s/test_1703_FG_%s_%s.csv" % (out_dir, name, sel_name), header=False, index=False)
        print(X_train_stacked.shape)
        print(X_test_stacked.shape)

lstm
fitting all
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20


Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
fitting toxic or not
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
(159571, 1)
(153164, 1)
(159571, 7)
(153164, 7)
(159571, 6)
(153164, 6)
