In [1]:
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout
from  keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
import copy

import sklearn.model_selection
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train = pd.read_csv("./train_cleaned_no-stopwords.csv")
test = pd.read_csv("./test_cleaned_no-stopwords.csv")
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

In [3]:
def count_words(corpus):
    word_counts = {}
    for sentence in corpus:
        for word in set(sentence.split()):
                if word not in word_counts: word_counts[word] = 0
                word_counts[word] += 1
    return word_counts

def remove_low_freq_words(text, val_w):
    text = set(text.split())
    text = list(text & val_w)
    if len(text) == 0:
        text = ["CVxTz"]
    text = " ".join(text)    
    return text
        
all_text = list(list_sentences_train) + list(list_sentences_test)
print len(all_text)
word_counts = count_words(all_text)
print len(word_counts)

val_words = set()
for k, v in word_counts.items():
    if v > 1:
        val_words.add(k)
print(len(val_words))
list_sentences_f_train  = ["CVxTz"] * len(list_sentences_train)
list_sentences_f_test   = ["CVxTz"] * len(list_sentences_test)

for i in tqdm(range(len(list_sentences_train))):
    list_sentences_f_train[i] = remove_low_freq_words(list_sentences_train[i], val_words)

for i in tqdm(range(len(list_sentences_test))):
    list_sentences_f_test[i] = remove_low_freq_words(list_sentences_test[i], val_words)

312735
203883


  6%|▌         | 9153/159571 [00:00<00:01, 91526.16it/s]

101897


100%|██████████| 159571/159571 [00:01<00:00, 93181.06it/s]
100%|██████████| 153164/153164 [00:01<00:00, 98876.98it/s] 


In [4]:
def tokenize(s): return s.split(" ")

vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )

train_tfidf = vec.fit_transform(list_sentences_f_train)
test_tfidf = vec.transform(list_sentences_f_test)

In [5]:
def batch_generator(X, y, batch_size, shuffle):
    number_of_batches = X.shape[0]/batch_size
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

In [6]:
class nn:
    
        
    def __init__(self):
        self.models = []
    
    def get_mdl(self, X, y):
        adam = Adam(lr=0.001)
        model = Sequential()
        model.add(BatchNormalization(input_shape=(X.shape[1],)))
        model.add(Dense(2, activation='sigmoid'))
        model.add(Dropout(rate=0.6))
        model.add(Dense(2, activation='sigmoid'))
        model.add(Dropout(rate=0.6))
        #self.model.add(Dense(2048, activation='sigmoid'))
        #self.model.add(Dropout(rate=0.6))
        #self.model.add(Dense(2048, activation='sigmoid'))
        #self.model.add(Dropout(rate=0.6))
        if len(y.shape) > 1:
            model.add(Dense(y.shape[1], activation='sigmoid'))
            model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=["accuracy"])
        else:
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss="binary_crossentropy", optimizer=adam, metrics=["accuracy"])

        #print(self.model.summary())
        batch_size = 512
        model.fit_generator(generator=batch_generator(X, y, batch_size, True),
                            nb_epoch=1,
                            samples_per_epoch=X.shape[0]/batch_size)
        return model

    def fit(self, X, y):
        if len(y.shape)>1:
            for i in range(y.shape[1]):
                m = self.get_mdl(X, y[:,i])
                self.models.append((m))
        else:
            m = self.get_mdl(X, y)
            self.models.append((m))
        
    def predict(self, X):
        preds = np.zeros((X.shape[0], len(self.models)))
        for i in range(len(self.models)):
            m = self.models[i]
            preds[:,i] = m.predict(X).flatten()
        return preds

In [7]:
class NB_LR:
    
    def __init__(self):
        self.models = []
        self.r  =[]
    
    def pr(self, x, y_i, y):
        p = x[y==y_i].sum(0)
        return (p+1) / ((y==y_i).sum()+1)

    def get_mdl(self, X, y):
        #y = y.values
        r = np.log(self.pr(X, 1,y) / self.pr(X, 0,y))
        m = LogisticRegression(C=10, dual=True)
        #m = SVC(kernel='sigmoid', probability = True)
        x_nb = X.multiply(r)
        return m.fit(x_nb, y), r
    
    def fit(self, X, y):
        if len(y.shape)>1:
            for i in range(y.shape[1]):
                m,r = self.get_mdl(X, y[:,i])
                self.models.append((m, r))
        else:
            m,r = self.get_mdl(X, y)
            self.models.append((m, r))
        
    def predict(self, X):
        preds = np.zeros((X.shape[0], len(self.models)))
        for i in range(len(self.models)):
            m = self.models[i]
            preds[:,i] = m.predict_proba(X.multiply(r))[:,1]
        return preds


In [8]:
X_train, X_val, y_train, y_val = train_test_split(train_tfidf, y, test_size=0.33)
#thisnn = nn()
#print(X_train.shape)
#thisnn.fit(X_train, y_train)

In [9]:
class stepwise_clf:
    
    
    def __init__(self, y):
        self.y_all = y
        self.y_toxic_or_not = y.max(axis=1)
        self.toxic_row_indices = list(np.where(self.y_toxic_or_not==1)[0])
        
    def fit(self, model, X):
        self.model_all = model()
        self.model_toxic_or_not = model()
#        self.model_only_toxic = model()
        print "fitting all"
        self.model_all.fit(X, self.y_all)
#        print "fitting only toxic"
 #       self.model_only_toxic.fit(X[self.toxic_row_indices,], self.y_all[self.toxic_row_indices,])
        print "fitting toxic or not"
        self.model_toxic_or_not.fit(X, self.y_toxic_or_not)
        
    def predict(self, X):
#        return (self.model_all.predict(X), self.model_only_toxic.predict(X), self.model_toxic_or_not.predict(X))
        return (self.model_all.predict(X), self.model_toxic_or_not.predict(X))

    def save_models(self, outdir):
        self.model_all.model.save(outdir + "nn_all.h5")
        self.model_toxic_or_not.model.save(outdir + "nn_toxic_or_not.h5")
#        self.model_only_toxic.model.save(outdir + "nn_only_toxic.h5")

In [10]:
class LR:
    
    def __init__(self):
        self.models = []

    def get_mdl(self, X, y):
        m = LogisticRegression(C=10, dual=True)
        return m.fit(X, y)
    
    def fit(self, X, y):
        if len(y.shape)>1:
            for i in range(y.shape[1]):
                m = self.get_mdl(X, y[:,i])
                self.models.append((m))
        else:
            m = self.get_mdl(X, y)
            self.models.append((m))
        
    def predict(self, X):
        preds = np.zeros((X.shape[0], len(self.models)))
        for i in range(len(self.models)):
            m = self.models[i]
            preds[:,i] = m.predict_proba(X)[:,1]
        return preds


In [23]:
setwiser = stepwise_clf(y_train)
setwiser.fit(NB_LR, X_train)

fitting all
fitting toxic or not


In [37]:
setwiser.save_models("/home/ubuntu/kaggle/toxicity/team_ensembl/Florian/")

In [None]:
X_train[list(np.where(y_train.max(axis=1)==1)[0]),]

In [None]:
tmp = y_train.max(axis=1)
tmp[1:20]

In [None]:
y_train[0:20,]

In [59]:
pred_all_val, _, _ = setwiser.predict(X_val)
print(roc_auc_score(y_val, pred_all_val))

0.9749131178039706


In [83]:
X_val_lvl1 = list(setwiser.predict(X_val))
X_train_lvl1 = list(setwiser.predict(X_train))

In [90]:
sel = np.array([1,0,0])
np.where(sel==1)[0]

array([0])

In [103]:
for sel in [[1,0,0],[0,1,0],[0,0,1],[1,1,0],[1,0,1],[1,1,1]]:
    sel = np.array(sel)
    X_train_stacked = sparse.csr_matrix(np.column_stack([X_train_lvl1[i]  for i in np.where(sel==1)[0]]))
    X_val_stacked = sparse.csr_matrix(np.column_stack([X_val_lvl1[i]  for i in np.where(sel==1)[0]]))

    print(X_train_stacked.shape)
    print(X_val_stacked.shape)

    from scipy import sparse
    l = LogisticRegression(C=10, dual=True)
    lr = LR()
    lr.fit(X_train_stacked, y_train)
    pred = lr.predict(X_val_stacked)
    print(roc_auc_score(y_val, pred))

(106912, 6)
(52659, 6)
0.975814284462718
(106912, 6)
(52659, 6)
0.9393553331091504
(106912, 1)
(52659, 1)
0.9716189388852837
(106912, 12)
(52659, 12)
0.956282528779225
(106912, 7)
(52659, 7)
0.9755058500225573
(106912, 13)
(52659, 13)
0.9579008327742425


In [11]:
setwiser_nn = stepwise_clf(y_train)
setwiser_nn.fit(nn, X_train)

fitting all




Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
fitting toxic or not
Epoch 1/1


In [12]:
tmp = setwiser_nn.model_all.predict(X_val)

In [13]:
tmp

array([[0.17767881, 0.25328243, 0.2895647 , 0.35339016, 0.45781237,
        0.45184532],
       [0.17767884, 0.25328243, 0.29105347, 0.35339016, 0.45781237,
        0.45184532],
       [0.17767887, 0.25328243, 0.2928071 , 0.35339016, 0.45781237,
        0.45184532],
       ...,
       [0.1776789 , 0.25328243, 0.28967503, 0.35339016, 0.45781237,
        0.45184532],
       [0.17767881, 0.25328243, 0.29013598, 0.35339016, 0.45781237,
        0.45184532],
       [0.17767887, 0.25328243, 0.29073977, 0.35339016, 0.45781237,
        0.45184532]])

In [38]:
preds = np.zeros((len(list(tmp.flatten())), 6))
preds[:,0]  = tmp.flatten()

In [14]:
X_val_lvl1 = list(setwiser_nn.predict(X_val))
X_train_lvl1 = list(setwiser_nn.predict(X_train))

#for sel in [[1,0,0],[0,1,0],[0,0,1],[1,1,0],[1,0,1],[1,1,1]]:
for sel in [[1,0],[0,1],[1,1]]:
    sel = np.array(sel)
    X_train_stacked = sparse.csr_matrix(np.column_stack([X_train_lvl1[i]  for i in np.where(sel==1)[0]]))
    X_val_stacked = sparse.csr_matrix(np.column_stack([X_val_lvl1[i]  for i in np.where(sel==1)[0]]))

    print(X_train_stacked.shape)
    print(X_val_stacked.shape)

    from scipy import sparse
    l = LogisticRegression(C=10, dual=True)
    lr = LR()
    lr.fit(X_train_stacked, y_train)
    pred = lr.predict(X_val_stacked)
    print(roc_auc_score(y_val, pred))

NameError: name 'sparse' is not defined

In [15]:
X_val_lvl1

[array([[0.17767881, 0.25328243, 0.2895647 , 0.35339016, 0.45781237,
         0.45184532],
        [0.17767884, 0.25328243, 0.29105347, 0.35339016, 0.45781237,
         0.45184532],
        [0.17767887, 0.25328243, 0.2928071 , 0.35339016, 0.45781237,
         0.45184532],
        ...,
        [0.1776789 , 0.25328243, 0.28967503, 0.35339016, 0.45781237,
         0.45184532],
        [0.17767881, 0.25328243, 0.29013598, 0.35339016, 0.45781237,
         0.45184532],
        [0.17767887, 0.25328243, 0.29073977, 0.35339016, 0.45781237,
         0.45184532]]), array([[0.27818397],
        [0.27673677],
        [0.33326322],
        ...,
        [0.27240846],
        [0.25693822],
        [0.31539443]])]

In [16]:
#for sel in [[1,0,0],[0,1,0],[0,0,1],[1,1,0],[1,0,1],[1,1,1]]:
for sel in [[1,0],[0,1],[1,1]]:
    sel = np.array(sel)
    X_train_stacked = np.column_stack([X_train_lvl1[i]  for i in np.where(sel==1)[0]])
    X_val_stacked = np.column_stack([X_val_lvl1[i]  for i in np.where(sel==1)[0]])

    print(X_train_stacked.shape)
    print(X_val_stacked.shape)

    from scipy import sparse
    l = LogisticRegression(C=10, dual=True)
    lr = LR()
    lr.fit(X_train_stacked, y_train)
    pred = lr.predict(X_val_stacked)
    print(roc_auc_score(y_val, pred))

(106912, 6)
(52659, 6)
0.8655980142852702
(106912, 1)
(52659, 1)
0.9347554027827312
(106912, 7)
(52659, 7)
0.9348410781206321


In [103]:
for sel in [[1,0,0],[0,1,0],[0,0,1],[1,1,0],[1,0,1],[1,1,1]]:
    sel = np.array(sel)
    X_train_stacked = sparse.csr_matrix(np.column_stack([X_train_lvl1[i]  for i in np.where(sel==1)[0]]))
    X_val_stacked = sparse.csr_matrix(np.column_stack([X_val_lvl1[i]  for i in np.where(sel==1)[0]]))

    print(X_train_stacked.shape)
    print(X_val_stacked.shape)

    from scipy import sparse
    l = LogisticRegression(C=10, dual=True)
    lr = LR()
    lr.fit(X_train_stacked, y_train)
    pred = lr.predict(X_val_stacked)
    print(roc_auc_score(y_val, pred))

(106912, 6)
(52659, 6)
0.975814284462718
(106912, 6)
(52659, 6)
0.9393553331091504
(106912, 1)
(52659, 1)
0.9716189388852837
(106912, 12)
(52659, 12)
0.956282528779225
(106912, 7)
(52659, 7)
0.9755058500225573
(106912, 13)
(52659, 13)
0.9579008327742425
