In [1]:
import os
import re
import nltk
import regex
import pandas as pd
import numpy as np
import time
import tqdm
import torch
from torch.utils import data
from keras.preprocessing import text, sequence
from sklearn.model_selection import KFold
import multiprocessing as mp
from torch import nn
import random
n_cores = mp.cpu_count()

import gensim, logging # for word2vec
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Using TensorFlow backend.


In [2]:
# constants
POS_VEC_SIZE = 50
NUM_SPLITS = 5
SMALL_DATA = False
GEN_POS_TAGS = True
NUM_MODELS = 5
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS

In [3]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def train_model(model, train, test, loss_fn, output_dim, lr=0.001,
                batch_size=512, n_epochs=4, n_epochs_embed=2,
                enable_checkpoint_ensemble=True):
    
    
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    
    for epoch in range(n_epochs):
        start_time = time.time()
        
        scheduler.step()
        
        model.train() #set model to train mode
        avg_loss = 0.
        
        for data in tqdm(train_loader, disable=False):
            
            #training loop
            x_batch = data[:-1]
            y_batch = data[-1]

            y_pred = model(*x_batch)  #feed data into model          
            loss = loss_fn(y_pred, y_batch)
            
            #calculate error and adjust model params

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            avg_loss += loss.item() / len(train_loader) #gets the loss per epoch
        
        
        model.eval() #set model to eval mode for test data
        test_preds = np.zeros((len(test), output_dim))
    
        for i, x_batch in enumerate(test_loader):
            y_pred = sigmoid(model(*x_batch).detach().cpu().numpy()) #feed data into model

            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred #get test predictions
        
        #test_preds has the predictions for the entire test set now
        all_test_preds.append(test_preds) #append predictions to the record of all past predictions
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
              epoch + 1, n_epochs, avg_loss, elapsed_time))
    return model

def predict(model, test, output_dim, batch_size=512):
    
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    
    model.eval() #set model to eval mode for test data
    test_preds = np.zeros((len(test), output_dim))
    
    for i, x_batch in enumerate(test_loader):
        y_pred = sigmoid(model(*x_batch).detach().cpu().numpy()) #feed data into model
        test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred #get test predictions
        
    return test_preds

def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [4]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        #call the forward method in Dropout2d (super function specifies the subclass and instance)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x
    

class NeuralNet(nn.Module):
    def __init__(self, embedding_matrix, num_aux_targets):
        #call the init mthod in Module (super function specifies the subclass and instance)
        super(NeuralNet, self).__init__() 
        embed_size = embedding_matrix.shape[1]
        max_features = 400000
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)
        
        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)
    
        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        
        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)
        
    def forward(self, x, lengths=None):
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)
        
        #first variable h_(lstm #) holds the output, _ is the (hidden state, cell state)
        h_lstm1, _ = self.lstm1(h_embedding) 
        h_lstm2, _ = self.lstm2(h_lstm1)
        
        # global average pooling
        avg_pool = torch.mean(h_lstm2, 1) #get the mean value of the first dimension in h_lstm2
        # global max pooling
        max_pool, _ = torch.max(h_lstm2, 1) #get the max value of the first dimension in h_lstm2
        
        h_conc = torch.cat((max_pool, avg_pool), 1)
        h_conc_linear1  = F.relu(self.linear1(h_conc))
        h_conc_linear2  = F.relu(self.linear2(h_conc))
        
        hidden = h_conc + h_conc_linear1 + h_conc_linear2
        
        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)
        
        return out

In [5]:
# the embedding matrix maps the word idx (instead of the word) to the embedding
def build_matrix(word_index, pos_model):
    embedding_matrix = np.zeros((len(word_index) + 1, POS_VEC_SIZE))
    unknown_words = []
    
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = pos_model[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

In [6]:
train = pd.DataFrame()
test = pd.DataFrame()
if GEN_POS_TAGS:
    print("--- Loading Files")
    start_time = time.time()
    train = pd.read_hdf('../input/train.h5')
    test = pd.read_hdf('../input/test.h5')
    print("--- Finished Loading %s" % (time.time() - start_time))
    
    
    # helper function for reading text files
    train_text = train["comment_text"]
    test_text = test["comment_text"]


    # fill in missing
    train_text[train_text.map(len)<=1] = 'neutral'
    test_text[test_text.map(len)<=1] = 'neutral'

    # map to pos
    def get_pos(x):
      tokens = nltk.tokenize.word_tokenize(x)
      tags = nltk.pos_tag(tokens)
      _, pos = zip(*tags) 
      return ' '.join(pos)

    print("--- creating train pos")
    start_time = time.time()
    if __name__ == '__main__':
       with mp.Pool(n_cores) as p:
          train_pos = list(tqdm.tqdm(p.imap(get_pos, train_text), total=30))
    print("--- Finished Loading %s" % (time.time() - start_time))

    print("--- creating test pos")
    start_time = time.time()
    if __name__ == '__main__':
       with mp.Pool(n_cores) as p:
          test_pos = list(tqdm.tqdm(p.imap(get_pos, test_text), total=30))
    print("--- Finished Loading %s" % (time.time() - start_time))
    
    # save the tags
    train = pd.DataFrame({'pos':train_pos,
                          'target': train['target'],
                          'severe_toxicity': train['severe_toxicity'],
                          'obscene': train['obscene'],
                          'identity_attack': train['identity_attack'],
                          'insult': train['insult'],
                          'threat': train['threat'],
                          'sexual_explicit': train['sexual_explicit'] # might not be so good
                         })
    test = pd.DataFrame({'pos':test_pos})
    train.to_hdf('../input/train_pos.h5',key='train_pos')
    test.to_hdf('../input/test_pos.h5',key='test_pos')
else:
    train = pd.read_hdf('../input/train_pos.h5')
    test = pd.read_hdf('../input/test_pos.h5')
if SMALL_DATA:
    print("Using small data")
    train = train[:100]
    test = test[:100]

--- Loading Files
--- Finished Loading 2.4167864322662354
--- creating train pos


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
1804874it [09:14, 3253.51it/s]                


--- Finished Loading 555.1695883274078
--- creating test pos


97320it [00:32, 3035.38it/s]                  


--- Finished Loading 32.65728688240051


In [7]:
train = pd.read_hdf('../input/train_pos.h5')
test = pd.read_hdf('../input/test_pos.h5')

In [8]:
train_pos = train["pos"].apply(lambda x: x.split()).values
test_pos = test["pos"].apply(lambda x: x.split()).values

In [9]:
both_pos = train_pos
np.append(both_pos,test_pos)

w2v_model = gensim.models.Word2Vec(iter=5, workers=n_cores,size=POS_VEC_SIZE)  # an empty model, no training yet
w2v_model.build_vocab(both_pos)  # can be a non-repeatable, 1-pass generator
w2v_model.train(both_pos,
            total_examples=w2v_model.corpus_count,
            epochs=w2v_model.iter)  # can be a non-repeatable, 1-pass generator

2019-06-04 20:45:14,871 : INFO : collecting all words and their counts
2019-06-04 20:45:14,871 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-06-04 20:45:14,923 : INFO : PROGRESS: at sentence #10000, processed 705467 words, keeping 45 word types
2019-06-04 20:45:14,969 : INFO : PROGRESS: at sentence #20000, processed 1323528 words, keeping 45 word types
2019-06-04 20:45:15,014 : INFO : PROGRESS: at sentence #30000, processed 1925423 words, keeping 45 word types
2019-06-04 20:45:15,058 : INFO : PROGRESS: at sentence #40000, processed 2525330 words, keeping 45 word types
2019-06-04 20:45:15,104 : INFO : PROGRESS: at sentence #50000, processed 3142997 words, keeping 45 word types
2019-06-04 20:45:15,147 : INFO : PROGRESS: at sentence #60000, processed 3720428 words, keeping 45 word types
2019-06-04 20:45:15,190 : INFO : PROGRESS: at sentence #70000, processed 4316144 words, keeping 45 word types
2019-06-04 20:45:15,236 : INFO : PROGRESS: at sentence #80000

2019-06-04 20:45:18,195 : INFO : PROGRESS: at sentence #720000, processed 43723028 words, keeping 45 word types
2019-06-04 20:45:18,248 : INFO : PROGRESS: at sentence #730000, processed 44339392 words, keeping 45 word types
2019-06-04 20:45:18,297 : INFO : PROGRESS: at sentence #740000, processed 44941003 words, keeping 45 word types
2019-06-04 20:45:18,342 : INFO : PROGRESS: at sentence #750000, processed 45545893 words, keeping 45 word types
2019-06-04 20:45:18,386 : INFO : PROGRESS: at sentence #760000, processed 46144376 words, keeping 45 word types
2019-06-04 20:45:18,430 : INFO : PROGRESS: at sentence #770000, processed 46742884 words, keeping 45 word types
2019-06-04 20:45:18,475 : INFO : PROGRESS: at sentence #780000, processed 47356066 words, keeping 45 word types
2019-06-04 20:45:18,522 : INFO : PROGRESS: at sentence #790000, processed 47984886 words, keeping 45 word types
2019-06-04 20:45:18,565 : INFO : PROGRESS: at sentence #800000, processed 48576365 words, keeping 45 wor

2019-06-04 20:45:21,519 : INFO : PROGRESS: at sentence #1450000, processed 86863397 words, keeping 45 word types
2019-06-04 20:45:21,563 : INFO : PROGRESS: at sentence #1460000, processed 87467318 words, keeping 45 word types
2019-06-04 20:45:21,607 : INFO : PROGRESS: at sentence #1470000, processed 88049415 words, keeping 45 word types
2019-06-04 20:45:21,650 : INFO : PROGRESS: at sentence #1480000, processed 88632588 words, keeping 45 word types
2019-06-04 20:45:21,693 : INFO : PROGRESS: at sentence #1490000, processed 89213283 words, keeping 45 word types
2019-06-04 20:45:21,736 : INFO : PROGRESS: at sentence #1500000, processed 89788220 words, keeping 45 word types
2019-06-04 20:45:21,780 : INFO : PROGRESS: at sentence #1510000, processed 90386448 words, keeping 45 word types
2019-06-04 20:45:21,824 : INFO : PROGRESS: at sentence #1520000, processed 90976198 words, keeping 45 word types
2019-06-04 20:45:21,867 : INFO : PROGRESS: at sentence #1530000, processed 91560407 words, keepi

2019-06-04 20:45:39,814 : INFO : EPOCH 2 - PROGRESS: at 21.13% examples, 1578055 words/s, in_qsize 23, out_qsize 0
2019-06-04 20:45:40,817 : INFO : EPOCH 2 - PROGRESS: at 28.52% examples, 1576731 words/s, in_qsize 22, out_qsize 1
2019-06-04 20:45:41,818 : INFO : EPOCH 2 - PROGRESS: at 35.82% examples, 1576666 words/s, in_qsize 23, out_qsize 0
2019-06-04 20:45:42,819 : INFO : EPOCH 2 - PROGRESS: at 43.07% examples, 1578933 words/s, in_qsize 23, out_qsize 0
2019-06-04 20:45:43,820 : INFO : EPOCH 2 - PROGRESS: at 50.11% examples, 1577699 words/s, in_qsize 24, out_qsize 0
2019-06-04 20:45:44,821 : INFO : EPOCH 2 - PROGRESS: at 57.45% examples, 1577732 words/s, in_qsize 20, out_qsize 4
2019-06-04 20:45:45,822 : INFO : EPOCH 2 - PROGRESS: at 64.86% examples, 1578665 words/s, in_qsize 23, out_qsize 0
2019-06-04 20:45:46,826 : INFO : EPOCH 2 - PROGRESS: at 72.38% examples, 1578197 words/s, in_qsize 21, out_qsize 2
2019-06-04 20:45:47,827 : INFO : EPOCH 2 - PROGRESS: at 79.88% examples, 1578235

2019-06-04 20:46:21,871 : INFO : EPOCH 5 - PROGRESS: at 28.70% examples, 1586579 words/s, in_qsize 23, out_qsize 0
2019-06-04 20:46:22,873 : INFO : EPOCH 5 - PROGRESS: at 36.06% examples, 1586814 words/s, in_qsize 23, out_qsize 0
2019-06-04 20:46:23,877 : INFO : EPOCH 5 - PROGRESS: at 43.31% examples, 1587428 words/s, in_qsize 23, out_qsize 0
2019-06-04 20:46:24,877 : INFO : EPOCH 5 - PROGRESS: at 50.39% examples, 1586541 words/s, in_qsize 24, out_qsize 0
2019-06-04 20:46:25,881 : INFO : EPOCH 5 - PROGRESS: at 57.67% examples, 1582568 words/s, in_qsize 20, out_qsize 3
2019-06-04 20:46:26,883 : INFO : EPOCH 5 - PROGRESS: at 65.16% examples, 1584271 words/s, in_qsize 22, out_qsize 1
2019-06-04 20:46:27,888 : INFO : EPOCH 5 - PROGRESS: at 72.78% examples, 1585298 words/s, in_qsize 23, out_qsize 2
2019-06-04 20:46:28,890 : INFO : EPOCH 5 - PROGRESS: at 80.30% examples, 1585092 words/s, in_qsize 21, out_qsize 2
2019-06-04 20:46:29,892 : INFO : EPOCH 5 - PROGRESS: at 87.78% examples, 1585125

(108050465, 537021225)

In [10]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(train['pos']) + list(test['pos']))

In [11]:
MAX_LEN = 50
x_train = tokenizer.texts_to_sequences(train['pos'])
x_test = tokenizer.texts_to_sequences(test['pos'])
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [12]:
y_train = np.where(train['target'] >= 0.5, 1, 0)
# note: not using sexual_explicit
y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]

In [13]:
y_aux_train.head()

Unnamed: 0,target,severe_toxicity,obscene,identity_attack,insult,threat
0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.893617,0.021277,0.0,0.021277,0.87234,0.0


In [14]:
w2v_matrix, w2v_unknown_words = build_matrix(tokenizer.word_index, w2v_model)
print('n unknown words (w2v): ', len(w2v_unknown_words))

n unknown words (w2v):  34


  


In [17]:
def custom_loss(data, targets):
    ''' Define custom loss function for weighted BCE on 'target' column '''
    bce_loss_1 = torch.nn.BCEWithLogitsLoss(weight=targets[:,1:2])(data[:,:1],targets[:,:1])
    bce_loss_2 = torch.nn.BCEWithLogitsLoss()(data[:,1:],targets[:,2:])
    return (bce_loss_1 * loss_weight) + bce_loss_2

In [21]:
all_val_preds = []
all_test_preds = []

#Add in K fold 
random_state = 2019
    
#K fold splits
skf = KFold(n_splits=NUM_SPLITS, shuffle=True, random_state=random_state)
splits = list(skf.split(x_train, y_train))

#final validation predictions
final_val_preds = np.zeros((x_train.shape[0]))

#final test predictions to be stored in this var
final_test_preds = np.zeros((x_test.shape[0]))

start_time = time.time()
for fold in range(NUM_SPLITS):
    tr_ind, val_ind = splits[fold]
    all_val_preds = []
    all_test_preds = []
    
    x_training = x_train[tr_ind]
    y_training = y_train[tr_ind]
    y_aux_training = y_aux_train.values[tr_ind]
    
    x_val = x_train[val_ind]
    y_val = y_train[val_ind]
    y_aux_val = y_aux_train.values[val_ind]
    
    x_train_torch = torch.tensor(x_training, dtype=torch.long).cuda()
    x_val_torch = torch.tensor(x_val, dtype=torch.long).cuda()
    y_train_torch = torch.tensor(np.hstack([y_training[:, np.newaxis], y_aux_training]), dtype=torch.float32).cuda()
    x_test_torch = torch.tensor(x_test, dtype=torch.long).cuda()
    
    train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
    
    val_dataset = data.TensorDataset(x_val_torch)
    test_dataset = data.TensorDataset(x_test_torch)

    for model_idx in range(NUM_MODELS):
        print('Model ', model_idx)
        seed_everything(1234 + model_idx)

        model = NeuralNet(w2v_matrix, y_aux_train.shape[-1])
        
        try:
            model.cuda()
            #training using training and validation set
            model = train_model(model,
                                train_dataset,
                                val_dataset,
                                output_dim=y_train_torch.shape[-1], 
                                loss_fn=custom_loss)
        except:
            model.cuda()
            #training using training and validation set
            model = train_model(model, train_dataset, val_dataset, output_dim=y_train_torch.shape[-1], 
                                 loss_fn=custom_loss)
        
        #prediction on validation set (used for score measurement)
        val_pred = predict(model, val_dataset, output_dim=y_train_torch.shape[-1]) #val preds on the val split
        all_val_preds.append(val_pred)
        
        #prediction on entire test set (actual predictions to be submitted)
        test_pred = predict(model, test_dataset, output_dim=y_train_torch.shape[-1])
        all_test_preds.append(test_pred)
        
        print()
        
    #average validation prediction amongst all models
    avg_val = np.mean(all_val_preds, axis=0)[:, 0] #will be printed out per split
    final_val_preds[val_ind] += avg_val
    
    avg_test = np.mean(all_test_preds, axis=0)[:, 0]
    
    final_test_preds += avg_test

    y_true = y_train[val_ind] #true scores for this validation set
    y_identity = y_train_identity[val_ind] #true scores for the identity groups for this validation set
    evaluator = JigsawEvaluator(y_true, y_identity)
    auc_score = evaluator.get_final_metric(avg_val)

    roc_score = roc_auc_score(y_train[val_ind], avg_val)
    print('Kaggle Score: ', auc_score)
    print('ROC score: ', roc_score)
    
    del x_train_torch
    del x_val_torch
    del y_train_torch
    del x_test_torch
    del train_dataset
    del val_dataset
    del test_dataset
    gc.collect()
    torch.cuda.empty_cache()
    
    print('=============End-of-Fold================')
    
end_time = time.time()
print('Time: ', end_time - start_time)

#Final combined score
y_true = y_train
y_identity = y_train_identity
evaluator = JigsawEvaluator(y_true, y_identity)
auc_
score = evaluator.get_final_metric(final_val_preds)
print('Final Kaggle Score: ', auc_score)
print('Final ROC score: ', roc_auc_score(y_train, final_val_preds))

#average test predictions AGAIN this time by number of splits
final_test_preds /= NUM_SPLITS
#print(final_test_preds)

Model  0


TypeError: 'module' object is not callable