### Imports, preparation

(Used the trick in https://github.com/googlecolab/colabtools/issues/253#issuecomment-648634717 to obtain more RAM in google colab)

In [2]:
!pip install -q torchtext==0.6.0

[K     |████████████████████████████████| 71kB 2.0MB/s 
[K     |████████████████████████████████| 1.1MB 7.5MB/s 
[?25h

In [3]:
from google.colab import drive
drive.mount('/content/drive')

ROOT_PATH = "/content/drive/My Drive/cil"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim
from torch import nn
from sklearn import metrics

import pandas as pd
import numpy as np
import joblib

import pickle
import json
import csv

import sys
import os
from tqdm.notebook import tqdm
tqdm.pandas()
import argparse
import datetime
import errno

### Set experiment parameters

In [32]:
class Obj:
    pass
args = Obj()
args.checkpoint_save_to_dir = os.path.join(ROOT_PATH, "CIL-results", "my_checkpoints")
args.checkpoint_continue_from = None # os.path.join(ROOT_PATH, "CIL-results", "my_checkpoints", "...") --> just set this inline wherever most convenient
args.cuda = True
args.epochs = 10
args.max_norm = 1e3
args.val_interval = 5000 # evaluate on the validation set every val_interval batch
args.max_samples = None # TODO: set to None to use all samples
args.batch_size = 128
args.num_workers = 4
args.val_frac = 0.01 # 0.1 # reserve 0.1 = 10% of the training samples for validation.
# args.val_frac = None # Set to None to train on full training set, without validation.

ALPHABET_SIZE = 70

### Load datasets

In [33]:
from torchtext.vocab import GloVe

class TweetsAsCharsAndWordsDataset(Dataset):
    def __init__(self, data_path, alphabet_path, is_labeled=True, l0=501, l1=131, max_samples=None, 
                 word_emb_name="twitter.27B", word_emb_dim=200, vector_cache_path=os.path.join(ROOT_PATH, "CIL-aux-data")):
        """A dataset object whose samples consist of *both*
            - the (padded) concatenation of the word vectors of a tweet, and
            - the per-character one-hot encoding of the same tweet.

        Arguments:
            data_path: path of (label and) data file in csv.
            alphabet_path: path of alphabet json file.
            is_labeled: whether the data_path file contains labels, or only the tweets.
            l1: max length of a sample, in nb of characters.
            l1: max length of a sample, in nb of words.
            max_samples: (for dev,) only keep the max_samples first samples of the data.
            
            word_emb_name: name of the word embedding to use, used by torchtext.GloVe.
            word_emb_dim: dimension of the word embedding to use, used by torchtext.GloVe.
            vector_cache_path: path to cache directory, used by torchtext.GloVe.
        """
        self.glove = GloVe(name=word_emb_name, dim=word_emb_dim, cache=vector_cache_path)
        print("loaded pretrained GloVe word-embeddings.")
        self.data_path = data_path
        self.alphabet_path = alphabet_path
        self.is_labeled = is_labeled
        self.l0 = l0
        self.l1 = l1
        with open(alphabet_path) as f:
            self.alphabet = ''.join(json.load(f))
        self.raw_nb_feats = len(self.alphabet)
        self.pro_nb_feats = word_emb_dim
        # TODO: setting max_samples only makes sense if the csv itself was shuffled
        # X_txt = pd.read_csv(data_path, nrows=max_samples) # only keep max_samples first samples, or keep all if None
        X_txt = pd.read_csv(data_path)
        if max_samples:
            assert is_labeled, "must not use `max_samples` for unlabeled (assumed test-) data, as shuffling would modify the samples' ordering"
            X_txt = X_txt.sample(frac=1).reset_index(drop=True).iloc[:max_samples] # shuffle then select max_samples first
        self.y = X_txt['label'].to_numpy().astype(np.integer, copy=False) if is_labeled else None
        self.X_pro = X_txt['preprocessed_segmented_tweet'].to_numpy()
        self.X_raw = X_txt['raw_tweet'].to_numpy()
            
    def __len__(self):
        return self.X_raw.shape[0]

    def __getitem__(self, idx):
        X_raw = self.get_item_raw(idx)
        X_pro = self.get_item_pro(idx)
        # even if X consists of two distinct parts, still output X,y so that auxiliary functions work without modification
        if self.is_labeled:
            return (X_raw, X_pro), self.y[idx]
        else:
            return (X_raw, X_pro)

    def get_item_pro(self, idx):
        words = self.X_pro[idx].lower().split()
        words += [""]*(self.l1 - len(words)) # pad with zeros until of correct size
        assert len(words) == self.l1
        X = self.glove.get_vecs_by_tokens(words, lower_case_backup=True)
        # for i in np.where(~X.bool().all(axis=1))[0]: # print OOV words
        #     if words[i] != "":
        #         print("out-of-vocabulary:", i, words[i])
        assert X.shape == (self.l1, self.glove.dim)
        return X

    def get_item_raw(self, idx):
        seq = self.X_raw[idx]
        X = self.oneHotEncode(seq)
        assert X.shape == (self.l0, self.raw_nb_feats) # NOTE: this is the transpose of what Xiaochen did
        return X

    def char2idx(self, character):
        return self.alphabet.find(character)

    def oneHotEncode(self, seq):
        X = torch.zeros(self.l0, self.raw_nb_feats)
        for i, char in enumerate(seq[::-1]):
            char_idx = self.char2idx(char)
            if char_idx != -1: # if char is in present in self.alphabet
                X[i, char_idx] = 1.0
        return X

In [34]:
ALPHABET_PATH = os.path.join(ROOT_PATH, "CIL-aux-data", "alphabet.json")
PREPROCESSED_TWITTER_DATASETS_DIR = os.path.join(ROOT_PATH, "stanford_glove_preprocessed")
TWEETS_TRAIN_FILENAME = os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, "dataset_stanfordglove_segmented_full.csv")

# train_dataset = TweetsAsCharsAndWordsDataset(TWEETS_TRAIN_FILENAME, ALPHABET_PATH, is_labeled=True)
train_dataset = TweetsAsCharsAndWordsDataset(TWEETS_TRAIN_FILENAME, ALPHABET_PATH, is_labeled=True, max_samples=args.max_samples)
assert train_dataset.raw_nb_feats == ALPHABET_SIZE

if args.val_frac:
    val_size = int(args.val_frac * len(train_dataset))
    train_size = len(train_dataset) - val_size

    torch.manual_seed(0) # need random_split to be deterministic if we want to avoid information leak when we reload notebook in-between training epochs
    train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])
    torch.manual_seed(torch.initial_seed())

    val_dataloader = DataLoader(val_dataset, 
                                batch_size=args.batch_size,
                                shuffle=True, 
                                num_workers=args.num_workers)
    
train_dataloader = DataLoader(train_dataset, 
                              batch_size=args.batch_size,
                              shuffle=True, 
                              num_workers=args.num_workers)

len(train_dataloader), len(val_dataloader) if args.val_frac else None # number of batches. multiply by args.batch_size to get (approximate) number of samples.

loaded pretrained GloVe word-embeddings.


(19336, 196)

In [8]:
TWEETS_TEST_FILENAME = os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, "test_stanfordglove_segmented.csv")

test_dataset = TweetsAsCharsAndWordsDataset(TWEETS_TEST_FILENAME, ALPHABET_PATH, is_labeled=False)
assert test_dataset.raw_nb_feats == ALPHABET_SIZE
test_dataloader = DataLoader(test_dataset, 
                             batch_size=args.batch_size,
                             shuffle=False, # need to keep the ordering of the tweets
                             num_workers=0)
len(test_dataloader)

loaded pretrained GloVe word-embeddings.


79

In [9]:
# for i_batch, data in enumerate(train_dataloader):
#     (X_raw, X_pro), y = data
#     print(i_batch, X_raw.shape, X_pro.shape, y)
#     if i_batch == 2:
#         break
# for i_batch, data in enumerate(val_dataloader):
#     (X_raw, X_pro), y = data
#     print(i_batch, X_raw.shape, X_pro.shape, y)
#     if i_batch == 2:
#         break

### Auxiliary functions

In [10]:
def save_checkpoint(model, optimizer, checkpoint, filename):
    """
    Args:
        optimizer: can be set to None, then no optimizer will be saved
        checkpoint is a dict that can be prepopulated (e.g with keys 'epoch' and 'validation_accuracy')
    """
    # https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-across-devices
    if isinstance(model, torch.nn.DataParallel):
        assert not isinstance(model.module, torch.nn.DataParallel) # check we didn't wrap multiple times by mistake...
        checkpoint['model_state_dict'] = model.module.state_dict()
    else:
        checkpoint['model_state_dict'] = model.state_dict()
    if optimizer is not None:
        checkpoint['optimizer_state_dict'] = optimizer.state_dict()
    torch.save(checkpoint,filename)

def load_checkpoint(model, optimizer, filename):
    """
    Args:
        optimizer: can be set to None, then the optimizer state will be ignored (if there is one stored in the checkpoint)
            MUST be set to None if no optimizer state is stored in the checkpoint (so as to minimize risks of confusion)
    """
    # https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-across-devices
    assert os.path.isfile(filename), f"no checkpoint found at {filename} (no such file)"
    # try to make it so that cpu->cpu, gpu->gpu, cpu->gpu, gpu->cpu all work (not 100% sure but I think this should do it)
    if args.cuda:
        device = torch.device("cuda")
        checkpoint = torch.load(filename, map_location=device)
    else:
        device = torch.device("cpu")
        checkpoint = torch.load(filename, map_location=device)
    # checkpoint = torch.load(filename) # or just don't worry abt it and pray that it works
    
    if isinstance(model, torch.nn.DataParallel):
        assert not isinstance(model.module, torch.nn.DataParallel) # check we didn't wrap multiple times by mistake...
        model.module.load_state_dict(checkpoint['model_state_dict'])
    else:
        model.load_state_dict(checkpoint['model_state_dict'])
    if args.cuda:
        model = model.cuda() # possibly always a noop but just in case

    loaded_optimizer = False
    if 'optimizer_state_dict' in checkpoint.keys():
        if optimizer is not None:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            loaded_optimizer = True
    else:
        assert optimizer is None, "Argument `optimizer` MUST be set to None if no optimizer state is stored in the checkpoint"

    if loaded_optimizer:
        print("successfully loaded model and optimizer states from checkpoint (in place)")
    else:
        print("successfully loaded model state from checkpoint (in place). (Did NOT load optimizer state.)")
    
    if 'epoch' in checkpoint:
        print(f"the model was trained for {checkpoint['epoch']} epochs")
    if 'validation_accuracy' in checkpoint:
        print(f"the model had achieved validation accuracy {checkpoint['validation_accuracy']}")
    return checkpoint

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [11]:
def eval(val_dataloader, model):
    criterion_reduc_sum = nn.BCEWithLogitsLoss(reduction='sum') # sum of losses (instead of mean) over the batch
    if args.cuda:
        criterion_reduc_sum = criterion_reduc_sum.cuda()
    was_training = model.training # don't forget to put it back in training mode at the end!
    model.eval()
    with torch.no_grad():
        predicates_all = []
        target_all = []
        accumulated_loss = 0
        tot_samples = 0
        for i_batch, data in enumerate(tqdm(val_dataloader)):
            inputs, target = data
            target = target.float() # for some reason BCEWithLogitsLoss requires target to be float
            if args.cuda:
            #     inputs, target = inputs.cuda(), target.cuda()
                inputs[0] = inputs[0].cuda()
                inputs[1] = inputs[1].cuda()
                target = target.cuda()
            tot_samples += len(target)
            logit = model(inputs)
            logit = logit.squeeze(1)
            assert logit.shape == (target.shape[0],) # val_dataloader.batch_size, except for the last batch

            accumulated_loss += criterion_reduc_sum(logit, target).item() # sum of losses (instead of mean) over the batch
            predicates = torch.round(torch.sigmoid(logit))

            predicates_all.append(predicates)
            target_all.append(target)

            if args.cuda:
                torch.cuda.synchronize()
    if was_training:
        model.train()

    avg_loss = accumulated_loss / tot_samples
    predicates_all = torch.cat(predicates_all).cpu()
    target_all = torch.cat(target_all).cpu()
    accuracy = metrics.accuracy_score(target_all, predicates_all)
    f1_score = metrics.f1_score(target_all, predicates_all)
    print(f"Validation - \
        \n\t loss: {accumulated_loss / tot_samples}  \
        \n\t acc: {accuracy} \
        \n\t f1-score: {f1_score} \
    ")
    # if args.log_result:
    #     with open(os.path.join(path, args.save_folder,'result_res.csv'), 'a') as r:
    #         r.write('\n{:d},{:d},{:.5f},{:.2f},{:f}'.format(epoch_train, 
    #                                                         batch_train, 
    #                                                         avg_loss, 
    #                                                         accuracy, 
    #                                                         optimizer.state_dict()['param_groups'][0]['lr']))
    return avg_loss, accuracy

def predict(test_dataloader, model):
    assert not test_dataloader.dataset.is_labeled # the samples we get from test_dataloader are inputs only, no labels!
    was_training = model.training # don't forget to put it back in training mode at the end!
    model.eval()
    with torch.no_grad():
        y_pred = []
        for i_batch, data in enumerate(tqdm(test_dataloader)):
            inputs = data
            # inputs = inputs[::-1] # TODO: check that it's in the right order
            if args.cuda:
            #     inputs = inputs.cuda()
                inputs[0] = inputs[0].cuda()
                inputs[1] = inputs[1].cuda()
            logit = model(inputs)
            logit = logit.squeeze(1)
            assert logit.shape == (inputs.shape[0],) # test_dataloader.batch_size, except for the last batch

            predicates = torch.round(torch.sigmoid(logit))
            y_pred.append(predicates)

            if args.cuda:
                torch.cuda.synchronize()
    if was_training:
        model.train()

    y_pred = torch.cat(y_pred)
    return y_pred

### Define (and instantiate) the model

In [12]:
"""input size (N x C x L), for example, the initial text is (batch_size, 70, 501)"""

def conv3(in_features, out_features, stride=1, padding=1, dilation=1, groups=1):
    """(1D-)Convolution with kernel size 3, with padding
    Args:
        in_features: nb input channels,
        out_features: nb output channels.
    """
    return nn.Conv1d(in_features, out_features, kernel_size=3, stride=stride,
                     padding=padding, dilation=dilation, groups=groups, bias=True)

def conv1(in_features, out_features, stride=1):
    """(1D-)Convolution with kernel size 1"""
    return nn.Conv1d(in_features, out_features, kernel_size=1, stride=stride, bias=True)

class  CharResCNN_pre(nn.Module):
    def __init__(self, out_dim=60, nb_feats=70):
        """Almost the same as CharResCNN, but the output dimension is >1 (not yet logits)"""
        super().__init__()
        self.out_dim = out_dim

        self.relu = nn.ReLU(inplace=True)
        self.maxpool1d = nn.MaxPool1d(kernel_size=3, stride=3)
        # self.extension1 = conv1(128, 256)
        # self.extension2 = conv1(256, 512)
        self.conv1 = nn.Sequential(
            nn.Conv1d(nb_feats, 256, kernel_size=7, stride=1),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )
        self.downsample = nn.Sequential(
            conv1(256, 512, stride=2),
            nn.BatchNorm1d(512)
        )         
        self.res3 = nn.Sequential(
            conv3(256, 256),
            nn.BatchNorm1d(256),
        )
        self.res4 = nn.Sequential(
            conv3(256, 256),
            nn.BatchNorm1d(256),
        )
        self.res5 = nn.Sequential(
            conv3(256, 512, stride=2),
            nn.BatchNorm1d(512),
        )
        self.res6 = nn.Sequential(
            conv3(512, 512),
            nn.BatchNorm1d(512),
        )
        # After ResConv, shape = [B, C, L] = [B, 256, 18x3] B:Batch size
        """
        Input can be of size T x B x * where T is the length of the longest sequence (equal to lengths[0]), 
        B is the batch size, and * is any number of dimensions (including 0). 
        If batch_first is True, B x T x * input is expected.
        """
        self.rnn = nn.GRU(input_size=512,
						  hidden_size=50,
						  num_layers=2,
						  bidirectional=True,
						  batch_first=True,
						  dropout=0.5)
        """
        GRU input of shape (seq_len, batch, input_size): tensor containing the features of the input sequence.
        """
		    # self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        # self.out1 = nn.Linear(1024, 512)
        # self.out2 = nn.Linear(512, 1)
        # self.dropout = nn.Dropout(0.5)
        self.lin_each_state = nn.Linear(100, 30)
        self.flatten = nn.Flatten()
        # self.out2 = nn.Linear(810, 1)
        self.out2 = nn.Linear(810, out_dim)

        # 9216 = 256*36
        # 4352 = 256*17
        # m = nn.MaxPool1d(3, stride=3)
        # input = torch.randn(20, 16, 53)
        # output = m(input)     
        # output.size()=17!!!!!

    def forward(self, x):
        """
        tweet = length of 501
        one-hot encoding = [70, 501]
        into CNN [B, C, H, W] ---> 1D conv: [B, C, L] ---> [64, 70, 501] ---> conv 3x3, do it on the last dim (length) 
          ---> if kernel is 3x3, actually is [70, 3, 3], number of kernel is the output channel 
        """
        # (batch, seqlen, nbfeats) -> (batch, nbfeats, seqlen) to feed into Conv1d
        x = x.permute(0, 2, 1)

        x = self.conv1(x)
        # x = self.conv2(x)

        identity3 = x
        x = self.res3(x)
        x += identity3
        x = self.relu(x)

        identity4 = x
        x = self.res4(x)
        x += identity4
        x = self.relu(x)

        identity5 = self.downsample(x)
        x = self.res5(x)
        x += identity5
        x = self.relu(x)

        identity6 = x
        x = self.res6(x)
        x += identity6
        x = self.relu(x)
        
        x = self.maxpool1d(x)
        # x: (batch, channels, seqlen)
        x = x.permute(0, 2, 1)
        # x: (batch, seqlen, channels) = (batch, 27, 512)
        x, _ = self.rnn(x)
            # # hidden = [n layers * n directions, batch size, emb dim]
            # hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        # x: (batch, seqlen, hiddenstates) = (batch, 27, 100)
        x = self.lin_each_state(x)
        # x: (batch, seqlen, hiddenstates) = (batch, 27, 30)
        x = self.flatten(x)
        # output = self.out1(x)
        output = self.out2(x)
        # output: (batch size, out_dim)

        return output

In [13]:
"""input size (N x C x L), for example, the initial text is (batch_size, 200, 131)"""

class  WordCNN_pre(nn.Module):
    def __init__(self, out_dim=50, input_feats=200, input_len=131):
        """Almost the same as WordCNN, but the output dimension is >1 (not yet logits)

        Args:
            input_feats (int): the expected number of features of the input sequences
            input_len (int): the expected length of the input sequences (already padded)
        """
        super().__init__()
        self.out_dim = out_dim

        self.rnn = nn.GRU(input_size=input_feats,
						  hidden_size=25,
						  num_layers=2,
						  bidirectional=True,
						  batch_first=True,
						  dropout=0.5)
        self.conv1 = nn.Sequential(
            nn.Conv1d(50, 50, 3), # nb of input channels is 2*hidden_size of rnn (2* because bidirectional)
            nn.BatchNorm1d(50),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )
        self.downsample = nn.Sequential(
            nn.Conv1d(50, 50, 4, stride=4),
            nn.BatchNorm1d(50)
        )
        self.fc_final = nn.Sequential(
            nn.Flatten(),
            nn.Linear(500, 50),
            nn.ReLU(),
            # nn.Linear(50, 1)
            nn.Linear(50, out_dim)
        )

    def forward(self, x):
        # x: (batch, input_len, input_feats) = (batch, 131, 200)
        out, _ = self.rnn(x) # don't need the final hidden state
        # out: (batch, seq_len, num_directions * hidden_size)
        out = out.permute(0, 2, 1)
        # out: (batch, num_directions * hidden_size, seq_len) = (batch, 60, 131)
        out = self.conv1(out)
        # out: (batch, out_channels, out_len) = (batch, 70, 43)
        out = self.downsample(out)
        # out: (batch, out_channels, out_len) = (batch, 70, 10)
        out = self.fc_final(out)
        # out: (batch, out_dim)
        return out

In [14]:
"""input size (N x C x L), for example, the initial text is (batch_size, 200, 131)"""

class  CharAndWordCNN(nn.Module):
    def __init__(self, charCNN_out_dim=60, wordCNN_out_dim=50, raw_nb_feats=70, pro_input_feats=200, pro_input_len=131):
        """Almost the same as WordCNN, but the output dimension is >1 (not yet logits)

        Args:
            input_feats (int): the expected number of features of the input sequences
            input_len (int): the expected length of the input sequences (already padded)
        """
        super().__init__()
        self.charCNN = CharResCNN_pre(out_dim=charCNN_out_dim, nb_feats=raw_nb_feats)
        self.wordCNN = WordCNN_pre(out_dim=wordCNN_out_dim, input_feats=pro_input_feats, input_len=pro_input_len)
        self.fc_combine = nn.Sequential(
            nn.Linear(charCNN_out_dim + wordCNN_out_dim, 100),
            nn.ReLU(),
            nn.Linear(100, 100),
            nn.ReLU(),
            nn.Linear(100, 1)
        )

    def forward(self, x):
        x_raw, x_pro = x
        out_charCNN = self.charCNN(x_raw) # (batch, charCNN_out_dim)
        out_wordCNN = self.wordCNN(x_pro) # (batch, wordCNN_out_dim)
        out = torch.cat((out_charCNN, out_wordCNN), dim=1) 
        out = self.fc_combine(out)
        # out: (batch, 1)
        return out

In [15]:
model = CharAndWordCNN()
model_nickname = "CharAndWordCNN"
print(f"{count_parameters(model)} parameters")

criterion = nn.BCEWithLogitsLoss()
if args.cuda:
    model = torch.nn.DataParallel(model).cuda()
    # model = model.cuda()
    criterion = criterion.cuda()

optimizer = optim.Adam(model.parameters()) # TODO: try tweaking parameters (e.g learning rate)

2215079 parameters


### Training

In [21]:
args.checkpoint_continue_from = os.path.join(ROOT_PATH, "CIL-results", "my_checkpoints", "CharAndWordCNN_epoch_10_2020-07-27T19:51:08.351260.pth.tar")
# args.checkpoint_continue_from = None

In [22]:
if args.checkpoint_continue_from:
    print(f"=> loading checkpoint from {args.checkpoint_continue_from}")
    checkpoint = load_checkpoint(model, optimizer, args.checkpoint_continue_from) # load the state to `model` and `optimizer` and fetch the remaining info into `checkpoint`
    
    # always assume that we saved a model after an epoch finished, so start at the next epoch.
    start_epoch = checkpoint['epoch'] + 1
    # load optimizer, default all parameters are in cpu     --> pretty sure it's always a noop, but just in case
    if args.cuda:
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()
else:
    start_epoch = 1

start_epoch

=> loading checkpoint from /content/drive/My Drive/cil/CIL-results/my_checkpoints/CharAndWordCNN_epoch_10_2020-07-27T19:51:08.351260.pth.tar
successfully loaded model and optimizer states from checkpoint (in place)
the model was trained for 10 epochs
the model had achieved validation accuracy 0.8442


11

In [18]:
# ## for a manual save
# epoch="SPECIAL"

# if args.checkpoint_save_to_dir:
#     ts = datetime.datetime.now().isoformat()
#     file_path = os.path.join(args.checkpoint_save_to_dir, f"{model_nickname}_epoch_{epoch}_{ts}.pth.tar")
#     print(f"=> saving checkpoint model to {file_path}")
#     save_checkpoint(model, 
#                     optimizer,
#                     {'epoch': epoch,
#                       'validation_accuracy': val_acc},
#                     file_path)

In [19]:
# ## for a manual validation evaluation
# val_loss, val_acc = eval(val_dataloader, model)

#### Train the whole model at once

In [20]:
model.train()

for epoch in range(start_epoch, args.epochs+1):
    print(f"\n\n===== Starting epoch #{epoch} =====")
    accumulated_train_loss = 0
    for i_batch, data in enumerate(tqdm(train_dataloader)):
        inputs, target = data
        target = target.float() # for some reason BCEWithLogitsLoss requires target to be float
        if args.cuda:
            inputs[0] = inputs[0].cuda()
            inputs[1] = inputs[1].cuda()
            target = target.cuda()

        optimizer.zero_grad()
        logit = model(inputs)
        logit = logit.squeeze(1) # (n, 1) -> (n,)
        assert logit.shape == (target.shape[0],) # train_dataloader.batch_size, except for the last batch
        loss = criterion(logit, target)
        accumulated_train_loss += criterion(logit, target).item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        optimizer.step()
        if args.cuda:
            torch.cuda.synchronize()

        # if args.verbose:
        #     print('\nTargets, Predicates')
        #     print(torch.cat((target.unsqueeze(1), torch.unsqueeze(torch.max(logit, 1)[1].view(target.size()).data, 1)), 1))
        #     print('\nLogit')
        #     print(logit)
        # if i_batch % args.log_interval == 0:
        #     corrects = (torch.round(torch.sigmoid(logit)) == target.data).float().sum()  # convert into float for division
        #     accuracy = 100.0 * corrects/args.batch_size
        #     print('Epoch[{}] Batch[{}] - loss: {:.6f}  lr: {:.5f}  acc: {:.3f}% ({}/{})'.format(epoch,
        #                                                                   i_batch,
        #                                                                   loss.data,
        #                                                                   optimizer.state_dict()['param_groups'][0]['lr'],
        #                                                                   accuracy,
        #                                                                   corrects,
        #                                                                   args.batch_size))
        if (i_batch+1) % args.val_interval == 0:
            print(f"Training - loss: {accumulated_train_loss / (i_batch+1)}")
            val_loss, val_acc = eval(val_dataloader, model)

    print(f"----- Finished epoch #{epoch} -----")
    # validation
    print('\nTraining - loss: {:.6f}'.format(accumulated_train_loss/i_batch))
    val_loss, val_acc = eval(val_dataloader, model)

    # save the model as this epoch
    if args.checkpoint_save_to_dir:
        ts = datetime.datetime.now().isoformat()
        file_path = os.path.join(args.checkpoint_save_to_dir, f"{model_nickname}_epoch_{epoch}_{ts}.pth.tar")
        print(f"=> saving checkpoint model to {file_path}")
        save_checkpoint(model, 
                        optimizer,
                        {'epoch': epoch,
                         'validation_accuracy': val_acc},
                        file_path)

    start_epoch = epoch+1

print(f"finished the required number of epochs args.epoch={args.epoch}")



===== Starting epoch #1 =====


HBox(children=(FloatProgress(value=0.0, max=17579.0), HTML(value='')))

KeyboardInterrupt: ignored

#### Train/Evaluate only the charCNN part

In [None]:
# freeze the parameters that are not in the charCNN part


#### Train/Evaluate only the wordCNN part

In [None]:
# symetric of the "train only charCNN" section

### Evaluate reliance on one method or the other

Idea: evaluate on validation samples, but put in garbage instead of the raw inputs, and see how well the model does. If it doesn't do worse than normal, then it relies much more on the `WordCNN_pre` part than on the `CharResCNN_pre` part. Symetrically when putting in garbage instead of the processed inputs.

In [23]:
def get_garbage_like(inp):
    """Given a true input vector, returns some garbage that can replace input. Namely, return a random Gaussian vector of the same shape.
    The magnitude is chosen s.t expected squared-Frobenius-norm of the garbage equals squared-Frobenius-norm of the true input.
    Args:
        inp (torch.Tensor): true input to be replaced by garbage
    """
    magnitude = torch.norm(inp) / np.sqrt(inp.numel()) 
    garb = torch.randn_like(inp) * magnitude
    # print(f"norm of true input: {torch.norm(inp).item()},  norm of garbage: {torch.norm(garb).item()}") # --> yep, norms are consistently close
    return garb

def eval_half_garbage(val_dataloader, model, garbagify):
    """Evaluate on validation but put in garbage instead of sample for one of the methods
    Args:
        val_dataloader
        model
        garbagify (str): decide which one of the methods to invalidate
            - "char[CNN]" or "raw":              put in garbage at the input of the charCNN part
            - "word[CNN]" or "[pre]pro[cessed]": put in garbage at the input of the wordCNN part
    """
    if garbagify in ["char", "charCNN", "raw"]:
        garbagify = "raw"
    elif garbagify in ["word", "wordCNN", "pro", "prepro", "preprocessed"]:
        garbagify = "pro"
    else:
        raise ValueError('Argument `garbagify` must be one of "char[CNN]"/"raw" or "word[CNN]"/"[pre]pro[cessed]"')
    criterion_reduc_sum = nn.BCEWithLogitsLoss(reduction='sum') # sum of losses (instead of mean) over the batch
    if args.cuda:
        criterion_reduc_sum = criterion_reduc_sum.cuda()
    was_training = model.training # don't forget to put it back in training mode at the end!
    model.eval()
    with torch.no_grad():
        predicates_all = []
        target_all = []
        accumulated_loss = 0
        tot_samples = 0
        for i_batch, data in enumerate(tqdm(val_dataloader)):
            inputs, target = data
            target = target.float() # for some reason BCEWithLogitsLoss requires target to be float
            if garbagify == "raw":
                inputs[0] = get_garbage_like(inputs[0])
            else:
                inputs[1] = torch.randn_like(inputs[1])
            if args.cuda:
            #     inputs, target = inputs.cuda(), target.cuda()
                inputs[0] = inputs[0].cuda()
                inputs[1] = inputs[1].cuda()
                target = target.cuda()
            tot_samples += len(target)
            logit = model(inputs)
            logit = logit.squeeze(1)
            assert logit.shape == (target.shape[0],) # val_dataloader.batch_size, except for the last batch

            accumulated_loss += criterion_reduc_sum(logit, target).item() # sum of losses (instead of mean) over the batch
            predicates = torch.round(torch.sigmoid(logit))

            predicates_all.append(predicates)
            target_all.append(target)

            if args.cuda:
                torch.cuda.synchronize()
    if was_training:
        model.train()

    avg_loss = accumulated_loss / tot_samples
    predicates_all = torch.cat(predicates_all).cpu()
    target_all = torch.cat(target_all).cpu()
    accuracy = metrics.accuracy_score(target_all, predicates_all)
    f1_score = metrics.f1_score(target_all, predicates_all)
    print(f"Validation - \
        \n\t loss: {accumulated_loss / tot_samples}  \
        \n\t acc: {accuracy} \
        \n\t f1-score: {f1_score} \
    ")
    # if args.log_result:
    #     with open(os.path.join(path, args.save_folder,'result_res.csv'), 'a') as r:
    #         r.write('\n{:d},{:d},{:.5f},{:.2f},{:f}'.format(epoch_train, 
    #                                                         batch_train, 
    #                                                         avg_loss, 
    #                                                         accuracy, 
    #                                                         optimizer.state_dict()['param_groups'][0]['lr']))
    return avg_loss, accuracy

In [35]:
## manual validation evaluation
val_loss, val_acc = eval_half_garbage(val_dataloader, model, garbagify='char')

HBox(children=(FloatProgress(value=0.0, max=196.0), HTML(value='')))


Validation -         
	 loss: 0.3503064520263672          
	 acc: 0.84856         
	 f1-score: 0.8517038777908342     


In [36]:
val_loss, val_acc = eval_half_garbage(val_dataloader, model, garbagify='word')

HBox(children=(FloatProgress(value=0.0, max=196.0), HTML(value='')))


Validation -         
	 loss: 11.320286474609375          
	 acc: 0.50024         
	 f1-score: 0.0     


### Prediction

In [None]:
y_pred = predict(test_dataloader, model)

HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




In [None]:
y_pred = y_pred.cpu().numpy()
y_pred = y_pred.astype(np.integer, copy=False)

# y_pred[y_pred==1] = -1
# y_pred[y_pred==0] = 1
y_pred[y_pred==0] = -1
print(f"predict {np.count_nonzero(y_pred==-1)} positive, {np.count_nonzero(y_pred==1)} negative")

predict 4808 positive, 5192 negative


In [None]:
ts = datetime.datetime.now().isoformat()
SUBMISSION_FILENAME = os.path.join(ROOT_PATH, f"{model_nickname}_submission_{ts}.csv")

with open(SUBMISSION_FILENAME, "w") as f:
    f.write("Id,Prediction\n")
    for i, label in enumerate(y_pred, start=1):  
        f.write(f"{i},{label}\n")

print(f"wrote to {SUBMISSION_FILENAME}")

wrote to /content/drive/My Drive/cil/WordCNN_submission_2020-07-27T13:45:33.598866.csv
