This notebook fits a very deep CNN for text classification (inspired by Very Deep Convolutional Networks for Text Classification by Alexis Conneau, Holger Schwenk, Loïc Barrault, Yann Lecun https://arxiv.org/abs/1606.01781) to the Modern Slavery hackathon dataset. Note: this approach proved unsuccessful, so I didn't spend much time documenting it.

Why a CNN approach? The task is to classify documents as to whether or not they say anything about the company conducting anti-modern-slavery training for their employees. If the document does contain such a statement, it is likely to be localized to a few sentences in maybe one or two places within the document and the rest of the document is fairly irrelevant. Also, the length of the documents varies greatly, so it makes sense to apply a sliding CNN with a global max pooling layer (which makes the model length independent; it can handle short or very very long documents with ease) hoping that a filter can be trained which will trigger when sliding across such a statement. 

It turned out that we do not appear to have a sufficiently large training set (only 650 labeled documents) to train such a CNN. With an order of magnitude (or two) more labeled documents, the CNN may be a possible solution. This approach has been shelved while a Question-Answering approach is attempted. See 'QA-sliding window.ipynb'

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
import pandas as pd
import pickle

In [2]:
embeddings=pickle.load(open('embeddings_scaled_256.pkl','rb'))#[:,:64]

In [3]:
class LitShallow1dCNN(pl.LightningModule):
    def __init__(self, architecture, channels, k_max, lr, wd=0.01, loss_penalty=0.5, num_classes=2):
        """
        architecture = list of tuples:
            each tuple defines a conv1d filter: the first element is the kernel size, the 2nd is the number of filters
            each tuple runs in parallel and the outputs are appended into a 1d embedding for each location
            Example: [(3,16),(5,16),(7,16)]
                this would construct an architecture as follows:
                1 convolutional layer with 16 3-kernel filters, 16 5-kernel filters, and 16 7-kernel filters, 
                    resulting in an output embedding of 48 dimensions
                This architecture is then always followed by global k-max pooling, reducing the embedding to 
                    1 x (k*num_filters)
                followed by
                fully connected layer (k*num_filters x num_classes)
        """
        super().__init__()
        
        self.channels=channels
        self.k_max=k_max
        self.lr=lr
        self.wd=wd
        self.penalty_scale=loss_penalty
        self.convolutions=[]
        self.num_filters=0
        for i, convolution in enumerate(architecture):
            self.num_filters+=convolution[1]
            self.convolutions.append(nn.Conv1d(in_channels=channels,
                                               out_channels=convolution[1],
                                               kernel_size=convolution[0],
                                               stride=1,
                                               padding=(convolution[0]//2),
                                               bias=False))
        self.ff = nn.Linear(self.num_filters*k_max, num_classes)

    def forward(self, x):
        conv_out = torch.cat([conv(x) for conv in self.convolutions],1)
        global_max, _  = torch.topk(conv_out, self.k_max, 2)
        global_max=global_max.reshape((global_max.size()[0],self.num_filters*self.k_max))
        logits = self.ff(global_max)
        return logits
    
    def custom_loss(self, logits, y):
        preds=F.softmax(logits)
        #quadratic penalty with penalty 0 @ 0 & 1, penalty 1 @ 0.5:
        penalty=torch.mean(4*(0.25-((preds-0.5)**2))) 
        ce_loss=F.cross_entropy(logits, y)
        return ce_loss + penalty * self.penalty_scale
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.custom_loss(logits, y)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.custom_loss(logits, y)
        self.log('val_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.wd)
        return optimizer
    
class ConvBlock(nn.Module):
    
    def __init__(self, channels, convolutions_per_block):
        super(ConvBlock, self).__init__()
        layers=[]
        for i in range(convolutions_per_block):
            layers.append(nn.Conv1d(channels, channels, kernel_size=3, stride=1, padding=1, bias=False))
            layers.append(nn.BatchNorm1d(channels))
            if i < convolutions_per_block-1:
                layers.append(nn.ReLU())
        self.model = nn.Sequential(*layers)
        
    def forward(self, x):
        out = F.relu(self.model(x) + x)
        return out

class LitDeep1dCNN(pl.LightningModule):
    def __init__(self, architecture, channels, k_max, lr, wd=0.01, loss_penalty=0.5, num_classes=2):
        """
        architecture = nested list. list levels correspond to:
            top level = each list item is a list of blocks. Subsequent list items reduce the temporal resolution by 2
            2nd level = each list item is a block. list items are integers, setting the convolutions per block
            Example: [[1,2,3],[4,5],[6]]
                this would construct an architecture as follows (starting from the bottom):
                1 convolutional block which contains a shortcut around a single conv1d layer
                followed by
                1 convolutional block which contains a shortcut around two conv1d layers
                followed by
                1 convolutional block which contains a shortcut around three conv1d layers
                followed by
                local temporal max pooling layer: takes the max of each subsequent pair of values
                followed by
                1 convolutional block which contains a shortcut around four conv1d layers
                followed by
                1 convolutional block which contains a shortcut around five conv1d layers
                followed by
                local temporal max pooling layer: takes the max of each subsequent pair of values
                followed by
                1 convolutional block which contains a shortcut around six conv1d layers
                
                This architecture is then always followed by global k-max pooling, reducing the embedding to 
                    1 x k*num_channels
                followed by
                fully connected layer (k*num_channels x num_classes)
        """
        super().__init__()
        
        self.channels=channels
        self.k_max=k_max
        self.lr=lr
        self.wd=wd
        self.penalty_scale=loss_penalty
        layers=[]
        for i, layer in enumerate(architecture):
            for block in layer:
                layers.append(ConvBlock(channels, block))
            if i < len(architecture)-1:
                layers.append(nn.MaxPool1d(2))
        self.conv_blocks = nn.Sequential(*layers)
        self.ff = nn.Linear(channels*k_max, num_classes)

    def forward(self, x):
        conv_out = self.conv_blocks(x)
        global_max, _  = torch.topk(conv_out, self.k_max, 2)
        global_max=global_max.reshape((global_max.size()[0],self.channels*self.k_max))
        logits = self.ff(global_max)
        return logits
    
    def custom_loss(self, logits, y):
        preds=F.softmax(logits)
        #quadratic penalty with penalty 0 @ 0 & 1, penalty 1 @ 0.5:
        penalty=torch.mean(4*(0.25-((preds-0.5)**2))) 
        ce_loss=F.cross_entropy(logits, y)
        return ce_loss + penalty * self.penalty_scale
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.custom_loss(logits, y)
        self.log('train_ce_loss', F.cross_entropy(logits, y))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.custom_loss(logits, y)
        self.log('val_ce_loss', F.cross_entropy(logits, y))
        self.log('val_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.wd)
        return optimizer

In [4]:
architecture=[(x,16) for x in range(17,23+1,2)]
print(architecture)
model=LitShallow1dCNN(architecture=architecture, 
                      channels=256, 
                      k_max=8, 
                      lr=1e-3, 
                      wd=0.01)
model

[(17, 16), (19, 16), (21, 16), (23, 16)]


LitShallow1dCNN(
  (ff): Linear(in_features=512, out_features=2, bias=True)
)

In [5]:
if torch.cuda.is_available():
    embeddings = embeddings.to('cuda')
    model = model.to('cuda')

In [36]:
df_labeled=pd.read_csv('df_labeled.csv',index_col=0)
#df_hidden=pd.read_csv('df_hidden.csv',index_col=0)

In [7]:
batch_size=32
test_size=0.25

df_train = df_labeled.sample(frac=1-test_size, replace=False, random_state=42)
df_eval = df_labeled.loc[[i for i in df_labeled.index if i not in df_train.index]]

In [8]:
class HackathonDataset(torch.utils.data.Dataset):
    def __init__(self, df, embeddings, repetitions_per_epoch):
        temp_df = df[['embeddings_start','num_tokens','LABEL_MSA_training_binary']]
        class_0_df = temp_df.loc[temp_df['LABEL_MSA_training_binary']==0]
        class_0_df_repeated = pd.concat([class_0_df for i in range(repetitions_per_epoch)])
        
        class_1_df = temp_df.loc[temp_df['LABEL_MSA_training_binary']==1]
        class_1_reps = (repetitions_per_epoch * len(class_0_df))//len(class_1_df)
        class_1_df_repeated = pd.concat([class_1_df for i in range(class_1_reps)])
        
        self.df=pd.concat([class_0_df_repeated,
                           class_1_df_repeated,
                           class_1_df.sample(n=len(class_0_df_repeated)-len(class_1_df_repeated), replace=False, random_state=42)
                          ]).sample(frac=1, random_state=42)
        
        self.channels = embeddings.size()[1]
        self.embeddings = torch.transpose(embeddings,0,1)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        if isinstance(idx, slice):
            start, stop, step = idx.indices(len(self))
            idx = [x for x in range(start, stop, step)]
        if isinstance(idx, int):
            idx = [idx]
        
        y = torch.tensor([y for y in self.df.iloc[idx,:]['LABEL_MSA_training_binary']])
        
        start_ids = self.df.iloc[idx,:]['embeddings_start']
        num_tokens = self.df.iloc[idx,:]['num_tokens']
        end_ids = start_ids + num_tokens
        
        x = torch.zeros((len(idx),self.channels,max(self.df.iloc[idx,:]['num_tokens'])))
        for i in range(len(idx)):
            x[i,:,:num_tokens.iloc[i]] = self.embeddings[:,start_ids.iloc[i]:end_ids.iloc[i]]
        
        if torch.cuda.is_available():
            x=x.to('cuda')
            y=y.to('cuda')
        return [x, y]

def my_collate(batch):
    r"""Puts each data field into a tensor with outer dimension batch size"""
    ### batch = [list[x-tensors],list[y-tensors]]
    
    num_embeddings=[x_y[0].shape[2] for x_y in batch]
    max_embeddings=max(num_embeddings)
    x_return = torch.zeros((len(batch),batch[0][0].shape[1],max_embeddings))
    y_return = torch.zeros(len(batch),dtype=int)
    for i, x_y in enumerate(batch):
        x_return[i,:,:num_embeddings[i]]=x_y[0]
        y_return[i]=x_y[1]
        
    if torch.cuda.is_available():
        x_return = x_return.to('cuda')
        y_return = y_return.to('cuda')
    return [x_return, y_return]

In [9]:
train_dataset = HackathonDataset(df_train, embeddings)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=my_collate)

eval_dataset = HackathonDataset(df_eval, embeddings)
eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size, shuffle=False, collate_fn=my_collate)

In [10]:
trainer = pl.Trainer(callbacks=[pl.callbacks.EarlyStopping('val_loss',patience=10)])

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [11]:
trainer.fit(model, train_dataloader=train_loader, val_dataloaders=eval_loader)


  | Name | Type   | Params
--------------------------------
0 | ff   | Linear | 1 K   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…






1

In [6]:
logits=torch.randn((10,2))
logits

tensor([[ 0.3865, -0.2843],
        [ 1.4720, -1.4689],
        [ 0.0431, -0.8402],
        [-0.5064,  0.0272],
        [-0.3745,  0.2278],
        [ 0.1301,  1.0098],
        [ 0.6101,  0.0175],
        [-0.4877, -0.7144],
        [ 0.3169,  0.3849],
        [ 0.2907,  0.3270]])

In [13]:
y=torch.randint(0,2,size=(10,1)).squeeze()
y

tensor([1, 0, 0, 1, 0, 0, 0, 0, 0, 1])

In [14]:
F.cross_entropy(logits, y)

tensor(0.6638)

In [10]:
F.softmax(logits)

  F.softmax(logits)


tensor([[0.6617, 0.3383],
        [0.9498, 0.0502],
        [0.7075, 0.2925],
        [0.3697, 0.6303],
        [0.3538, 0.6462],
        [0.2932, 0.7068],
        [0.6440, 0.3560],
        [0.5564, 0.4436],
        [0.4830, 0.5170],
        [0.4909, 0.5091]])

In [35]:
def my_loss(logits, y):
    preds=F.softmax(logits)
    #quadratic penalty with penalty 0 @ 0 & 1, penalty 1 @ 0.5:
    penalty=torch.mean(4*(0.25-((preds-0.5)**2))) 
    ce_loss=F.cross_entropy(logits, y)
    return ce_loss + penalty

model = nn.Linear(4, 3)
x = torch.randn(10, 4)
y = torch.randint(0,3,size=(10,1)).squeeze()
logits = model(x)
loss = my_loss(logits, y)
loss.backward()
print(model.weight.grad)

tensor([[ 0.0339,  0.1222, -0.2322,  0.1692],
        [-0.0404, -0.2055,  0.0631, -0.0918],
        [ 0.0065,  0.0833,  0.1691, -0.0774]])


  preds=F.softmax(logits)


In [26]:
preds=F.softmax(logits)
penalty=4*(0.25-((preds-0.5)**2))
torch.mean(penalty)

  preds=F.softmax(logits)


tensor(0.7886, grad_fn=<MeanBackward0>)

In [47]:
df_labeled['LABEL_MSA_training_binary'].value_counts().sorted

0    420
1    230
Name: LABEL_MSA_training_binary, dtype: int64