In [None]:
!pip install torch
!pip install transformers

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer
import string

# the character based cnn required in the paper
class CharCNN(nn.Module):
    def __init__(self, num_chars, char_embed_size, num_filters, kernel_sizes):
        super(CharCNN, self).__init__()
        self.char_embedding = nn.Embedding(num_chars, char_embed_size)
        self.convs = nn.ModuleList([
            nn.Conv1d(char_embed_size, num_filters, kernel_size=k) # 1 dimensional convolution, not 2d like in 691.
            for k in kernel_sizes
        ])

    def forward(self, x):
        '''
        x's shape is (batch, word_len, char_embed_size)
        '''
        x = self.char_embedding(x)
        x = x.transpose(1, 2)  # (batch, char_embed_size, word_len)
        x = [F.relu(conv(x)) for conv in self.convs]  # cnn
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)  # join measurements
        return x

In [6]:
# Prepare model hyperparameters
num_chars = len(char_dict)
char_embed_size = 50
num_filters = 100
kernel_sizes = [3, 4, 5]

char_dict = {char: idx + 1 for idx, char in enumerate(string.ascii_letters + string.digits + string.punctuation)}
char_dict['<pad>'] = 0
char_dict['<unk>'] = -1

num_chars = len(char_dict)
char_embed_size = 50
num_filters = 100
kernel_sizes = [3, 4, 5]

def char_indices(word, char_dict):
    return [char_dict.get(c, char_dict['<unk>']) for c in word] # make unknowns default return

def sentence_to_char_toks(sentence):
    word_tokens = tokenizer.tokenize(sentence)
    word_ids = tokenizer.convert_tokens_to_ids(word_tokens)
    word_input = tokenizer(sentence, return_tensors="pt")['input_ids']
    char_inputs = [char_indices(word, char_dict) for word in word_tokens]
    char_input = pad_sequences(char_inputs, maxlen=max(len(word) for word in word_tokens), padding='post')
    return word_input, char_input

def pad_sequences(sequences, maxlen, padding='post'):
    # Pads sequences to the same length
    num_instances = len(sequences)
    x = torch.zeros((num_instances, maxlen), dtype=torch.long)
    for i, seq in enumerate(sequences):
        if len(seq) != 0:
            if padding == 'pre':
                x[i, -len(seq):] = torch.tensor(seq[:maxlen], dtype=torch.long)
            else:
                x[i, :len(seq)] = torch.tensor(seq[:maxlen], dtype=torch.long)
    return x

In [7]:
# Create a CharCNN instance
char_cnn = CharCNN(num_chars=num_chars, char_embed_size=char_embed_size, num_filters=num_filters, kernel_sizes=kernel_sizes)

# Function to convert sentence to character indices
def sentence_to_char_input(sentence, char_dict):
    char_seqs = []
    for word in sentence.split():
        char_seq = char_indices(word, char_dict)
        char_seqs.append(char_seq)
    char_input = pad_sequences(char_seqs, maxlen=max(len(word) for word in sentence.split()), padding='post')
    return char_input

# Prepare a sample sentence
test_sentence = "test char cnn"
char_input = sentence_to_char_input(test_sentence, char_dict)

# Add an extra dimension for batch_size since CharCNN expects a batch of words
# char_input = char_input.unsqueeze(0) would also work here if it's a single sequence
char_input = char_input[None, :]

# Test the CharCNN
with torch.no_grad():
    char_cnn_output = char_cnn(char_input)

print("CharCNN output:", char_cnn_output)

RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 4, 3, 50]