# Text CNN

Text CNN is a method for applying a convolutional architecture to text tasks such as text classification.

In [18]:
import numpy as np

from gensim import downloader
from gensim.models import KeyedVectors

import torch
from torch import nn

from tqdm import tqdm
from collections.abc import Collection
from sklearn.datasets import fetch_20newsgroups

In [19]:
# Binary task: only two of the twenty newsgroups are kept.
categories = ["talk.politics.guns", "rec.motorcycles"]

# Fetch the train and test splits with the same category filter.
train, test = (
    fetch_20newsgroups(subset=subset, categories=categories)
    for subset in ("train", "test")
)

In [20]:
# Pretrained word vectors used by `seq_to_emb` below to turn tokens into
# embeddings. Presumably the 300-dimensional Google News word2vec model, as
# the dataset name suggests; gensim's downloader fetches it on first use, so
# this call can be slow.
wv = downloader.load("word2vec-google-news-300")

def seq_to_emb(
    sentences: Collection[str], 
    wv: KeyedVectors, 
    tokens_num: int, 
    pad_token: str = "</s>"
) -> np.ndarray:
    '''
    Convert a collection of sentences into fixed-size embedding arrays.

    Each sentence is split on whitespace, so every run of non-space symbols is
    treated as a separate token. Tokens unknown to `wv` are skipped. Only the
    first `tokens_num` known tokens are kept; shorter sentences (including
    sentences with no known tokens at all) are right-padded with the embedding
    of `pad_token`.

    Parameters
    ----------
    sentences: Collection[str]
        A collection of sentences that require transformation into embeddings.
    wv: KeyedVectors
        Word vectors used to transform tokens into embeddings. Only
        `has_index_for` and `get_vector` are called on it.
    tokens_num: int
        The number of tokens to take from each sample. Extra tokens will be 
        dropped, and if there are not enough tokens, padding will be added.
    pad_token: str = "</s>"
        The token whose embedding is used for padding if there are not enough
        tokens in a sample. It must be known to `wv`.

    Returns
    -------
    out: np.ndarray
        Of size (<samples>, <embedding size>, <tokens_num>), with the dtype of
        the pad embedding.

    Raises
    ------
    ValueError
        If `tokens_num` is negative.
    '''

    if tokens_num < 0:
        raise ValueError(f"tokens_num must be non-negative, got {tokens_num}")

    pad_vector = np.asarray(wv.get_vector(pad_token))

    # Start from an output that is entirely padding, then overwrite the
    # leading columns with real token embeddings. This avoids stacking an
    # empty list when a sentence has no known tokens (or `sentences` is
    # empty), which previously raised a ValueError.
    rv = np.tile(pad_vector[None, :, None], (len(sentences), 1, tokens_num))

    for sample, sentence in zip(rv, sentences):
        position = 0
        for one_token in sentence.split():
            # Taking embeddings only for the first `tokens_num` known words
            if position >= tokens_num:
                break
            if wv.has_index_for(one_token):
                # `sample` is a view into `rv`, so this writes in place.
                sample[:, position] = wv.get_vector(one_token)
                position += 1

    return rv

In [21]:
# Every document is truncated or padded to this many tokens.
tokens_num = 100

# Features: one embedding array per document, for the train and test splits.
X_train, X_test = (
    torch.tensor(
        seq_to_emb(sentences=split["data"], wv=wv, tokens_num=tokens_num)
    )
    for split in (train, test)
)

# Targets are stored as floats because the model is trained with binary
# cross-entropy against probabilities.
y_train, y_test = (
    torch.tensor(split["target"], dtype=torch.float)
    for split in (train, test)
)

In [27]:
class TextCNN(nn.Module):
    '''
    Convolutional binary classifier over sequences of token embeddings.

    One branch is built per kernel size: a 1D convolution along the token
    axis, followed by average pooling over all positions. The pooled outputs
    of all branches are concatenated and passed through a linear layer with a
    sigmoid, giving one probability per sample.

    Parameters
    ----------
    kernel_sizes: list[int]
        Convolution widths (in tokens); one branch is created for each.
    in_channels: int
        Number of input channels, i.e. the embedding size.
    out_channels: int
        Number of filters in each convolutional branch.
    '''

    def __init__(
        self, 
        kernel_sizes: list[int], 
        in_channels: int, 
        out_channels: int
    ):
        
        super().__init__()

        branches = []
        for width in kernel_sizes:
            convolution = nn.Conv1d(in_channels, out_channels, width)
            # Pooling to length 1 makes the branch output independent of the
            # sequence length; Flatten then drops that singleton axis.
            branches.append(
                nn.Sequential(convolution, nn.AdaptiveAvgPool1d(1), nn.Flatten())
            )
        self.conv_transforms = nn.ModuleList(branches)

        # Each branch contributes `out_channels` pooled features.
        pooled_features = out_channels * len(kernel_sizes)
        self.head = nn.Sequential(
            nn.Linear(pooled_features, 1),
            # (batch, 1) -> (batch,)
            nn.Flatten(start_dim=0),
            nn.Sigmoid(),
        )
    
    def forward(self, X: torch.Tensor):
        '''
        Compute the positive-class probability for each sample.

        Parameters
        ----------
        X: torch.Tensor
            Expected to be of size (<samples>, <in_channels>, <tokens>).

        Returns
        -------
        out: torch.Tensor
            Of size (<samples>,), with values between 0 and 1.
        '''
        pooled = [branch(X) for branch in self.conv_transforms]
        features = torch.cat(pooled, dim=1)
        return self.head(features)

In [None]:
# Fix the seed so that the weight initialisation is reproducible.
torch.manual_seed(10)

text_cnn = TextCNN(kernel_sizes=[2, 3], in_channels=300, out_channels=10)
optimizer = torch.optim.Adam(params=text_cnn.parameters(), lr=1e-3)
bce_loss = nn.BCELoss()

# Full-batch training: each of the 20 steps uses the whole training set.
for step in tqdm(range(20)):
    optimizer.zero_grad()
    predict = text_cnn(X_train)
    loss_value = bce_loss(predict, y_train)
    loss_value.backward()
    optimizer.step()

100%|██████████| 20/20 [00:04<00:00,  4.86it/s]


In [None]:
# Switch to inference mode and skip gradient tracking for evaluation.
text_cnn.eval()

with torch.no_grad():
    test_pred = text_cnn(X_test)

# Threshold the probabilities at 0.5 and report the share of test samples
# whose predicted label matches the target (accuracy).
predicted_labels = (test_pred > 0.5).float()
(predicted_labels == y_test).float().mean()

tensor(0.8635)