# Text CNN

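Text CNN is a method for applying a convolutional architecture to text tasks: each text is turned into a sequence of word embeddings, and 1D convolutions slide along the token axis.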

In [None]:
import numpy as np

from gensim import downloader
from gensim.models import KeyedVectors

import torch
from torch import nn

from collections.abc import Collection
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Two newsgroups are selected, and the same category filter is applied
# to both the train and the test split.
categories = ["talk.politics.guns", "rec.motorcycles"]
train, test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ("train", "test")
)

In [68]:
# Load the pretrained "word2vec-google-news-300" vectors via gensim's downloader API.
# NOTE(review): presumably downloaded and cached on first use, and large — confirm before running.
wv = downloader.load("word2vec-google-news-300")

In [None]:
def seq_to_emb(
    sentences: Collection[str],
    wv: "KeyedVectors",
    tokens_num: int,
    pad_token: str = "</s>"
) -> np.ndarray:
    """
    Convert sentences to fixed-width blocks of word embeddings.

    Each sentence is split on whitespace. Tokens unknown to ``wv`` are
    skipped, and the vectors of the first ``tokens_num`` known tokens are
    placed side by side as columns. Sentences with fewer known tokens
    (including none at all) are right-padded with the vector of
    ``pad_token``.

    Parameters
    ----------
    sentences:
        Texts to convert.
    wv:
        Embedding lookup; only ``has_index_for`` and ``get_vector`` are used.
    tokens_num:
        Number of token columns in every output block; must be positive.
    pad_token:
        Key in ``wv`` whose vector fills the unused trailing columns.

    Returns
    -------
    np.ndarray
        Array indexed as ``(sentence, embedding dimension, token position)``.
        For an empty ``sentences`` the first axis has length 0.

    Raises
    ------
    ValueError
        If ``tokens_num`` is less than 1.
    """
    if tokens_num < 1:
        raise ValueError(f"tokens_num must be positive, got {tokens_num}")

    pad_vector = wv.get_vector(pad_token)

    rv = []
    for sentence in sentences:
        columns = []
        for one_token in sentence.split():
            if not wv.has_index_for(one_token):
                continue
            columns.append(wv.get_vector(one_token))
            # Only the first `tokens_num` known tokens are kept.
            if len(columns) == tokens_num:
                break

        # Pad before stacking so that a sentence without any known token
        # still yields a full block instead of failing on an empty stack.
        columns.extend([pad_vector] * (tokens_num - len(columns)))
        rv.append(np.stack(columns, axis=1))

    if not rv:
        # np.stack cannot handle an empty list; build the empty result directly.
        return np.empty(
            (0, pad_vector.shape[0], tokens_num), dtype=pad_vector.dtype
        )
    return np.stack(rv)

In [None]:
# Embed every training document as a fixed-width block of 50 token vectors.
# The result is indexed (document, embedding dimension, token position).
ans = seq_to_emb(
    sentences=train["data"], wv=wv, tokens_num=50
)

In [None]:
# in_channels=300 matches the embedding axis of `ans`; the convolution slides
# along the token axis. With 50 tokens and kernel_size=10 (default stride and
# no padding) each of the 10 filters yields 50 - 10 + 1 = 41 output positions.
conv1d = nn.Conv1d(in_channels=300, out_channels=10, kernel_size=10)
conv1d(torch.tensor(ans)).shape

torch.Size([1144, 10, 41])