# Text CNN

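Text CNN is a method for applying a convolutional architecture to text tasks: each text is first turned into a sequence of word embeddings, which the convolutions then operate on.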

In [1]:
import numpy as np
from gensim import downloader
from gensim.models import KeyedVectors
from collections.abc import Collection
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Two-class subset of 20 Newsgroups; both splits are fetched with the same
# category filter so their labels line up.
categories = ["talk.politics.guns", "rec.motorcycles"]
train, test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ("train", "test")
)

In [None]:
# Pretrained word vectors fetched through gensim's downloader API. The model
# name and the (10, 100, 300) shape printed by the sanity-check cell further
# down both point to 300-dimensional embeddings.
# NOTE(review): presumably a large download on first use - confirm the size
# and where gensim caches it before running this cell.
wv = downloader.load("word2vec-google-news-300")

def seq_to_emb(
    sentences: Collection[str],
    wv: KeyedVectors,
    tokens_num: int,
    pad_token: str = "</s>",
) -> np.ndarray:
    """Convert sentences into a fixed-size array of word embeddings.

    Each sentence is split on whitespace. Tokens that ``wv`` has no index
    for are skipped; the vectors of the first ``tokens_num`` known tokens are
    kept in order, and shorter sentences are right-padded with the vector of
    ``pad_token``. A sentence with no known tokens at all (including the
    empty string) becomes a block made entirely of padding.

    Args:
        sentences: Texts to embed, one string per item.
        wv: Word vectors exposing ``has_index_for`` and ``get_vector``.
        tokens_num: Number of token positions per sentence in the output.
        pad_token: Key whose vector fills unused positions. It is looked up
            unconditionally, so it must be present in ``wv``.

    Returns:
        Array of shape ``(len(sentences), tokens_num, dim)``, where ``dim``
        is the length of the pad vector. The array uses the pad vector's
        dtype.

    Raises:
        ValueError: If ``tokens_num`` is negative.
    """
    if tokens_num < 0:
        raise ValueError(f"tokens_num must be non-negative, got {tokens_num}")

    # Materialise once so any iterable works and the batch size is known.
    sentences = list(sentences)
    pad_vector = np.asarray(wv.get_vector(pad_token))

    # Start from an all-padding array and overwrite the leading positions of
    # each sentence. This avoids stacking an empty list when a sentence has
    # no known tokens, and handles an empty batch without special cases.
    embeddings = np.empty(
        (len(sentences), tokens_num, pad_vector.shape[-1]),
        dtype=pad_vector.dtype,
    )
    embeddings[...] = pad_vector

    for row, sentence in enumerate(sentences):
        filled = 0
        for token in sentence.split():
            # Only the first `tokens_num` in-vocabulary tokens are used.
            if filled >= tokens_num:
                break
            if wv.has_index_for(token):
                embeddings[row, filled] = wv.get_vector(token)
                filled += 1

    return embeddings

In [10]:
# Sanity check on the first 10 training documents: the result should hold one
# (tokens_num, embedding_dim) matrix per document.
seq_to_emb(train["data"][:10], wv=wv, tokens_num=100).shape

(10, 100, 300)

In [None]:
# NOTE(review): `ans` is not defined in any cell visible here - presumably a
# leftover from an earlier session; confirm or define it before this cell.
# The doubled parentheses are plain grouping, so pad_width is just (0, 200).
# A single (before, after) pair applies to every axis, so each dimension
# gains 200 trailing entries (zeros under np.pad's default "constant" mode),
# not only the first one.
np.pad(ans, pad_width=((0, 200))).shape

(297, 500)