In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from tqdm import tqdm
from sklearn import utils as skl_utils
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
import six
import nltk

nltk.download('word_tokenize')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Error loading word_tokenize: Package 'word_tokenize' not
[nltk_data]     found in index
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniyarkurmanbayev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/daniyarkurmanbayev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100, alpha=0.025, window=5, max_vocab_size=None):
        self.vector_size = vector_size
        self.alpha = alpha
        self.window = window
        self.max_vocab_size = max_vocab_size

    def fit(self, X, y=None):
        sentences = [sentence.split(' ') for sentence in X]
        self.model_ = Word2Vec(sentences=sentences,
                               corpus_file=None,
                               vector_size=self.vector_size,
                               alpha=self.alpha,
                               window=self.window,
                               min_count=1,
                               max_vocab_size=self.max_vocab_size)
        return self

    def transform(self, documents):
        words = [doc.split(' ') for doc in documents]
        print(words)
        vectors = [self.model_.wv[word] for word in words]
        print(np.reshape(np.array(vectors), (len(vectors), self.vector_size * len(words[0]))))
        return np.reshape(np.array(vectors), (len(vectors), self.vector_size * len(words[0])))

In [4]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
]

In [5]:
data = fetch_20newsgroups(subset='train', categories=categories)

In [6]:
from nltk.tokenize import word_tokenize
X_tokenized = word_tokenize(data.data[0])

In [7]:
from nltk.corpus import stopwords
from string import punctuation

stop_words = set(stopwords.words('english'))
X_cleaned = [word.lower() for word in X_tokenized if word not in stop_words if word not in punctuation]

In [8]:
X_string = ' '.join(X_cleaned)


In [9]:
documents = [X_cleaned]
w2v = Word2Vec(sentences=documents,
               min_count=1)

vectors = [w2v.wv[word] for word in X_cleaned]
np.reshape(np.array(vectors), (len(X_cleaned), 100))


array([[ 0.00425052,  0.00040023, -0.0009062 , ...,  0.00015964,
        -0.00212286,  0.00620934],
       [ 0.00010218,  0.00307842, -0.00679794, ...,  0.00051521,
         0.0082068 , -0.00703009],
       [ 0.00181853,  0.00704764,  0.00298202, ..., -0.00186388,
         0.00360325, -0.00707316],
       ...,
       [-0.00862243,  0.00366124,  0.00519292, ..., -0.00238297,
        -0.00952568,  0.00449805],
       [-0.00947526,  0.00955806, -0.00772746, ..., -0.00312549,
        -0.00636282,  0.00983518],
       [ 0.00772606,  0.00911873,  0.00116021, ...,  0.00826411,
        -0.00610871,  0.0094077 ]], dtype=float32)

In [12]:
pipeline = Pipeline([
    ('w2w', Word2VecVectorizer(vector_size=50, alpha=0.2, window=10)),
    ('model', LogisticRegression())
])

pipeline.fit([X_string, X_string + 'test'], [1, 0])


[['from', 'mangoe', 'cs.umd.edu', 'charley', 'wingate', 'subject', 'benediktine', 'metaphysics', 'lines', '24', 'benedikt', 'rosenau', 'writes', 'great', 'authority', 'if', 'it', 'is', 'contradictory', 'it', 'can', 'not', 'exist', '``', 'contradictory', "''", 'property', 'language', 'if', 'i', 'correct', 'things', 'defined', 'by', 'contradictory', 'language', 'do', 'not', 'exist', 'i', 'object', 'definitions', 'reality', 'if', 'amend', 'things', 'described', 'by', 'contradictory', 'language', 'do', 'not', 'exist', "'ve", 'come', 'something', 'plainly', 'false', 'failures', 'description', 'merely', 'failures', 'description', 'i', "'m", 'objectivist', 'remember', '--', 'c.', 'wingate', '``', 'the', 'peace', 'god', 'peace', 'strife', 'closed', 'sod', 'mangoe', 'cs.umd.edu', 'yet', 'brothers', 'pray', 'one', 'thing', 'tove', 'mangoe', "marv'lous", 'peace', 'god', "''"], ['from', 'mangoe', 'cs.umd.edu', 'charley', 'wingate', 'subject', 'benediktine', 'metaphysics', 'lines', '24', 'benedikt'

Pipeline(steps=[('w2w',
                 Word2VecVectorizer(alpha=0.2, vector_size=50, window=10)),
                ('model', LogisticRegression())])