In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import numpy as np
import nltk

nltk.download('word_tokenize')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Error loading word_tokenize: Package 'word_tokenize' not
[nltk_data]     found in index
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniyarkurmanbayev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/daniyarkurmanbayev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100, alpha=0.025, window=5, max_vocab_size=None):
        self.vector_size = vector_size
        self.alpha = alpha
        self.window = window
        self.max_vocab_size = max_vocab_size

    def fit(self, X, y=None):
        sentences = [sentence.split(' ') for sentence in X]
        self.model_ = Word2Vec(sentences=sentences,
                               corpus_file=None,
                               vector_size=self.vector_size,
                               alpha=self.alpha,
                               window=self.window,
                               min_count=1,
                               max_vocab_size=self.max_vocab_size)
        return self

    def transform(self, documents):
        words = [doc.split(' ') for doc in documents]
        vectors = [self.model_.wv[word] for word in words]
        return np.reshape(np.array(vectors), (len(vectors), self.vector_size * len(words[0])))

In [3]:
class Doc2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100, alpha=0.025, window=5, max_vocab_size=None):
        self.vector_size = vector_size
        self.alpha = alpha
        self.window = window
        self.max_vocab_size = max_vocab_size

    def fit(self, X, y=None):
        sentences = [sentence.split(' ') for sentence in X]
        d2v_sentences = [TaggedDocument(words, [i]) for i, words in enumerate(sentences)]
        self.model_ = Doc2Vec(documents=d2v_sentences,
                              corpus_file=None,
                              vector_size=self.vector_size,
                              alpha=self.alpha,
                              window=self.window,
                              min_count=1,
                              max_vocab_size=self.max_vocab_size)
        return self

    def transform(self, documents):
        words = [doc.split(' ') for doc in documents]
        vectors = [self.model_.infer_vector(doc) for doc in words]
        return np.reshape(np.array(vectors), (len(words), self.vector_size))

In [4]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
]

In [5]:
data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)

In [6]:
X_train = data_train.data
y_train = data_train.target

X_test = data_test.data
y_test = data_test.target

In [7]:
from nltk.tokenize import word_tokenize
X_train = [word_tokenize(text) for text in X_train]
X_test = [word_tokenize(text) for text in X_test]


In [8]:
from nltk.corpus import stopwords
from string import punctuation

stop_words = set(stopwords.words('english'))
X_train = [[word.lower() for word in text if word not in stop_words if word not in punctuation] for text in X_train]
X_test = [[word.lower() for word in text if word not in stop_words if word not in punctuation] for text in X_test]


In [9]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
X_train = [[lemmatizer.lemmatize(word, pos="a") for word in text] for text in X_train]
X_test = [[lemmatizer.lemmatize(word, pos="a") for word in text] for text in X_test]

In [10]:
X_train = [' '.join(text) for text in X_train]
X_test = [' '.join(text) for text in X_test]

In [11]:
feature_extractors = {
    'name': 'feature_extractor',
    'models': [
        {
            'model': CountVectorizer(),
            'parameters': {
                'decode_error': ['strict', 'ignore', 'replace'],
                'strip_accents': ['ascii', 'unicode'],
                'ngram_range': [(1, 1), (1, 2), (2, 3)],
                'analyzer': ['word', 'char', 'char_wb'],
            }
        },
        {
            'model': TfidfVectorizer(),
            'parameters': {
                'decode_error': ['strict', 'ignore', 'replace'],
                'strip_accents': ['ascii', 'unicode'],
                'ngram_range': [(1, 1), (1, 2), (2, 3)],
                'analyzer': ['word', 'char', 'char_wb']
            }
        },
        {
            'model': Word2VecVectorizer(),
            'parameters': {
                'vector_size': [int(num) for num in np.linspace(50, 150, 5)],
                'alpha': np.linspace(0.01, 0.5, 5),
                'window': [int(num) for num in np.linspace(2, 10, 5)]
            }
        },
        {
            'model': Doc2VecVectorizer(),
            'parameters': {
                'vector_size': [int(num) for num in np.linspace(50, 150, 3)],
                'alpha': np.linspace(0.2, 0.5, 5),
                'window': [int(num) for num in np.linspace(2, 10, 5)]
            }
        }
    ]
}

In [16]:
models = {
    'name': 'classifier',
    'models': [
        {
            'model': MultinomialNB(),
            'parameters': {
                'alpha': np.linspace(0, 1, 3),
                'fit_prior': [True, False],
            }
        },
        {
            'model': LogisticRegression(),
            'parameters': {
                'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                'dual': [True, False],
                'C': np.linspace(0, 1, 4),
                'class_weight': ['balanced', None],
                'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                'multi_class': ['auto', 'ovr', 'multinomial'],
                'random_state': [42]
            }
        },
        {
            'model': SVC(),
            'parameters': {
                'C': np.linspace(0.1, 100, 5),
                'gamma': np.linspace(0.001, 1, 5),
                'kernel': ['rbf', 'poly', 'sigmoid']
            }
        },
        {
            'model': RandomForestClassifier(),
            'parameters': {
                'n_estimators': [50, 100, 150, 300, 500, 1000],
                'max_depth': [None] + [num for num in np.linspace(5, 200, 4)],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['auto', 'sqrt', 'log2'],
                'bootstrap': [True, False],
                'random_state': [42]
            }
        }
    ]
}

In [17]:
results = []

for extractor in feature_extractors['models']:
    for classifier in models['models']:
        pipeline = Pipeline([
            (feature_extractors['name'], extractor['model']),
            (models['name'], classifier['model'])
        ])

        extractor_params = {}
        for key, value in extractor['parameters'].items():
            extractor_params[f'{feature_extractors["name"]}__{key}'] = value

        classifier_params = {}
        for key, value in classifier['parameters'].items():
            classifier_params[f'{models["name"]}__{key}'] = value

        param_grid = {**extractor_params, **classifier_params}

        search_cv = RandomizedSearchCV(pipeline,
                                       param_grid,
                                       n_iter=20,
                                       cv=5,
                                       random_state=42,
                                       n_jobs=3)
        search_cv.fit(X_train, y_train)

        y_pred = search_cv.predict(X_test)
        score = accuracy_score(y_test, y_pred)

        result = {
            'best_extractor': search_cv.best_estimator_.steps[0][1],
            'model': search_cv.best_estimator_.steps[1][1],
            'parameters': search_cv.best_params_,
            'score': score
        }
        print(result)
        results.append(result)

Traceback (most recent call last):
  File "/Users/daniyarkurmanbayev/Documents/GBC/mlenv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/daniyarkurmanbayev/Documents/GBC/mlenv/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/daniyarkurmanbayev/Documents/GBC/mlenv/lib/python3.8/site-packages/sklearn/naive_bayes.py", line 638, in fit
    self._count(X, Y)
  File "/Users/daniyarkurmanbayev/Documents/GBC/mlenv/lib/python3.8/site-packages/sklearn/naive_bayes.py", line 771, in _count
    check_non_negative(X, "MultinomialNB (input X)")
  File "/Users/daniyarkurmanbayev/Documents/GBC/mlenv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1125, in check_non_negative
    raise ValueError("Negative values in data passed to %s" % whom)
ValueError: Negative values in data passed 

{'best_extractor': Doc2VecVectorizer(alpha=0.275, vector_size=50, window=10), 'model': LogisticRegression(random_state=42), 'parameters': {'feature_extractor__window': 10, 'feature_extractor__vector_size': 50, 'feature_extractor__alpha': 0.275, 'classifier__solver': 'lbfgs', 'classifier__random_state': 42, 'classifier__penalty': 'l2', 'classifier__multi_class': 'auto', 'classifier__dual': False, 'classifier__class_weight': None, 'classifier__C': 1.0}, 'score': 0.6631578947368421}
{'best_extractor': Doc2VecVectorizer(alpha=0.275, vector_size=50, window=10), 'model': SVC(C=0.1, gamma=0.75025, kernel='poly'), 'parameters': {'feature_extractor__window': 10, 'feature_extractor__vector_size': 50, 'feature_extractor__alpha': 0.275, 'classifier__kernel': 'poly', 'classifier__gamma': 0.75025, 'classifier__C': 0.1}, 'score': 0.612280701754386}
{'best_extractor': Doc2VecVectorizer(alpha=0.2, vector_size=50, window=6), 'model': RandomForestClassifier(bootstrap=False, max_depth=200.0, min_samples_l

In [18]:
results


[{'best_extractor': Doc2VecVectorizer(alpha=0.275, vector_size=50, window=10),
  'model': LogisticRegression(random_state=42),
  'parameters': {'feature_extractor__window': 10,
   'feature_extractor__vector_size': 50,
   'feature_extractor__alpha': 0.275,
   'classifier__solver': 'lbfgs',
   'classifier__random_state': 42,
   'classifier__penalty': 'l2',
   'classifier__multi_class': 'auto',
   'classifier__dual': False,
   'classifier__class_weight': None,
   'classifier__C': 1.0},
  'score': 0.6631578947368421},
 {'best_extractor': Doc2VecVectorizer(alpha=0.275, vector_size=50, window=10),
  'model': SVC(C=0.1, gamma=0.75025, kernel='poly'),
  'parameters': {'feature_extractor__window': 10,
   'feature_extractor__vector_size': 50,
   'feature_extractor__alpha': 0.275,
   'classifier__kernel': 'poly',
   'classifier__gamma': 0.75025,
   'classifier__C': 0.1},
  'score': 0.612280701754386},
 {'best_extractor': Doc2VecVectorizer(alpha=0.2, vector_size=50, window=6),
  'model': RandomFor