# Implementing GridSearch

In [9]:
#retrieving data
import wikipedia
import spacy

# Load the English pipeline by its package name. The bare 'en' shortcut link
# was removed in spaCy 3, while the explicit name loads on both v2 and v3.
nlp = spacy.load('en_core_web_sm')
#custom function that loads the pages and splits them into sentences

def pages_to_sentences(*pages):
    """Collect the sentences of one or more Wikipedia articles.

    Every title in ``pages`` is looked up with ``wikipedia.page``; the
    article's ``content`` is run through the module-level ``nlp`` pipeline and
    the text of each sentence span is appended to the result.

    Args:
        *pages: Article titles to look up.

    Returns:
        A flat list of sentence strings, in the order the pages were given.
    """
    collected = []
    for title in pages:
        article = wikipedia.page(title)
        parsed = nlp(article.content)
        for span in parsed.sents:
            collected.append(span.text)
    return collected

# Sentences from the two snake articles make up the 'animal' class.
animal_sents = pages_to_sentences('Reticulated python', 'Ball python')
# Use the article's exact title (note the space before the parenthesis) so the
# lookup does not have to be rescued by a search suggestion or redirect.
language_sents = pages_to_sentences('Python (programming language)')

#creating the data
documents = animal_sents + language_sents

#creating the labels
# One label per sentence, in the same order as `documents`.
labels = ['animal']*len(animal_sents) + ['language']*len(language_sents)

In [4]:
#defining the lemmatizer

def lemmatizer(text):
    """Return the lemma of every token in ``text``.

    The text is processed with the module-level ``nlp`` pipeline and the
    ``lemma_`` of each resulting token is collected in token order.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one lemma string per token.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return lemmas

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the sentences; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    documents,
    labels,
    test_size=0.2,
    random_state=0,
)

# Two-step model: TF-IDF features feeding a multinomial naive Bayes classifier.
pipe = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer()),
        ('classifier', MultinomialNB()),
    ]
)

# Candidate settings, addressed as <step name>__<parameter>:
# unigrams vs. unigrams+bigrams, and the default tokenizer vs. the lemmatizer.
param_grids = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__tokenizer': [None, lemmatizer],
}

# 5-fold cross-validated search over every combination in the grid.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grids,
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  2.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                    

In [6]:
# Keep the fitted search's cv_results_ attribute under a shorter name for later inspection.
results = grid_search.cv_results_

In [8]:
# Report the winning parameter combination, then its score, one per line.
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

{'vectorizer__ngram_range': (1, 1), 'vectorizer__tokenizer': None}
0.8048048048048048
