In [1]:
#loading package

import spacy

nlp = spacy.load('en')

In [3]:
#laoding data

import wikipedia

def pages_to_sentences(*pages):
    sentences = []
    for page in pages:
        p = wikipedia.page(page)
        doc = nlp(p.content)
        sentences += [sent.text for sent in doc.sents]
    return sentences

greek_sents = pages_to_sentences('Greek Mythology', 'Amazons')
river_sents = pages_to_sentences('Amazon River')
company_sents = pages_to_sentences('Amazon.com,Inc.')

In [13]:
#creating the data
document = greek_sents + river_sents + company_sents

#creating the labels

labels = ['Greek Mythology']*len(greek_sents) + ['Amazon_River']*len(river_sents) + ['Amazon_Company']*len(company_sents)

In [12]:
#cross-checking

print(len(greek_sents))
print(greek_sents[30])
print('\n')
print(len(river_sents))
print(river_sents[30])
print('\n')
print(len(company_sents))
print(company_sents[400])

995
Hesiod, a possible contemporary with Homer, offers in his  Theogony  (Origin of the Gods) the fullest account of the earliest Greek myths, dealing with the creation of the world; the origin of the gods, Titans, and Giants; as well as elaborate genealogies, folktales, and etiological myths.


442
Early human settlements were typically based on low-lying hills or mounds.



555
Bad Employers by Zeroing Out Subsidies


In [15]:
print(len(document))
print(len(labels))

1992
1992


In [16]:
#stop words function
from spacy.lang.en import STOP_WORDS

stop_words_amazon = ''.join(STOP_WORDS)

stop_words_amazon_lemma = set(i.lemma_ for i in nlp(stop_words_amazon))

#lemmatizer function

def lemmatizer(text):
    return [word.lemma_ for word in nlp(text)]

In [21]:
#creating a pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

tfidf = TfidfVectorizer(stop_words = stop_words_amazon_lemma, tokenizer = lemmatizer, ngram_range = (1,2))


X_train, X_test, y_train, y_test = train_test_split(document, labels, test_size=0.2)

pipe = Pipeline([
    ('vectorizer', tfidf),
    ('classifier', MultinomialNB())
])

pipe.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words={"indeedvariousmeanwhileonlynexteveryonefirstsoalonep...aroundthenceabovethusherselfbackthateverywheremightmorenotwouldsomewhereforty’dtheren‘tn’tamountamthemyour"},
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function lemmatizer at 0x000001FA4B743318>,
                                 

In [22]:
#printing the score on training set

print(f'The score for training set:{pipe.score(X_train, y_train)}')

The score for training set:0.7909604519774012


In [41]:
#making a grid

grid_parameters = {'vectorizer__ngram_range': [(1,1) , (1,2)],
                  'vectorizer__tokenizer': [None, lemmatizer]}

grid_search = GridSearchCV(pipe, grid_parameters, cv=5, verbose=1)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  2.5min finished
  'stop_words.' % sorted(inconsistent))


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                    

In [42]:
#scores for the 
print(grid_search.best_params_)
print(grid_search.best_score_)

{'vectorizer__ngram_range': (1, 1), 'vectorizer__tokenizer': None}
0.7049591964846202


In [43]:
#scores for the pipeline model
print(pipe.score(X_test, y_test))

0.6165413533834586


In [32]:
y_probability = pipe.predict_proba(X_test)
y_probability.shape

(399, 3)

In [49]:
grid_probabilty  = grid_search.predict_proba(X_test)
grid_probabilty

array([[0.09136707, 0.08133639, 0.82729653],
       [0.6395364 , 0.09623164, 0.26423196],
       [0.03872368, 0.0422475 , 0.91902883],
       ...,
       [0.27934714, 0.22033898, 0.50031387],
       [0.36722953, 0.14537603, 0.48739443],
       [0.36496164, 0.21334202, 0.42169634]])

In [50]:
#printing the results for the pipeline

prediction_indices = (y_probability[:,2] > 0.5).astype(int)


for i, index in enumerate(prediction_indices):
    print(X_test[i], "--> {} at {:g}%". format(y_test[index], 100*y_probability[i,index]))

Cicero is also generally disdainful of myth, but, like Varro, he is emphatic in his support for the state religion and its institutions. --> Amazon_Company at 3.35563%
In an effort to boost employee morale, on November 2, 2015, Amazon announced that it would be extending six weeks of paid leave for new mothers and fathers. --> Greek Mythology at 47.9665%
After the battle Scythians sent young men and told them to encamp near the Amazons and to do whatsoever they should do. --> Amazon_Company at 6.6803%
= --> Amazon_Company at 16.9101%
Employees are responsible for five basic tasks: unpacking and inspecting incoming goods; placing goods in storage and recording their location; picking goods from their computer recorded locations to make up an individual shipment; sorting and packing orders; and shipping. --> Amazon_Company at 9.52997%
Wary of foreign exploitation of the nation's resources, Brazilian governments in the 1940s set out to develop the interior, away from the seaboard where fo

In [56]:
#printing the results for GridSearch

prediction_indices = (grid_probabilty[:,2] > 0.5).astype(int)


for i, index in enumerate(prediction_indices):
    print(X_test[i], "--> {} at {:g}%". format(y_test[index], 100*grid_y_probabilty[i,index]))

Cicero is also generally disdainful of myth, but, like Varro, he is emphatic in his support for the state religion and its institutions. --> Amazon_Company at 8.13364%
In an effort to boost employee morale, on November 2, 2015, Amazon announced that it would be extending six weeks of paid leave for new mothers and fathers. --> Greek Mythology at 63.9536%
After the battle Scythians sent young men and told them to encamp near the Amazons and to do whatsoever they should do. --> Amazon_Company at 4.22475%
= --> Amazon_Company at 22.0339%
Employees are responsible for five basic tasks: unpacking and inspecting incoming goods; placing goods in storage and recording their location; picking goods from their computer recorded locations to make up an individual shipment; sorting and packing orders; and shipping. --> Greek Mythology at 55.2677%
Wary of foreign exploitation of the nation's resources, Brazilian governments in the 1940s set out to develop the interior, away from the seaboard where 

In [54]:
grid_probabilty.shape

(399, 3)

In [55]:
grid_probabilty

array([[0.09136707, 0.08133639, 0.82729653],
       [0.6395364 , 0.09623164, 0.26423196],
       [0.03872368, 0.0422475 , 0.91902883],
       ...,
       [0.27934714, 0.22033898, 0.50031387],
       [0.36722953, 0.14537603, 0.48739443],
       [0.36496164, 0.21334202, 0.42169634]])

In [57]:
grid_search.best_params_

{'vectorizer__ngram_range': (1, 1), 'vectorizer__tokenizer': None}