# Word Usage Classifier

In [1]:
#loading the package required
import spacy

#loading the English module
nlp = spacy.load('en')

In [2]:
#retrieving data
import wikipedia

#loading the pages by custom function

def pages_to_sentences(*pages):
    sentences = []
    for page in pages:
        p = wikipedia.page(page)
        doc = nlp(p.content)
        sentences += [sent.text for sent in doc.sents]
    return sentences

animal_sents = pages_to_sentences('Reticulated python', 'Ball python')
language_sents = pages_to_sentences('Python(programming language)')


In [3]:
#creating the data
documents = animal_sents + language_sents

#creating the labels

labels = ['animal']*len(animal_sents) + ['language']*len(language_sents)

In [4]:
#creating the stop words
from spacy.lang.en import STOP_WORDS

#creating lemma of stop words

stop_words_str = "".join(STOP_WORDS)
stop_words_lemma = set(word.lemma_ for word in nlp(stop_words_str)) 

In [5]:
#creating the tokenizer

def lemmatizer(text):
    return [word.lemma_ for word in nlp(text)]

In [14]:
#creating the workflow

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

#instantiating
tfidf1 = TfidfVectorizer(stop_words = stop_words_lemma, tokenizer = lemmatizer , ngram_range = (2,2) )
tfidf2 = TfidfVectorizer(stop_words = stop_words_lemma, tokenizer = lemmatizer , ngram_range = (1,2) )

#creating pipeline
pipe1 = Pipeline([('vectorizer' , tfidf1), ('classifier', MultinomialNB())])
pipe2 = Pipeline([('vectorizer', tfidf2), ('classifier', MultinomialNB())])
#training

pipe1.fit(documents, labels)
pipe2.fit(documents, labels)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words={"amongthereincannottherebynotsomeonemineherselfwhose...namelyregardingninewellthussometimeyourwhymany’llfiftysamefurtheryou‘s’dwhichexceptwhereasunderdonewhatn't"},
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function lemmatizer at 0x00000218A04E30D8>,
                                 

In [15]:
#printing the score

print(f'Training score for bigram: {pipe1.score(documents, labels)}')
print(f'Training score for unigram: {pipe2.score(documents, labels)}')

Training score for bigram: 0.9159663865546218
Training score for unigram: 0.9135654261704682


In [16]:
#testing the classifier on a test sample set
#creating the test data

test_docs = ["My Python program is only 100 bytes long.",
             "A python's bite is not venomous but still hurts.",
             "I can't find the error in the python code.",
             "Where is my pet python; I can't find her!",
             "I use for and while loops when writing Python.",
             "The python will loop and wrap itself onto me.",
             "I use snake case for naming my variables.",
             "My python has grown to over 10 ft long!",
             "I use virtual environments to manage package versions.",
             "Pythons are the largest snakes in the environment."]

class_labels = ['animal', 'language']


In [18]:
#getting the results

y_proba = pipe2.predict_proba(test_docs)
y_proba

array([[0.28588966, 0.71411034],
       [0.3182094 , 0.6817906 ],
       [0.22235938, 0.77764062],
       [0.32069037, 0.67930963],
       [0.11266706, 0.88733294],
       [0.2619799 , 0.7380201 ],
       [0.24797437, 0.75202563],
       [0.43075403, 0.56924597],
       [0.13060872, 0.86939128],
       [0.58375111, 0.41624889]])

In [19]:
#getting the predicted indices

predicted_indices = (y_proba[:,1] > 0.5).astype(int)

for i, index in enumerate(predicted_indices):
    print(test_docs[i], "--> {} at {:g}%". format(class_labels[index], 100*y_proba[i,index]))

My Python program is only 100 bytes long. --> language at 71.411%
A python's bite is not venomous but still hurts. --> language at 68.1791%
I can't find the error in the python code. --> language at 77.7641%
Where is my pet python; I can't find her! --> language at 67.931%
I use for and while loops when writing Python. --> language at 88.7333%
The python will loop and wrap itself onto me. --> language at 73.802%
I use snake case for naming my variables. --> language at 75.2026%
My python has grown to over 10 ft long! --> language at 56.9246%
I use virtual environments to manage package versions. --> language at 86.9391%
Pythons are the largest snakes in the environment. --> animal at 58.3751%
