### Extracting the frequency of terms using the Bag of Words model

In [4]:
%cd C:\Users\U\Artificial Intelligence\data
# Change the working directory so the `text_chunker` module imported in the
# next cell can be found.
# NOTE(review): this is a machine-specific absolute Windows path; the notebook
# will not run on another machine until this line is adapted.

C:\Users\U\Artificial Intelligence\data


In [5]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import brown
from text_chunker import chunker

In [6]:
# Size handed to chunker() in the next cell
# (presumably the number of words per chunk -- confirm against text_chunker)
chunk_size = 800

# Join the first 5400 tokens of the Brown corpus into one space-separated string
input_data = ' '.join(
    brown.words()[:5400]
)

In [7]:
# Split the joined text into pieces using the project-local `chunker` helper.
# NOTE(review): chunk_size presumably counts words per chunk -- confirm against text_chunker.
text_chunks = chunker(input_data, chunk_size)

In [8]:
# Wrap each chunk in a dict recording its position ('index') and its content ('text')
chunks = [
    {'index': position, 'text': piece}
    for position, piece in enumerate(text_chunks)
]

In [9]:
# Build the document-term matrix: one row per chunk, one column per retained word.
# The integer min_df / max_df thresholds are document counts (see scikit-learn's
# CountVectorizer documentation), so a word must occur in at least 7 and at
# most 20 chunks to be kept.
count_vectorizer = CountVectorizer(min_df=7, max_df=20)
document_term_matrix = count_vectorizer.fit_transform(
    [entry['text'] for entry in chunks]
)

In [11]:
#Extract the vocabulary and display it
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases that
# do not provide it yet.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    vocabulary = np.asarray(count_vectorizer.get_feature_names_out())
else:
    vocabulary = np.array(count_vectorizer.get_feature_names())
print(f'Vocabulary = {vocabulary}')

Vocabulary = ['and' 'are' 'be' 'by' 'county' 'for' 'in' 'is' 'it' 'of' 'on' 'one'
 'said' 'state' 'that' 'the' 'to' 'two' 'was' 'which' 'with']


In [25]:
# Human-readable, 1-based column labels: 'Chunk-1', 'Chunk-2', ...
chunk_names = ['Chunk-' + str(number) for number in range(1, len(text_chunks) + 1)]

In [26]:
print('Document term matrix:\n')

# One right-aligned, 12-character-wide column for the word plus one per chunk
formatted_text = '{:>12}' * (len(chunk_names) + 1)
print(formatted_text.format('Word', *chunk_names), '\n')

# Densify before printing: the `.data` attribute of a sparse row holds only the
# stored (non-zero) entries, so a word that is absent from any chunk would give
# too few values for the row template and break or misalign the table.
# The dense array keeps explicit zeros; transposing yields one row per word.
dense_counts = document_term_matrix.toarray().T
for word, counts in zip(vocabulary, dense_counts):
    output = [word] + [str(freq) for freq in counts]
    print(formatted_text.format(*output))

Document term matrix:

        Word     Chunk-1     Chunk-2     Chunk-3     Chunk-4     Chunk-5     Chunk-6     Chunk-7 

         and          23           9           9          11           9          17          10
         are           2           2           1           1           2           2           1
          be           6           8           7           7           6           2           1
          by           3           4           4           5          14           3           6
      county           6           2           7           3           1           2           2
         for           7          13           4          10           7           6           4
          in          15          11          15          11          13          14          17
          is           2           7           3           4           5           5           2
          it           8           6           8           9           3           1           2
     

### Building a category predictor

In [27]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import TfidfTransformer

In [28]:
# Map each newsgroup name to the short label printed for predictions
category_map = dict([
    ('talk.politics.misc', 'Politics'),
    ('rec.autos', 'Autos'),
    ('rec.sport.hockey', 'Hockey'),
    ('sci.electronics', 'Electronics'),
    ('sci.med', 'Medicine'),
])

# Load the training split restricted to the mapped newsgroups;
# the fixed random_state keeps the shuffled order the same between runs.
training_data = fetch_20newsgroups(
    subset='train',
    categories=category_map.keys(),
    shuffle=True,
    random_state=5,
)

In [29]:
# Fit a fresh, default-configured vectorizer on the newsgroup posts and extract term counts.
# Note: this rebinds `count_vectorizer`, replacing the one fitted on the Brown chunks above.
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(training_data.data)

print(f'Dimensions of training data:  {train_tc.shape}')

Dimensions of training data:  (2844, 40321)


In [30]:
# Learn the tf-idf weighting from the training term counts, then apply it to them
tfidf = TfidfTransformer().fit(train_tc)
train_tfidf = tfidf.transform(train_tc)

In [31]:
# Fit a Multinomial Naive Bayes model on the tf-idf features and the newsgroup labels
classifier = MultinomialNB().fit(
    X=train_tfidf,
    y=training_data.target,
)

In [32]:
# Sentences to classify.
# Note: this rebinds `input_data`, which held the joined Brown text in the first section.
input_data = [
    "You need to be careful with cars when you are driving on slippery roads",
    "A lot of devices can be operated wirelessly",
    "Players need to be careful when they are close to goal posts",
    "Political debates help us understand the perspectives of both sides",
]

In [33]:
# Reuse the vectorizer and tf-idf transformer fitted on the training data
# (transform only, no re-fitting) so the sentences share its feature space.
input_tc = count_vectorizer.transform(raw_documents=input_data)
input_tfidf = tfidf.transform(X=input_tc)

# One predicted label index per input sentence
predictions = classifier.predict(X=input_tfidf)

In [35]:
# Show each sentence with its predicted label:
# label index -> newsgroup name -> display label
for text, label_index in zip(input_data, predictions):
    newsgroup = training_data.target_names[label_index]
    print('\nInput: ', text)
    print('Predicted Category: ', category_map[newsgroup])


Input:  You need to be careful with cars when you are driving on slippery roads
Predicted Category:  Autos

Input:  A lot of devices can be operated wirelessly
Predicted Category:  Electronics

Input:  Players need to be careful when they are close to goal posts
Predicted Category:  Hockey

Input:  Political debates help us understand the perspectives of both sides
Predicted Category:  Politics
