# NLP

In [3]:
#importing essential libraries
import spacy

#load the small English pipeline by its full package name; the 'en' shortcut
#link is no longer available in spaCy v3
nlp = spacy.load('en_core_web_sm')

#importing the data
import wikipedia

#define a function

def pages_to_sentences(*pages):
    """Fetch Wikipedia pages and split their text into sentences.

    Each title in ``pages`` is looked up with ``wikipedia.page``; the page
    content is run through the module-level ``nlp`` pipeline and the text of
    every sentence it yields is collected.

    Parameters
    ----------
    *pages : str
        Titles of the Wikipedia pages to retrieve.

    Returns
    -------
    list of str
        Sentence texts of all pages, in the order the pages were given.
        Empty when no titles are passed.
    """
    collected = []
    for title in pages:
        article = wikipedia.page(title)
        parsed = nlp(article.content)
        # keep only the raw text of each sentence span
        collected.extend(span.text for span in parsed.sents)
    return collected

In [4]:
#retrieving the pages and data
#titles are spelled exactly like the Wikipedia article titles so the lookup
#does not have to rely on the library's fuzzy search to find the right page

animal_sentences = pages_to_sentences('Reticulated python', 'Ball python')
language_sentences = pages_to_sentences('Python (programming language)')

In [5]:
#one flat corpus: animal sentences first, then the language sentences

documents = [*animal_sentences, *language_sentences]

In [7]:
#checking it out
#only the last expression of a cell is echoed, so the first preview is
#rendered explicitly with display()
display(animal_sentences[:5])
language_sentences[:5]

['Python is an interpreted, high-level, general-purpose programming language.',
 "Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace.",
 'Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects.',
 'Python is dynamically typed and garbage-collected.',
 'It supports multiple programming paradigms, including procedural, object-oriented, and  functional programming.']

In [8]:
#importing the CountVectorizer 

from sklearn.feature_extraction.text import CountVectorizer

#creating the bag of words model

bag_of_words = CountVectorizer()

#learn the vocabulary and build the sentence-by-word count matrix in a single
#pass instead of tokenising the corpus once for fit() and again for transform()
word_counts = bag_of_words.fit_transform(documents)

In [15]:
#printing the sparse matrix lists its stored entries as "(row, column)  count"
print(word_counts)

  (0, 224)	1
  (0, 275)	1
  (0, 960)	1
  (0, 1234)	1
  (0, 1327)	1
  (0, 1498)	1
  (0, 1648)	1
  (0, 1964)	1
  (0, 1967)	1
  (0, 2085)	1
  (0, 2087)	1
  (0, 2281)	1
  (0, 2304)	1
  (0, 2305)	1
  (0, 2315)	1
  (0, 2477)	2
  (0, 2510)	1
  (1, 224)	1
  (1, 274)	1
  (1, 339)	1
  (1, 587)	1
  (1, 780)	1
  (1, 1327)	1
  (1, 1332)	1
  (1, 1344)	1
  :	:
  (824, 1328)	1
  (825, 51)	1
  (825, 1523)	1
  (825, 2412)	1
  (826, 1234)	1
  (826, 1923)	1
  (826, 1964)	1
  (827, 71)	1
  (827, 825)	1
  (828, 160)	1
  (828, 1916)	1
  (828, 2658)	1
  (829, 77)	1
  (829, 114)	1
  (829, 129)	1
  (829, 1328)	1
  (831, 950)	1
  (831, 1448)	1
  (832, 289)	1
  (832, 672)	1
  (832, 1396)	1
  (832, 1727)	1
  (832, 1923)	1
  (832, 1964)	1
  (832, 2648)	1


In [12]:
#checking the size: one row per sentence in documents, one column per vocabulary word
print(word_counts.shape)

(833, 2730)


In [13]:
# checking the type: the counts are held in a SciPy sparse matrix, not a dense array
type(word_counts)

scipy.sparse.csr.csr_matrix

In [17]:
#listing the vocabulary words; the position of a word in this list is its column index
#get_feature_names() was removed in scikit-learn 1.2, so use its replacement
#(needs scikit-learn >= 1.0); tolist() keeps the output a plain list of strings
bag_of_words.get_feature_names_out().tolist()

['000',
 '10',
 '11',
 '111',
 '116',
 '12',
 '125',
 '13',
 '130',
 '14',
 '15',
 '1500',
 '158',
 '15806',
 '16',
 '165',
 '17',
 '18',
 '1801',
 '1802',
 '1803',
 '182',
 '1830',
 '1849',
 '19',
 '1910s',
 '1927',
 '1932',
 '1950s',
 '1956',
 '1960s',
 '1963',
 '1972',
 '1978',
 '1980',
 '1980s',
 '1989',
 '1991',
 '1992',
 '1993',
 '1995',
 '20',
 '200',
 '2000',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2015',
 '2017',
 '2018',
 '2019',
 '2020',
 '206',
 '20th',
 '21',
 '22',
 '23',
 '2415',
 '25',
 '28',
 '29',
 '2d',
 '2nd',
 '2to3',
 '30',
 '300',
 '31',
 '32',
 '321',
 '333',
 '35',
 '350',
 '356',
 '36',
 '3d',
 '3ds',
 '3rd',
 '40',
 '400',
 '42',
 '43',
 '4302',
 '44',
 '45',
 '47',
 '48',
 '4d',
 '50',
 '500',
 '51',
 '521',
 '53',
 '54',
 '55',
 '561',
 '59',
 '59059',
 '596',
 '5th',
 '60',
 '600',
 '604259',
 '61',
 '635',
 '67',
 '68056',
 '69',
 '70',
 '72',
 '72596',
 '74',
 '75',
 '79',
 '80',
 '800

In [18]:
#checking which word is mapped to a given column index
#get_feature_names() was removed in scikit-learn 1.2, so use its replacement
#(needs scikit-learn >= 1.0); str() gives back a plain Python string
str(bag_of_words.get_feature_names_out()[1000])

#judging by the result, this word most likely comes from the articles about the animal python.

'floor'

In [19]:
#reverse mapping: vocabulary_ is a dict from each word to its column index
bag_of_words.vocabulary_

{'the': 2477,
 'reticulated': 2085,
 'python': 1964,
 'malayopython': 1498,
 'reticulatus': 2087,
 'is': 1327,
 'snake': 2281,
 'species': 2315,
 'in': 1234,
 'family': 960,
 'pythonidae': 1967,
 'native': 1648,
 'to': 2510,
 'south': 2304,
 'and': 224,
 'southeast': 2305,
 'asia': 275,
 'it': 1332,
 'world': 2699,
 'longest': 1472,
 'listed': 1452,
 'as': 274,
 'least': 1419,
 'concern': 587,
 'on': 1735,
 'iucn': 1346,
 'red': 2016,
 'list': 1451,
 'because': 339,
 'of': 1722,
 'its': 1344,
 'wide': 2677,
 'distribution': 780,
 'several': 2209,
 'range': 1983,
 'countries': 656,
 'hunted': 1192,
 'for': 1011,
 'skin': 2267,
 'use': 2588,
 'traditional': 2524,
 'medicine': 1553,
 'sale': 2138,
 'pet': 1827,
 'an': 219,
 'excellent': 912,
 'swimmer': 2427,
 'has': 1143,
 'been': 341,
 'reported': 2055,
 'far': 962,
 'out': 1770,
 'at': 289,
 'sea': 2172,
 'colonized': 530,
 'many': 1517,
 'small': 2277,
 'islands': 1330,
 'within': 2690,
 'among': 217,
 'three': 2498,
 'heaviest': 1155

In [22]:
#looking up the column index assigned to a specific word
bag_of_words.vocabulary_['memory']

1557

In [23]:
#counting how often a word occurs in each corpus

#vectorising each corpus separately with the already-fitted vocabulary

count_animals = bag_of_words.transform(animal_sentences)
count_language = bag_of_words.transform(language_sentences)

#column index of the word used for the test

index_of_programming = bag_of_words.vocabulary_['programming']

#summing over the rows gives per-word totals; print the total for the test word,
#animal corpus first and language corpus second

for corpus_counts in (count_animals, count_language):
    column_totals = corpus_counts.sum(axis=0)
    print(column_totals[0, index_of_programming])

0
32
