In [1]:
import nltk

In [2]:
# Fetch the Punkt tokenizer models that the nltk.sent_tokenize / nltk.word_tokenize
# calls further down rely on. The call returns True and does nothing else when the
# package is already installed and up to date.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gargi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Fetch the averaged-perceptron part-of-speech tagger model that the nltk.pos_tag
# call in the last cell relies on. Returns True once the package is available locally.
# NOTE(review): newer NLTK releases may look up 'averaged_perceptron_tagger_eng'
# instead - confirm against the installed NLTK version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gargi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Frequency Distribution


In [8]:
# Sample paragraph used for the frequency-distribution demo.
text1 = 'Natural language processing (NLP) is an interdisciplinary subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.'
# Count how often each whitespace-separated token occurs. str.split() does not
# strip punctuation or fold case, so 'The' and 'the' are counted separately and
# tokens such as 'linguistics,' keep their trailing comma.
fd = nltk.FreqDist(text1.split())
# Bare last expression: display the distribution.
fd

FreqDist({'the': 6, 'of': 5, 'and': 5, 'language': 3, 'is': 2, 'computer': 2, 'computers': 2, 'in': 2, 'to': 2, 'The': 2, ...})

# Conditional Frequency Distribution

In [10]:
from nltk.probability import ConditionalFreqDist as confd

# Build (condition, sample) pairs where the condition is the token's length and
# the sample is the token itself, so cfd[n] is the frequency distribution of all
# whitespace-separated tokens that are n characters long.
cfd = confd(
    (len(token), token)
    for token in text1.split()
)

# Show the distribution of the four-character tokens.
cfd[4]

FreqDist({'with': 1, 'goal': 1, 'then': 1, 'well': 1})

## Task: Determine the frequency distribution (FD) and conditional frequency distribution (CFD) of one of the Presidential Inaugural addresses

In [11]:
from nltk.corpus import inaugural


In [12]:
# Load the tokens of the 2017 inaugural address from the NLTK `inaugural` corpus.
# Note: this rebinds `text1`, which the earlier frequency-distribution cell bound
# to a plain string; from here on `text1` is a sequence of tokens, so re-running
# that earlier cell's `text1.split()` afterwards would no longer work.
text1 = inaugural.words(fileids='2017-Trump.txt')
# Token frequencies for the address. Punctuation marks are tokens in this corpus
# view, which is why ',' and '.' top the counts.
fd = nltk.FreqDist(text1)
# Bare last expression: display the distribution.
fd

FreqDist({',': 106, '.': 89, 'and': 70, 'the': 65, 'of': 48, 'our': 47, 'will': 43, 'to': 37, 'We': 26, 'we': 24, ...})

In [13]:
# Same alias as in the earlier conditional-frequency cell; re-importing is harmless.
from nltk.probability import ConditionalFreqDist as confd

# Group the address's tokens by length: the condition is len(token) and the
# sample is the token, so cfd[n] counts every distinct n-character token.
cfd = confd([(len(token), token) for token in text1])

# Show the distribution of the seven-character tokens.
cfd[7]


FreqDist({'America': 20, 'country': 9, 'whether': 3, 'foreign': 3, 'nations': 3, 'borders': 3, 'another': 2, 'Capital': 2, 'belongs': 2, 'success': 2, ...})

## Chinese Segmentation using Jieba

In [15]:
pip install jieba

Note: you may need to restart the kernel to use updated packages.


In [16]:
import jieba

# Segment the sentence in jieba's "full" mode (cut_all=True), which emits every
# dictionary word it can find, so overlapping candidates can appear side by side.
# jieba.cut returns a lazy generator; it is consumed once by the print below.
seg = jieba.cut("史密斯是王明的朋友", cut_all=True)

# Print the segments separated by single spaces.
print(*seg)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\gargi\AppData\Local\Temp\jieba.cache
Loading model cost 0.862 seconds.
Prefix dict has been built successfully.


史密斯 密斯 是 王明 的 朋友


In [17]:
# Word tokenization of a single sentence
import nltk  # already imported in the first cell; repeating the import is harmless
sent = "Become an expert in NLP"
# Split the sentence into word tokens with NLTK's word tokenizer.
words = nltk.word_tokenize(sent)
print(words)

['Become', 'an', 'expert', 'in', 'NLP']


In [18]:
# Sentence tokenization

# To process more than one paragraph, wrap the code below in a loop such as
# `for text in texts:`, where `texts` holds the paragraphs.

# NOTE(review): the sample text contains a duplicated "her her"; it is input data
# and is left exactly as written.
text = "Onika Tanya Maraj-Petty (née Maraj; born December 8, 1982), known professionally as Nicki Minaj, is a Trinidadian-born rapper, singer, and songwriter based in the United States. She is known for her her musical versatility, animated flow in her rapping, alter egos and accents. Minaj is regarded as being the most influential female rapper of her generation and has been called the greatest female rapper of the 21st century."
# Split the paragraph into a list of sentence strings.
sentences = nltk.sent_tokenize(text)
# Bare last expression: display the list of sentences.
sentences

['Onika Tanya Maraj-Petty (née Maraj; born December 8, 1982), known professionally as Nicki Minaj, is a Trinidadian-born rapper, singer, and songwriter based in the United States.',
 'She is known for her her musical versatility, animated flow in her rapping, alter egos and accents.',
 'Minaj is regarded as being the most influential female rapper of her generation and has been called the greatest female rapper of the 21st century.']

In [19]:
# Word-tokenize every sentence of the paragraph above and print each token list.
# `sentence` and `words` are rebound on every iteration, so once the loop ends
# `words` holds only the tokens of the final sentence.

for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(words)

['Onika', 'Tanya', 'Maraj-Petty', '(', 'née', 'Maraj', ';', 'born', 'December', '8', ',', '1982', ')', ',', 'known', 'professionally', 'as', 'Nicki', 'Minaj', ',', 'is', 'a', 'Trinidadian-born', 'rapper', ',', 'singer', ',', 'and', 'songwriter', 'based', 'in', 'the', 'United', 'States', '.']
['She', 'is', 'known', 'for', 'her', 'her', 'musical', 'versatility', ',', 'animated', 'flow', 'in', 'her', 'rapping', ',', 'alter', 'egos', 'and', 'accents', '.']
['Minaj', 'is', 'regarded', 'as', 'being', 'the', 'most', 'influential', 'female', 'rapper', 'of', 'her', 'generation', 'and', 'has', 'been', 'called', 'the', 'greatest', 'female', 'rapper', 'of', 'the', '21st', 'century', '.']


In [23]:
# Part-of-speech tag the last sentence of the paragraph.
# The tokens are rebuilt explicitly from `sentences` instead of relying on `words`
# still holding the final iteration's value from the tokenization loop in an
# earlier cell; that hidden dependency breaks as soon as any other cell rebinds
# `words` or the cells are run in a different order.
words = nltk.word_tokenize(sentences[-1])

# nltk.pos_tag returns a list of (token, tag) pairs for the given token list.
tagged = nltk.pos_tag(words)
print(tagged)

[('Minaj', 'NNP'), ('is', 'VBZ'), ('regarded', 'VBN'), ('as', 'IN'), ('being', 'VBG'), ('the', 'DT'), ('most', 'RBS'), ('influential', 'JJ'), ('female', 'JJ'), ('rapper', 'NN'), ('of', 'IN'), ('her', 'PRP$'), ('generation', 'NN'), ('and', 'CC'), ('has', 'VBZ'), ('been', 'VBN'), ('called', 'VBN'), ('the', 'DT'), ('greatest', 'JJS'), ('female', 'JJ'), ('rapper', 'NN'), ('of', 'IN'), ('the', 'DT'), ('21st', 'JJ'), ('century', 'NN'), ('.', '.')]
