## Syntax Tree

In [2]:
import os
path_to_gs = "C:\\Program Files\\gs\\gs9.50\\bin"

In [2]:
os.environ['PATH']+=os.pathsep + path_to_gs

In [4]:
# os.environ['PATH']

# Chunking

In [7]:
import nltk
from nltk import word_tokenize

sent = "The little mouse ate the fresh cheeze."

In [2]:
sent_tokens = nltk.pos_tag(word_tokenize(sent))

In [3]:
sent_tokens

[('The', 'DT'),
 ('little', 'JJ'),
 ('mouse', 'NN'),
 ('ate', 'VB'),
 ('the', 'DT'),
 ('fresh', 'JJ'),
 ('cheeze', 'NN'),
 ('.', '.')]

In [4]:
grammar_np = r"NP:{<DT>?<JJ>*<NN>}"

In [5]:
chunk_parser = nltk.RegexpParser(grammar_np)

In [6]:
chunk_result = chunk_parser.parse(sent_tokens)

In [7]:
chunk_result.draw()

In [8]:
[subtree for subtree in chunk_result.subtrees(filter=lambda t: t.label().endswith("NP"))]

[Tree('NP', [('The', 'DT'), ('little', 'JJ'), ('mouse', 'NN')]),
 Tree('NP', [('the', 'DT'), ('fresh', 'JJ'), ('cheeze', 'NN')])]

In [9]:
sent = "She is wearing a beautiful dress."
sent_tokens = nltk.pos_tag(word_tokenize(sent))

chunk_result = chunk_parser.parse(sent_tokens)

chunk_result.draw()

In [27]:
grammar_np = r"JJ: {<JJ|JJR|JJS>*<NN>}"

In [28]:
chunk_parser = nltk.RegexpParser(grammar_np)

In [32]:
sent = "whatup dude, whats cooking"
# sent = "She is wearing a beautiful dress."
sent_tokens = nltk.pos_tag(word_tokenize(sent))

chunk_result = chunk_parser.parse(sent_tokens)

chunk_result.draw()

In [13]:
sent_tokens

[('She', 'PRP'),
 ('is', 'VBZ'),
 ('walking', 'VBG'),
 ('quickly', 'RB'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('mall', 'NN')]

In [15]:
grammar_np = r"VP: {<PRP>?<VB|VBD|VBZ|VBG>}"
chunk_parser = nltk.RegexpParser(grammar_np)


sent = "He drives fast on highways"
sent_tokens = nltk.pos_tag(word_tokenize(sent))

chunk_result = chunk_parser.parse(sent_tokens)

chunk_result.draw()

In [14]:
chunk_result.leaves()

[('He', 'PRP'),
 ('drives', 'VBZ'),
 ('fast', 'RB'),
 ('on', 'IN'),
 ('highways', 'NNS')]

In [55]:
chunk_result.subtrees

<generator object Tree.subtrees at 0x000001E8030ECB48>

In [17]:
[subtree for subtree in chunk_result.subtrees(filter=lambda t: t.label().endswith("VP"))]

[Tree('VP', [('He', 'PRP'), ('drives', 'VBZ')])]

### Chinking
Helps us define, what we want to exclude from a chunk

In [33]:
chink_grammar = r"""
    chk_name: #chunk name
    {<PRP>?<VB|VBD|VBZ|VBG>*<RB|RBR>?} #chunk regex sequence
    }<RB>+ { #chink regex sequence - adverb
"""

In [35]:
sent = "He drives fast on highways"
sent_tokens = nltk.pos_tag(word_tokenize(sent))

chink_parser = nltk.RegexpParser(chink_grammar)
chink_parser.parse(sent_tokens).draw()

#### Context Free Grammar (CFG)

In [42]:
from nltk.parse.generate import generate

CFG_Grammer = nltk.CFG.fromstring("""
S -> NP VP
VP -> V JJ N
V -> "taking"|"eating"
JJ -> "heavy"
NP -> "John"|"Jim"|"Carry"
N -> "Advil"
""")

In [43]:
for sentence in generate(CFG_Grammer):
    print (' '.join(sentence))

John taking heavy Advil
John eating heavy Advil
Jim taking heavy Advil
Jim eating heavy Advil
Carry taking heavy Advil
Carry eating heavy Advil


In [44]:
def cfg_parse(sentence):
    sent_tk = nltk.pos_tag(word_tokenize(sentence))
    for one in sent_tk:
        if one[1] == 'NNP':
            s_NP = "\'" + one[0] + "\'"
        if one[1] == 'VBD' or one[1] == 'VBN':
            s_V = "\'" + one[0] + "\'"
        if one[1] == 'NN':
            s_N = "\'" + one[0] + "\'"
        else:
            pass
    cfg_grammer2 = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V N
    V -> {}
    NP -> {}
    N -> {}
    """.format(s_V,s_NP,s_N))
    for sentence in generate(cfg_grammer2):
        print(" ".join(sentence))
    return

In [46]:
cfg_parse("John saw a long white boat")

John saw boat


In [47]:
cfg_parse("John saw a cat")

John saw cat


In [48]:
cfg_parse("A cat was seen by John")

John seen cat


In [49]:
cfg_parse("A Cat was seen by john")

Cat seen john


In [50]:
nltk.pos_tag(word_tokenize("A Cat was seen by john"))

[('A', 'DT'),
 ('Cat', 'NNP'),
 ('was', 'VBD'),
 ('seen', 'VBN'),
 ('by', 'IN'),
 ('john', 'NN')]

### Extractive text Summerization

In [1]:
#importing libraries
# !pip install beautifulsoup4
# !pip install urllib3

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import bs4 as BeautifulSoup
import urllib.request  

In [2]:
#fetching the content from the URL
fetched_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/20th_century')

article_read = fetched_data.read()

#parsing the URL content and storing in a variable
article_parsed = BeautifulSoup.BeautifulSoup(article_read,'html.parser')

#returning <p> tags
paragraphs = article_parsed.find_all('p')

article_content = ''

#looping through the paragraphs and adding them to the variable
for p in paragraphs:  
    article_content += p.text


In [8]:
len(word_tokenize(article_content))

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Dhaval Simaria/nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\share\\nltk_data'
    - 'C:\\ProgramData\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Dhaval Simaria\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [11]:
def _create_dictionary_table(text_string) -> dict:
   
    #removing stop words
    stop_words = set(stopwords.words("english"))
    
    words = word_tokenize(text_string)
    
    #reducing words to their root form
    stem = PorterStemmer()
    
    #creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
        wd = stem.stem(wd)
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            frequency_table[wd] = 1

    return frequency_table

In [19]:
freqtable = _create_dictionary_table(article_content)
freqtable

{'20th': 19,
 '(': 11,
 'twentieth': 1,
 ')': 11,
 'centuri': 43,
 'wa': 31,
 'began': 5,
 'januari': 2,
 '1': 4,
 ',': 284,
 '1901': 2,
 '[': 37,
 ']': 37,
 'end': 13,
 'decemb': 2,
 '31': 3,
 '2000': 1,
 '.': 120,
 '2': 2,
 'It': 7,
 'tenth': 1,
 'final': 2,
 '2nd': 1,
 'millennium': 1,
 'distinct': 1,
 'known': 2,
 '1900': 2,
 '1999': 2,
 'domin': 3,
 'chain': 1,
 'event': 1,
 'herald': 1,
 'signific': 4,
 'chang': 7,
 'world': 50,
 'histori': 6,
 'redefin': 1,
 'era': 1,
 ':': 4,
 'flu': 1,
 'pandem': 1,
 'war': 41,
 'I': 4,
 'II': 8,
 'nuclear': 10,
 'power': 10,
 'space': 4,
 'explor': 3,
 'nation': 19,
 'decolon': 3,
 'cold': 5,
 'post-cold': 1,
 'conflict': 8,
 ';': 11,
 'intergovernment': 1,
 'organ': 3,
 'cultur': 7,
 'homogen': 2,
 'develop': 15,
 'emerg': 3,
 'transport': 4,
 'commun': 10,
 'technolog': 20,
 'poverti': 2,
 'reduct': 1,
 'popul': 11,
 'growth': 2,
 'awar': 2,
 'environment': 4,
 'degrad': 1,
 'ecolog': 2,
 'extinct': 2,
 '3': 1,
 '4': 1,
 'birth': 1,
 'digit

In [20]:
sentences = sent_tokenize(article_content)
sentences

['The 20th (twentieth) century was a century that began on\nJanuary 1, 1901[1] and ended on December 31, 2000.',
 '[2] It was the tenth and final century of the 2nd millennium.',
 'It is distinct from the century known as the 1900s which began on January 1, 1900, and ended on December 31, 1999.',
 'The 20th century was dominated by a chain of events that heralded significant changes in world history as to redefine the era: flu pandemic, World War I and World War II, nuclear power and space exploration, nationalism and decolonization, the Cold War and post-Cold War conflicts; intergovernmental organizations and cultural homogenization through developments in emerging transportation and communications technology; poverty reduction and world population growth, awareness of environmental degradation, ecological extinction;[3][4] and the birth of the Digital Revolution, enabled by the wide adoption of MOS transistors and integrated circuits.',
 'It saw great advances in communication and me

In [35]:
def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

    #algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
#       sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount = 0
        for word in frequency_table:
            if word in sentence.lower():
                sentence_wordcount += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_table[word]
                else:
                    sentence_weight[sentence[:7]] = frequency_table[word]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] / sentence_wordcount

    return sentence_weight

In [36]:
sentscore = _calculate_sentence_scores(sentences, freqtable)
sentscore

{'The 20t': 14.548890200708383,
 '[2] It ': 25.77777777777778,
 'It is d': 31.5,
 'It saw ': 9.923076923076923,
 'The ave': 13.108695652173912,
 '[5]\nThe': 23.161290322580644,
 'The Mar': 10.0,
 'For dec': 16.9375,
 'Open wa': 17.43243243243243,
 'It took': 18.742857142857144,
 '[8] Glo': 22.0,
 'Up unti': 15.157894736842104,
 '[9]\nThe': 23.1875,
 'Nationa': 17.424242424242426,
 'The cen': 26.35,
 'Terms l': 36.733333333333334,
 'Scienti': 22.076923076923077,
 'It was ': 20.583333333333332,
 'Horses,': 20.782608695652176,
 'These d': 17.59259259259259,
 'Humans ': 37.083333333333336,
 'Mass me': 19.5,
 'Advance': 11.05,
 'Rapid t': 28.473684210526315,
 'World W': 26.391304347826086,
 'However': 44.90909090909091,
 'For the': 21.82758620689655,
 'The las': 22.727272727272727,
 '[10]\nTh': 19.692307692307693,
 'Technol': 22.185185185185187,
 'After m': 15.333333333333334,
 'In addi': 17.296296296296298,
 'The Aus': 22.7,
 'The Rus': 14.8,
 'The vic': 41.46153846153846,
 'At the ': 33.3

In [6]:
def _calculate_average_score(sentence_weight) -> int:
   
    #calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    #getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score

In [37]:
threshold = _calculate_average_score(sentscore)
threshold

22.14859472423448

In [41]:
def _get_article_summary(sentences, sentence_weight, threshold):

    article_summary = ''

    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence

    return article_summary

In [8]:
def _run_article_summary(article):
    
    #creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)

    #tokenizing the sentences
    sentences = sent_tokenize(article)

    #algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    #getting the threshold
    threshold = _calculate_average_score(sentence_scores)

    #producing the summary
    article_summary = _get_article_summary(sentences, sentence_scores, 1.5 * threshold)

    return article_summary



In [39]:
summary_results = _run_article_summary(article_content)

In [40]:
print(summary_results)

 Terms like ideology, world war, genocide, and nuclear war entered common usage. Humans explored space for the first time, taking their first footsteps on the Moon. However, these same wars resulted in the destruction of the imperial system. The victorious Bolsheviks then established the Soviet Union, the world's first communist state. At the beginning of the period, the British Empire was the world's most powerful nation,[12] having acted as the world's policeman for the past century. In total, World War II left some 60 million people dead. At the beginning of the century, strong discrimination based on race and sex was significant in general society. During the century, the social taboo of sexism fell. Communications and information technology, transportation technology, and medical advances had radically altered daily lives. Since the US was in a dominant position, a major part of the process was Americanization. Terrorism, dictatorship, and the spread of nuclear weapons were pressi