In [1]:
import collections
import contextlib
import sys

import nltk
import wikipedia

In [2]:
# Source of the book text: http://english-e-reader.net/book/the-call-of-the-wild-jack-london

In [3]:
# Read the whole book into a single string.
with open('The_Call_of_the_Wild-Jack_London.txt') as book_file:
    text = book_file.read()

In [4]:
# Preparation of the text: line breaks become spaces, while tabs and
# backslashes are removed entirely.
text_clear = text.translate(str.maketrans({'\n': ' ', '\t': None, '\\': None}))

In [5]:
# Display the cleaned text produced by the previous cell.
print(text_clear)

CHAPTER ONE  The Southland  Buck did not read the newspapers. He did not know that there was trouble for every big, strong dog in California. Men found a yellow metal called gold in the Klondike. They needed big, strong dogs with furry coats to work in the north.  Buck lived in a big house in the sunny Santa Clara Valley. The house was called Judge Miller's Place. Around the house there were big gardens, fruit trees and horses.  Buck was born here and now he was four years old. There were other dogs too, but they were not important. Buck was the king! Judge Miller's Place was his. He swam and hunted with the Judge's sons. He walked with the Judge's daughters. He carried the Judge's grandsons on his back and played with them. In the winter Buck sat at the Judge's feet in front of the fire.  Buck's father was a big St Bernard and his mother was a Scottish sheepdog. Buck weighed one hundred and forty pounds arid he was a happy and handsome dog.  In the fall of 1897 men from all over the w

# Part-Of-Speech Tagging

In [6]:
def POS_tagging(text):
    """Tokenize ``text`` and tag every token with its part of speech.

    Tokens written entirely in upper case are lower-cased before tagging;
    all other tokens are passed to the tagger unchanged.

    Args:
        text: The raw text to process.

    Returns:
        The result of ``nltk.pos_tag`` on the normalized token list.
    """
    normalized = [
        token.lower() if token.isupper() else token
        for token in nltk.word_tokenize(text)
    ]
    return nltk.pos_tag(normalized)

# Named Entity Recognition
* NER with entity classification (using nltk.ne_chunk)

In [7]:
def extractEntities(ne_chunked):
    """Collect the named entities found in a chunked sentence tree.

    Only children that are ``nltk.tree.Tree`` instances are considered; every
    other child is skipped.

    Args:
        ne_chunked: An iterable of chunk-tree children, e.g. the result of
            ``nltk.ne_chunk``.

    Returns:
        A dict mapping the entity text (its leaf words joined by single
        spaces) to the subtree label. When the same text occurs more than
        once, the label of the last occurrence is kept.
    """
    return {
        " ".join(word for word, tag in node.leaves()): node.label()
        for node in ne_chunked
        if isinstance(node, nltk.tree.Tree)
    }

* NER with custom patterns

In [8]:
def parser(tagged_tokens, grammar):
    """Chunk POS-tagged tokens with a regexp grammar and return the NP chunks.

    Args:
        tagged_tokens: A list of ``(word, tag)`` pairs. These must already be
            POS-tagged tokens, not raw text.
        grammar: A chunk grammar for ``nltk.RegexpParser``. Only subtrees
            labelled ``NP`` are collected, so the grammar should define an
            ``NP`` rule.

    Returns:
        A list with the text of every ``NP`` chunk (its words joined by single
        spaces), in the order the subtrees are visited. The list is empty when
        nothing matches or when ``tagged_tokens`` is empty.
    """
    chunker = nltk.RegexpParser(grammar)
    chunked = chunker.parse(tagged_tokens)
    # One pass over the subtrees is enough. The previous version wrapped this
    # in a loop over the tree's children and returned from inside it, which
    # yielded None instead of a list for empty input.
    return [
        " ".join(word for word, tag in subtree.leaves())
        for subtree in chunked.subtrees(filter=lambda t: t.label() == 'NP')
    ]

# Wikipedia package.
* Take the first sentence of each summary, keep the part before the first comma, and detect a pattern in it

In [9]:
def wiki_search(entity_parse):
    """Look up entities on Wikipedia and print a short description of each.

    For every entity the first sentence of its Wikipedia summary is fetched
    and cut at the first comma. The remainder is POS-tagged and matched
    against the pattern verb (VBZ) + optional determiner + adjectives + nouns.
    When the pattern matches, ``<entity> - <first match>`` is printed.

    Entities whose title is ambiguous are collected and listed at the end,
    together with the text of the disambiguation error. Entities with no page,
    or whose request times out, are skipped silently.

    Args:
        entity_parse: An iterable of entity names to look up.

    Returns:
        None. All results are printed to stdout.
    """
    gram = r"""NP: {<VBZ><DT>?<JJ>*<NN>*}"""
    # (entity, error) pairs. Keeping the error from the first request means
    # the ambiguous titles do not have to be fetched a second time.
    ambiguous = []

    for ent in entity_parse:
        try:
            summary = wikipedia.summary(ent, sentences=1)
        except wikipedia.exceptions.DisambiguationError as e:
            ambiguous.append((ent, e))
            continue
        except (wikipedia.exceptions.PageError,
                wikipedia.exceptions.HTTPTimeoutError):
            # No such page, or the request timed out: move on to the next one.
            continue

        # partition() keeps the whole sentence when there is no comma;
        # slicing with find() would drop the last character in that case.
        head = summary.partition(',')[0]
        head_tagged = nltk.pos_tag(nltk.word_tokenize(head))
        matches = parser(head_tagged, gram)
        if matches:
            print(ent, ' - ', matches[0], '\n')

    print('Words that have multiple meanings:')
    for ent, error in ambiguous:
        print(ent, ':')
        try:
            print(error)
        except Exception:
            # Best effort: an error text that cannot be printed is skipped.
            continue
        print('\n')

In [10]:

def entity_print(entity, N, all_entity):
    """Print the most frequent entities and, optionally, all distinct ones.

    Args:
        entity: A list of entity strings; repeated items are counted.
        N: How many of the most frequent entities to list. If there are fewer
            than ``N`` distinct entities, all of them are listed.
        all_entity: When true, every distinct entity is also printed in
            sorted order.

    Returns:
        None. Everything is printed to stdout.
    """
    # Counter tallies in a single pass; calling list.count() for every
    # distinct item rescans the whole list each time.
    counts = collections.Counter(entity)
    print(N, " most common entities")
    # most_common(N) returns at most N items, so a short list cannot raise
    # IndexError the way indexing with range(N) did.
    for name, count in counts.most_common(N):
        print(name + ": " + str(count))
    if all_entity:
        print('\n', "All entities: \n")
        for ent in sorted(counts):
            print('\n', ent)
    
    

# Running all tasks and writing the results to files

In [12]:
# Remember the current stdout so it can be put back once the report cells
# below have finished redirecting printed output.
out_old = sys.stdout

# Number of entries shown in each "most common" listing.
N = 10

# POS-tag the cleaned text once; the cells below reuse this result.
tagged = POS_tagging(text_clear)

In [13]:
# Write the part-of-speech report to POS_tagging.txt.
# The `with` block closes the file and puts stdout back when the cell ends,
# even if a statement inside raises; assigning an open file to sys.stdout
# left the file unclosed.
ignore = {'.', ',', "'", '``', ':', "''"}
with open("POS_tagging.txt", "w", encoding='utf8') as report, contextlib.redirect_stdout(report):
    print('Part-Of-Speech Tagging', '\n')

    # Word frequencies, punctuation tokens excluded.
    POS_cout = collections.Counter(row[0] for row in tagged if row[0] not in ignore)
    POS = POS_cout.most_common()
    print(N, ' most common words:\n')
    # Slicing copes with fewer than N distinct words; indexing did not.
    for word, count in POS[:N]:
        print(word + ": " + str(count))

    # Tag frequencies, punctuation tags excluded.
    POS_cout_part_of_sp = collections.Counter(row[1] for row in tagged if row[1] not in ignore)
    POS_part = POS_cout_part_of_sp.most_common()
    print('\n', N, ' most common parts of speech: \n')
    for tag, count in POS_part[:N]:
        print(str(tag) + ": " + str(count))

    print("\n All: \n")
    for ent in tagged:
        print(ent, '\n')

In [14]:
# Write the ne_chunk classification report to a file.
# The `with` block closes the file and puts stdout back when the cell ends,
# even if a statement inside raises.
with open("NER_classification_ne_chank.txt", "w", encoding='utf8') as report, contextlib.redirect_stdout(report):
    print('NER with entity classification (using nltk.ne_chunk)', '\n\n')
    # binary=False keeps the entity classes as subtree labels.
    ne_chunked = nltk.ne_chunk(tagged, binary=False)
    ne_chunked_entity = extractEntities(ne_chunked)
    for nce, label in ne_chunked_entity.items():
        print('\n', nce, ": ", label)

In [15]:
# Chunk "optional adjectives + noun" phrases and write the report to a file.
# The `with` block closes the file and puts stdout back when the cell ends,
# even if a statement inside raises.
grammar = "NP: {<JJ>* <NN|NNS>}"
with open("NER_with_custom_patterns(Adj+Noun).txt", "w", encoding='utf8') as report, contextlib.redirect_stdout(report):
    print('NER with custom patterns. RegexpParser. ')
    # NN/NNS are the common-noun tags (NNP/NNPS would be proper nouns), so the
    # description says "common noun" to match the rule actually applied.
    print('entity matched with an adjective (optional) and common noun (singular/plural)')
    print('The rule: “NP: {<JJ>* <NN|NNS>}”', '\n\n')
    entity_parse = parser(tagged, grammar)
    entity_print(entity_parse, N, True)

In [16]:
# Chunk "verb + to + verb" phrases and write the report to a file.
# The `with` block closes the file and puts stdout back when the cell ends,
# even if a statement inside raises.
grammar = "NP: {<V.*> <TO> <V.*>}"
with open("NER_with_custom_patterns(V_to_V).txt", "w", encoding='utf8') as report, contextlib.redirect_stdout(report):
    print('NER with custom patterns. RegexpParser. ')
    print('Exploring Text Corpora. V to V')
    print('The rule: "NP: {<V.*> <TO> <V.*>}”', '\n\n')
    entity_parse = parser(tagged, grammar)
    entity_print(entity_parse, N, True)

In [22]:
# Look up the entities found by ne_chunk on Wikipedia and write the report.
# The `with` block closes the file and puts stdout back when the cell ends,
# even if a statement inside raises.
# NOTE(review): unlike the other reports, this file name has no ".txt"
# extension - confirm whether that is intended.
with open("Wiki_search_with_ne_chunk", "w", encoding='utf8') as report, contextlib.redirect_stdout(report):
    print('detected entity using nltk.ne_chunk and wiki search')
    print('Words that have multiple meanings at the end', '\n\n')
    ne_chunked = nltk.ne_chunk(tagged, binary=True)
    ne_chunked_entity = extractEntities(ne_chunked)
    # Only the entity names (the dict keys) are needed for the lookup.
    entity_name = list(ne_chunked_entity)
    wiki_search(entity_name)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [21]:
# Look up the entities found by the NNP pattern on Wikipedia and write the
# report. The `with` block closes the file and puts stdout back when the cell
# ends, even if a statement inside raises.
grammar = "NP: {<NNP>*}"
with open("Wiki_search_with_pattern.txt", "w", encoding='utf8') as report, contextlib.redirect_stdout(report):
    print('detected entity using pattern "NP: {<NNP>*}" and wiki search')
    print('Words that have multiple meanings at the end', '\n\n')
    entity_parse = parser(tagged, grammar)
    entity_print(entity_parse, N, False)
    # Look every distinct entity up once, in sorted order.
    entity_parse = sorted(set(entity_parse))
    wiki_search(entity_parse)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [23]:
# Put the saved stdout back. If stdout is still redirected to a report file,
# close that file first so it does not stay open.
if sys.stdout is not out_old:
    sys.stdout.close()
sys.stdout = out_old