In [1]:
import nltk
from nltk.tag import StanfordNERTagger

In [2]:
# Sample English sentences used as input for the NER comparison: the loops
# further down pass each one to from_sentence_to_ne, once per tagging method.
sentences = [
    'Facebook is an American for-profit corporation based in Menlo Park, California.',
    'The Facebook website was launched on February 4, 2004, by Mark Zuckerberg, along with fellow Harvard College students and roommates, Eduardo Saverin, Andrew McCollum, Dustin Moskovitz, and Chris Hughes.',
    'Apple Inc. is an American multinational technology company headquartered in Cupertino, California.',
    'Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976 to develop and sell personal computers',
    'Microsoft was founded by Paul Allen and Bill Gates on April 4, 1975, to develop and sell BASIC interpreters for the Altair 8800.',
    'Saul Hudson better known by his stage name Slash, is a British-American musician and songwriter. He is best known as the lead guitarist of the American rock band Guns N\' Roses.',
    'England is a country that is part of the United Kingdom. It shares land borders with Scotland to the north and Wales to the west.',
    'Mario Draghi is an Italian economist who has served as President of the European Central Bank since November 2011.',
    'Italy is a unitary parliamentary republic in Europe. Italy shares open land borders with France, Switzerland, Austria, Slovenia, San Marino and Vatican City.',
    'Tesla is an American automaker, energy storage company, and solar panel manufacturer based in Palo Alto.',
    'Elon Reeve Musk is a South African-born Canadian-American business magnate, investor, engineer, and inventor.',
    'Musk has stated that the goals of SolarCity, Tesla, and SpaceX revolve around his vision to change the world and humanity.',
    'Steven Paul Jobs was an American entrepreneur, businessman, inventor, and industrial designer.',
    'As Sinatra said "The best revenge is massive success"!',
    'Frank Underwood is the main character of House of Cards',
    'Kevin Spacey is my favorite actor'
]

In [3]:
import os

# NLTK's Java-backed tools consult the JAVAHOME environment variable to find
# the Java binary. Respect a value that is already exported so the notebook
# runs on machines whose JDK lives elsewhere; the Windows path below is only
# the fallback used when nothing is configured.
java_path = os.environ.get(
    'JAVAHOME',
    "C:\\Program Files\\Java\\jdk1.8.0_25\\bin\\java.exe",
)
os.environ['JAVAHOME'] = java_path

In [4]:
# Module-level Stanford NER tagger used by from_sentence_to_ne for every
# method other than 'nltk'. Both the classifier model and the jar are looked
# up under ./stanford relative to the current working directory.
st = StanfordNERTagger(
    './stanford/classifiers/english.all.3class.distsim.crf.ser.gz',
    path_to_jar='./stanford/stanford-ner.jar',
    encoding='latin1',
)

In [5]:
def split2(x, by=2):
    """Split a sequence into consecutive tuples of ``by`` items.

    Parameters
    ----------
    x : sequence
        Any sliceable sequence with a length (list, tuple, str, ...).
    by : int, default 2
        Number of items per chunk. Must be at least 1.

    Returns
    -------
    list of tuple
        Chunks in original order. The last tuple is shorter when ``len(x)``
        is not a multiple of ``by``; an empty input gives an empty list.

    Raises
    ------
    ValueError
        If ``by`` is smaller than 1.
    """
    if by < 1:
        raise ValueError("by must be a positive integer")
    # Step through the start index of each chunk instead of testing every
    # index, and honour ``by`` for both the step and the slice width.
    return [tuple(x[start:start + by]) for start in range(0, len(x), by)]

def from_sentence_to_ne(x, method='nltk'):
    """Label every token of a sentence with a named-entity tag.

    Parameters
    ----------
    x : str
        Sentence to analyse; it is tokenised with ``nltk.word_tokenize``.
    method : str, default 'nltk'
        ``'nltk'`` runs NLTK's POS tagger followed by ``nltk.ne_chunk``.
        Any other value sends the tokens to the module-level Stanford
        tagger ``st``.

    Returns
    -------
    list of tuple
        ``(token, label)`` pairs in sentence order. For ``'nltk'`` the label
        is the chunk column of the CoNLL conversion of the chunk tree;
        otherwise the result is whatever ``st.tag`` returns.
    """
    token = nltk.word_tokenize(x)
    if method == 'nltk':
        tag = nltk.pos_tag(token)
        ne = nltk.ne_chunk(tag)
        # tree2conlltags yields (word, pos, chunk) triples directly, so the
        # POS column can be dropped without flattening the CoNLL string and
        # deleting list items while iterating over stale indices.
        out = [
            (word, chunk)
            for word, _pos, chunk in nltk.chunk.tree2conlltags(ne)
        ]
    else:
        out = st.tag(token)
    return out

In [6]:
%%time
# Run the NLTK-based extraction over every sample sentence and print the
# sentence followed by the tokens that received an entity label.
method = 'nltk'
for s in sentences:
    app = from_sentence_to_ne(s, method=method)
    print(s)
    print('\n')
    for i in app:
        # Tokens labelled 'O' are not reported.
        if i[1] == 'O':
            continue
        print(i)
    print("\n\n")

Facebook is an American for-profit corporation based in Menlo Park, California.


('Facebook', 'B-GPE')
('American', 'B-GPE')
('Menlo', 'B-GPE')
('Park', 'I-GPE')
('California', 'B-GPE')



The Facebook website was launched on February 4, 2004, by Mark Zuckerberg, along with fellow Harvard College students and roommates, Eduardo Saverin, Andrew McCollum, Dustin Moskovitz, and Chris Hughes.


('Facebook', 'B-ORGANIZATION')
('Mark', 'B-PERSON')
('Zuckerberg', 'I-PERSON')
('Harvard', 'B-ORGANIZATION')
('College', 'I-ORGANIZATION')
('Eduardo', 'B-PERSON')
('Saverin', 'I-PERSON')
('Andrew', 'B-PERSON')
('McCollum', 'I-PERSON')
('Dustin', 'B-PERSON')
('Moskovitz', 'I-PERSON')
('Chris', 'B-PERSON')
('Hughes', 'I-PERSON')



Apple Inc. is an American multinational technology company headquartered in Cupertino, California.


('Apple', 'B-PERSON')
('Inc.', 'B-ORGANIZATION')
('American', 'B-GPE')
('Cupertino', 'B-GPE')
('California', 'B-GPE')



Apple was founded by Steve Jobs, Steve Wozniak, and

In [7]:
%%time
# Same report as the NLTK run, but the tokens go through the Stanford
# tagger (any method other than 'nltk' selects it).
method = 'stanford'
for s in sentences:
    app = from_sentence_to_ne(s, method=method)
    print(s)
    print('\n')
    for i in app:
        # Tokens labelled 'O' are not reported.
        if i[1] == 'O':
            continue
        print(i)
    print("\n\n")

Facebook is an American for-profit corporation based in Menlo Park, California.


('Facebook', 'ORGANIZATION')
('Menlo', 'LOCATION')
('Park', 'LOCATION')
('California', 'LOCATION')



The Facebook website was launched on February 4, 2004, by Mark Zuckerberg, along with fellow Harvard College students and roommates, Eduardo Saverin, Andrew McCollum, Dustin Moskovitz, and Chris Hughes.


('Facebook', 'ORGANIZATION')
('Mark', 'PERSON')
('Zuckerberg', 'PERSON')
('Harvard', 'ORGANIZATION')
('College', 'ORGANIZATION')
('Eduardo', 'PERSON')
('Saverin', 'PERSON')
('Andrew', 'PERSON')
('McCollum', 'PERSON')
('Dustin', 'PERSON')
('Moskovitz', 'PERSON')
('Chris', 'PERSON')
('Hughes', 'PERSON')



Apple Inc. is an American multinational technology company headquartered in Cupertino, California.


('Apple', 'ORGANIZATION')
('Inc.', 'ORGANIZATION')
('Cupertino', 'LOCATION')
('California', 'LOCATION')



Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976 to develop and sel