## Using NLTK Library

In [1]:
import nltk
import pandas as pd

In [2]:
f = open('./SampleData/sample_text.txt', 'r')
text = f.read()

In [3]:
text[:200]

'artificial intelligence (AI), the ability of a digital computer or computer-controlled robot to perform tasks commonly associated with intelligent beings. The term is frequently applied to the project'

Word based approach

In [4]:
words = nltk.word_tokenize(text)

In [5]:
#Part of speech tagging
pos_tags = nltk.pos_tag(words)
pos_tags[:5]

[('artificial', 'JJ'),
 ('intelligence', 'NN'),
 ('(', '('),
 ('AI', 'NNP'),
 (')', ')')]

In [6]:
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [7]:
#Chunking
chunks = nltk.ne_chunk(pos_tags, binary=False)
entities = []
labels = []
for chunk in chunks:
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())

entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ['Entities', 'labels']
entities_df

Unnamed: 0,Entities,labels
0,AI,GPE
1,AI,ORGANIZATION
2,DataRobot,ORGANIZATION
3,Sphex,GPE


Sentence Based

In [8]:
entities = []
labels = []

sentences = nltk.sent_tokenize(text)
for sent in sentences:
    pos_tags = nltk.pos_tag(nltk.word_tokenize(sent))
    for chunk in nltk.ne_chunk(pos_tags, binary=False):
        if hasattr(chunk, 'label'):
            entities.append(' '.join(c[0] for c in chunk))
            labels.append(chunk.label())
entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ['Entities', 'labels']
entities_df

Unnamed: 0,Entities,labels
0,AI,GPE
1,Inductive,GPE
2,AI,ORGANIZATION
3,DataRobot,ORGANIZATION
4,Research,GPE
5,Artificial,GPE
6,Sphex,GPE


In [9]:
def get_named_entities(text, binary=True):
    entities = []
    labels = []
    sentences = nltk.sent_tokenize(text)
    for sent in sentences:
        pos_tags = nltk.pos_tag(nltk.word_tokenize(sent))
        for chunk in nltk.ne_chunk(pos_tags, binary=binary):
            if hasattr(chunk, 'label'):
                entities.append(' '.join(c[0] for c in chunk))
                labels.append(chunk.label())
    entities_labels = list(set(zip(entities, labels)))
    entities_df = pd.DataFrame(entities_labels)
    entities_df.columns = ['Entities', 'labels']
    return entities_df

In [10]:
text2 = '''Global warming or climate change has today become a major threat to the mankind. The Earth’s temperature is on the rise and there are various reasons for it such as greenhouse gases emanating from carbon dioxide (CO2) emissions, burning of fossil fuels or deforestation. Recent years have been unusually warm, causing worldwide concern. But the fact is that the increase in carbon dioxide actually began in 1800, due to the deforestation of a large chunk of North-eastern American, besides forested parts of the world. The things became worse with emissions in the wake of the industrial revolution, leading to increase in carbon dioxide level by 1900.The sea levels are constantly rising as fresh water marshlands, low-lying cities, and islands have been inundated with seawater
There have been changes in rainfall patterns, leading to droughts and fires in some areas, and flooding in other areas.
Ice caps are constantly melting posing a threat to polar bears as their feeding season stands reduced.
Glaciers are gradually melting.
Animal populations are gradually vanishing as there has been a widespread loss of their habitat.'''

In [11]:
get_named_entities(text2)

Unnamed: 0,Entities,labels
0,Global,NE
1,Earth,NE
2,Animal,NE
3,American,NE
4,CO2,NE


## Using Spacy Library

In [12]:
import spacy

In [16]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     -------------------------------------- 12.8/12.8 MB 139.4 kB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


2022-11-03 15:23:50.293237: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-11-03 15:23:50.293276: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-03 15:23:53.753147: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-11-03 15:23:53.753507: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cublas64_11.dll'; dlerror: cublas64_11.dll not found
2022-11-03 15:23:53.753850: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cublasLt64_11.dll'; dlerror: cublasLt64_11.dll not found
2022-11-03 15:23:53.754203: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cu

In [17]:
nlp = spacy.load('en_core_web_sm')

In [18]:
doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)

df = pd.DataFrame({'Entities': entities, 'Labels':labels, 'Position_Start': position_start, 'Position_End':position_end})
df

Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,(AI),ORG,25,27
1,"(the, 1940s)",DATE,430,439
2,(tasksâ€”as),ORG,525,535
3,(Sphex),PERSON,1400,1405
4,(first),ORDINAL,1478,1483
5,"(a, few, inches)",QUANTITY,1704,1716
6,(AI),GPE,2117,2119
7,(AI),ORG,3609,3611
8,(one),CARDINAL,3969,3972
9,(DataRobot),ORG,4143,4152


In [19]:
def combine_elements(x):
    s = ''
    for e in x:
        s += str(e)+" "
    return s

In [20]:
def get_named_entities(text):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    entities = []
    labels = []
    position_start = []
    position_end = []

    for ent in doc.ents:
        entities.append(ent)
        labels.append(ent.label_)
        position_start.append(ent.start_char)
        position_end.append(ent.end_char)

    df = pd.DataFrame({'Entities': entities, 'Labels':labels, 'Position_Start': position_start, 'Position_End':position_end})
    df['Entities'] = df['Entities'].apply(combine_elements)
    return df

In [21]:
get_named_entities(text2)

Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,today,DATE,37,42
1,Earth,LOC,85,90
2,Recent years,DATE,271,283
3,1800,DATE,407,411
4,North - eastern,LOC,458,471
5,American,NORP,472,480
6,1900.The,CARDINAL,647,655
7,Glaciers,NORP,1003,1011
