# NER using NLTK

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
 # Download the NLTK data packages this notebook relies on
 # (tokenizer, POS tagger, named-entity chunker and its word list).
 import nltk
 nltk.download('punkt')
 nltk.download('averaged_perceptron_tagger')
 nltk.download('maxent_ne_chunker')
 nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# Path of the text file to analyse (despite its name, DATA_DIR points at a single file).
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/TextMining/lab07/sampleText2.txt'
# errors='ignore' silently drops any bytes that are not valid UTF-8.
with open(DATA_DIR, 'r', encoding='utf-8', errors='ignore') as f:
    sample = f.read()
# Small inline alternatives for quick experiments:
#sample = "John drove to a town in Auckland yesterday to buy a Toyota for $5000."
#sample = "John drove town"

# Pipeline: raw text -> sentences -> word tokens -> POS tags -> NE chunk trees.
sentences = nltk.sent_tokenize(sample)
print("sentences",sentences)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
print("tokenized_sentences",tokenized_sentences)
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
print("tagged_sentences:",tagged_sentences)
# ne_chunk_sents may hand back a lazy, single-use iterator; materialise it so the
# print below shows the actual trees and the result can be iterated more than once.
# binary=True collapses all entity types into the single label 'NE'.
chunked_sentences = list(nltk.ne_chunk_sents(tagged_sentences, binary=True))
print("chunked_sentences:",chunked_sentences)

def extract_entity_names(t):
    """Collect the text of every 'NE' subtree reachable from ``t``.

    A node is treated as a tree when it exposes a truthy ``label`` attribute.
    A tree whose ``label()`` is ``'NE'`` contributes one string: the first
    item of each of its children, joined by single spaces. Any other tree is
    searched recursively through its children. A node without ``label``
    (for example a ``(word, tag)`` tuple) contributes nothing.

    Returns:
        list[str]: entity strings in traversal order (possibly empty).
    """
    # Leaves (and anything else without a usable label) hold no entities.
    if not getattr(t, 'label', None):
        return []

    # An NE node is itself one entity; its children are not descended into.
    if t.label() == 'NE':
        return [' '.join(node[0] for node in t)]

    found = []
    for node in t:
        found += extract_entity_names(node)
    return found

# Flatten the entities found in each chunked sentence into one list.
entity_names = [
    name
    for sentence_tree in chunked_sentences
    for name in extract_entity_names(sentence_tree)
]

# Print all entity names
print(entity_names)

# Print unique entity names, then the total and the distinct counts
unique_entity_names = set(entity_names)
print(unique_entity_names)
print(len(entity_names))
print(len(unique_entity_names))

sentences ['Covid 19 Delta outbreak: Alert level review on way with new vaccine passport - Deputy Prime Minister Grant Robertson.', 'September 27 2021 There are 12 new cases of Covid-19 in the community today.', 'The seven-day rolling average of Covid cases in NZ is now 15, compared with 17 last week and 19 the week before.', 'The Government is reviewing its alert-level system - and broader freedoms - as more Kiwis get vaccinated and with a new vaccine passport on the way.', 'Deputy Prime Minister Grant Robertson said the Government was updating its work around the alert-level framework on the basis it had seen a terrific response to vaccination.', '"We want to push on 90 per cent plus, keep moving and open up a whole range of options for ourselves in the face of Delta," Robertson told The AM Show today.', "In early November there should be a vaccine passport that could be downloaded on people's phones.", 'The app was being managed by the Ministry of Health which was working with priva

In [None]:
# Run the (non-binary) NE chunker sentence by sentence and print every
# labelled subtree as "<LABEL> <entity words>".
for sentence_text in nltk.sent_tokenize(sample):
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence_text))
    for node in nltk.ne_chunk(tagged_tokens):
        # Nodes without a label attribute are skipped (no entity to report).
        if not hasattr(node, 'label'):
            continue
        entity_words = [leaf[0] for leaf in node]
        print(node.label(), ' '.join(entity_words))

PERSON Alert
PERSON Grant Robertson
GPE Covid
GPE NZ
PERSON Grant Robertson
GPE Delta
PERSON Robertson
ORGANIZATION AM Show
ORGANIZATION Ministry
GPE Health
PERSON Work
PERSON Act
ORGANIZATION Party
PERSON David Seymour
ORGANIZATION Newstalk
PERSON Mike Hosking
PERSON Delta
GPE New Zealand
ORGANIZATION MIQ
PERSON Robertson
GPE Covid
GPE Auckland
PERSON Delta
ORGANIZATION TVNZ
PERSON Robertson
GPE New Zealand
GPE New
PERSON Zealand
PERSON Robertson
PERSON Kiwis
PERSON Māori
PERSON Robertson
ORGANIZATION MIQ
GPE Auckland
PERSON Jacinda Ardern
ORGANIZATION MIQ
PERSON Ardern
ORGANIZATION Auckland Business
PERSON Michael Barnett
GPE New Zealand
ORGANIZATION MIQ
PERSON Barnett
GPE Auckland
ORGANIZATION Delta
GPE New Zealand
PERSON Covid
PERSON Epidemiologist
PERSON Michael Baker
GPE Australia
GPE Taiwan
GPE Hong Kong
GPE China
GPE New Zealand
PERSON Kiwis
PERSON Pensioner
PERSON Sue Hotu
ORGANIZATION TVNZ
GPE New Zealand
ORGANIZATION Gold Coast
ORGANIZATION MIQ
PERSON Hotu
PERSON Hotu
ORGANI

# Using the spaCy Library

In [None]:
import csv
import io
from os import forkpty  # NOTE(review): unused in this cell; looks like an accidental auto-import -- confirm and remove
# # Using Spacy NER
import spacy  # Install spacy if required.

nlp = spacy.load('en_core_web_sm')  # Install en_core_web_sm if required.
doc = nlp(sample)

# One line per entity: text, character offsets, label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
#Grant Robertson 101 116 PERSON
#12 146 148 CARDINAL
#today 188 193 DATE
#seven-day 199 208 DATE


print("######################################################")
# Build one CSV row per token. csv.writer quotes fields that contain a comma,
# a quote or a newline (e.g. the ',' token or whitespace tokens), which plain
# string concatenation would turn into broken rows.
# Columns: text, lemma, pos, tag, dep, shape, is_alpha, is_stop, ent_type
token_rows = io.StringIO()
token_writer = csv.writer(token_rows, lineterminator='\n')
for token in doc:
    token_writer.writerow([
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop, token.ent_type_,
    ])
allTokens = token_rows.getvalue()

# The with-block guarantees the file is flushed and closed; newline='' keeps
# the '\n' row terminators exactly as written.
with open('/content/drive/MyDrive/Colab Notebooks/TextMining/lab07/deptokens.csv',
          'w', encoding='utf-8', newline='') as file1:
    file1.write(allTokens)
print(allTokens)

#today,today,NOUN,NN,npadvmod,xxxx,True,False

# filter doc.ents where ent.label_ == 'DATE'

'''
early October 8847 8860 DATE
Ardern 8862 8868 ORG
Vanuatu 8902 8909 GPE
Samoa 8911 8916 ORG
Tonga 8921 8926 GPE
at least one 8938 8950 CARDINAL
Ardern 8965 8971 ORG
RSE 9059 9062 ORG
seven days 9101 9111 DATE
######################################################
Covid,Covid,PROPN,NNP,nmod,Xxxxx,True,False
19,19,NUM,CD,nummod,dd,False,False
Delta,Delta,PROPN,NNP,dobj,Xxxxx,True,False
outbreak,outbreak,NOUN,NN,ROOT,xxxx,True,False
:,:,PUNCT,:,punct,:,False,False
Alert,alert,ADJ,JJ,amod,Xxxxx,True,False
level,level,NOUN,NN,compound,xxxx,True,False
review,review,NOUN,NN,appos,xxxx,True,False
'''



Covid 0 5 PERSON
19 6 8 CARDINAL
Delta 9 14 LOC
Grant Robertson 101 116 PERSON
September 27 2021 118 135 DATE
12 146 148 CARDINAL
today 188 193 DATE
seven-day 199 208 DATE
Covid 228 233 PERSON
NZ 243 245 GPE
15 253 255 DATE
17 last week 271 283 DATE
19 the week before 288 306 DATE
Kiwis 392 397 NORP
Grant Robertson 479 494 PERSON
90 per cent 651 662 MONEY
Delta 747 752 ORG
Robertson 755 764 PERSON
today 782 787 DATE
early November 792 806 DATE
the Ministry of Health 916 938 ORG
the next few weeks 1158 1176 DATE
Act Party 1178 1187 ORG
David Seymour 1195 1208 PERSON
Newstalk ZB's 1214 1227 PERSON
Mike Hosking 1228 1240 PERSON
Covid 1275 1280 PERSON
today 1298 1303 DATE
Delta 1427 1432 ORG
New Zealand 1511 1522 GPE
Robertson 1594 1603 ORG
today 1609 1614 DATE
Covid 1686 1691 PERSON
Auckland 1704 1712 GPE
Delta 1885 1890 ORG
TVNZ 1940 1944 ORG
Breakfast 1947 1956 LOC
Robertson 1958 1967 ORG
New Zealand 2118 2129 GPE
New Zealand 2161 2172 GPE
Robertson 2239 2248 ORG
Kiwis 2315 2320 GPE
Māo

'\nearly October 8847 8860 DATE\nArdern 8862 8868 ORG\nVanuatu 8902 8909 GPE\nSamoa 8911 8916 ORG\nTonga 8921 8926 GPE\nat least one 8938 8950 CARDINAL\nArdern 8965 8971 ORG\nRSE 9059 9062 ORG\nseven days 9101 9111 DATE\n######################################################\nCovid,Covid,PROPN,NNP,nmod,Xxxxx,True,False\n19,19,NUM,CD,nummod,dd,False,False\nDelta,Delta,PROPN,NNP,dobj,Xxxxx,True,False\noutbreak,outbreak,NOUN,NN,ROOT,xxxx,True,False\n:,:,PUNCT,:,punct,:,False,False\nAlert,alert,ADJ,JJ,amod,Xxxxx,True,False\nlevel,level,NOUN,NN,compound,xxxx,True,False\nreview,review,NOUN,NN,appos,xxxx,True,False\n'

In [None]:
!python --version

Python 3.10.12


In [None]:

from datetime import datetime, timedelta
import re

# Lower-case English month names mapped to their calendar month number.
_MONTH_NUMBERS = {
    "january": 1, "february": 2, "march": 3, "april": 4,
    "may": 5, "june": 6, "july": 7, "august": 8,
    "september": 9, "october": 10, "november": 11, "december": 12
}

# Spelled-out quantities accepted in phrases such as "the past six months".
_WORD_NUMBERS = {
    'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18,
    'nineteen': 19, 'twenty': 20, 'thirty': 30, 'forty': 40,
    'fifty': 50, 'sixty': 60, 'seventy': 70, 'eighty': 80,
    'ninety': 90, 'hundred': 100, 'thousand': 1000,
    'million': 1000000, 'billion': 1000000000,
}

_MONTH_ALTERNATION = "|".join(_MONTH_NUMBERS)
# 'september 27 2021'
_FULL_DATE_RE = re.compile(r"^(" + _MONTH_ALTERNATION + r")\s+\d{1,2}\s+\d{4}$")
# '19 the week before'
_WEEK_BEFORE_RE = re.compile(r"^(\d{1,2})\s(the week before)$")
# 'early november'
_EARLY_MONTH_RE = re.compile(r"^(early)\s(" + _MONTH_ALTERNATION + r")$")
# 'the past six months', 'the past seven weeks', 'the end of october'
_RELATIVE_RE = re.compile(r'the past (\w+) (months|weeks)|the end of (\w+)')
# '15'
_DAY_ONLY_RE = re.compile(r"^\d{1,2}$")


def _fmt_day(moment):
    """Format a date/datetime as 'YYYY-MM-DD'."""
    return moment.strftime('%Y-%m-%d')


def _fmt_span(start, end):
    """Format a date range as 'YYYY-MM-DD TO YYYY-MM-DD'."""
    return f"{_fmt_day(start)} TO {_fmt_day(end)}"


def _month_span(year, month):
    """Return the first-day-to-last-day range string for one calendar month."""
    first_day = datetime(year, month, 1)
    # Day 28 exists in every month, and 28 + 4 days always lands in the next
    # month; stepping back by that day-of-month gives the last day of `month`.
    into_next_month = first_day.replace(day=28) + timedelta(days=4)
    last_day = into_next_month - timedelta(days=into_next_month.day)
    return _fmt_span(first_day, last_day)


def replace_date(ent, today=None):
    """Convert a DATE entity string into an ISO date or date range.

    The text is lower-cased and matched against a fixed set of phrases and
    patterns. Relative expressions are resolved against ``today``.

    Recognised inputs (case-insensitive):
      * 'today', 'yesterday'                     -> single date
      * 'next year'                              -> Jan 1 to Dec 31 of next year
      * anything containing 'last week',
        'seven-day', 'seven days'                -> the 7 days up to ``today``
      * 'about six weeks'                        -> the 6 weeks up to ``today``
      * 'the next few weeks'                     -> 1 to 3 weeks after ``today``
      * 'the coming days'                        -> ``today`` to 4 days later
      * '<month>'                                -> that whole month, current year
      * '<month> <day> <year>'                   -> that exact date
      * '<day> the week before'                  -> that day-of-month in the month
                                                    that contains (today - 2 weeks)
      * 'early <month>'                          -> first 5 days of that month
      * 'the past <n> months|weeks'              -> n*30 days / n weeks up to ``today``
                                                    (n as a word or as digits)
      * 'the end of <month>'                     -> that whole month, current year
      * '<day>' (1-2 digits)                     -> that day in the current month

    Args:
        ent: The entity text to convert.
        today: Optional reference ``datetime`` (a ``date`` also works) used for
            relative expressions. Defaults to the current local date/time.

    Returns:
        str: 'YYYY-MM-DD' for a single day, 'YYYY-MM-DD TO YYYY-MM-DD' for a
        range, or the lower-cased input when it cannot be converted (unknown
        phrase, unknown quantity/month word, or an impossible calendar date).
    """
    lower_name = ent.lower()
    # A single reference point keeps every branch consistent with the others.
    if today is None:
        today = datetime.today()

    if lower_name == 'today':
        return _fmt_day(today)

    if lower_name == 'yesterday':
        return _fmt_day(today - timedelta(days=1))

    if lower_name == 'next year':
        return _fmt_span(datetime(today.year + 1, 1, 1),
                         datetime(today.year + 1, 12, 31))

    # '17 last week', 'last week', 'seven-day', 'seven days':
    # all map to the window from 7 days ago up to today.
    if 'last week' in lower_name or lower_name in ('seven-day', 'seven days'):
        return _fmt_span(today - timedelta(days=7), today)

    if lower_name == 'about six weeks':
        return _fmt_span(today - timedelta(weeks=6), today)

    if lower_name == 'the next few weeks':
        return _fmt_span(today + timedelta(weeks=1), today + timedelta(weeks=3))

    if lower_name == 'the coming days':
        return _fmt_span(today, today + timedelta(days=4))

    # Bare month name -> the whole month in the reference year.
    if lower_name in _MONTH_NUMBERS:
        return _month_span(today.year, _MONTH_NUMBERS[lower_name])

    # 'september 27 2021'
    if _FULL_DATE_RE.match(lower_name):
        try:
            # NOTE(review): %B matches month names of the active locale;
            # presumably an English/C locale -- confirm for other setups.
            return _fmt_day(datetime.strptime(lower_name, "%B %d %Y"))
        except ValueError:
            # e.g. 'february 30 2021' is not a real date.
            return lower_name

    # '19 the week before'
    week_before = _WEEK_BEFORE_RE.match(lower_name)
    if week_before:
        two_weeks_ago = today - timedelta(weeks=2)
        try:
            return _fmt_day(two_weeks_ago.replace(day=int(week_before.group(1))))
        except ValueError:
            # The day number does not exist in that month (e.g. 31 in April).
            return lower_name

    # 'early november' -> the first five days of the month.
    early = _EARLY_MONTH_RE.match(lower_name)
    if early:
        first_day = datetime(today.year, _MONTH_NUMBERS[early.group(2)], 1)
        return _fmt_span(first_day, first_day + timedelta(days=4))

    # 'the past six months', 'the past seven weeks', 'the end of october'
    relative = _RELATIVE_RE.search(lower_name)
    if relative:
        quantity_word, unit, month_word = relative.groups()
        if quantity_word and unit:
            quantity = _WORD_NUMBERS.get(quantity_word)
            if quantity is None:
                # Accept digits too ('the past 6 months'); anything else
                # ('the past few months') cannot be quantified.
                try:
                    quantity = int(quantity_word)
                except ValueError:
                    return lower_name
            try:
                if unit == 'months':
                    # A month is approximated as 30 days.
                    start_date = today - timedelta(days=quantity * 30)
                else:
                    start_date = today - timedelta(weeks=quantity)
            except OverflowError:
                # Quantity too large to express as a calendar date.
                return lower_name
            return _fmt_span(start_date, today)

        # 'the end of <word>': only month names are understood.
        month = _MONTH_NUMBERS.get(month_word)
        if month is None:
            return lower_name
        return _month_span(today.year, month)

    # Bare day number -> that day in the reference month, if it exists.
    if _DAY_ONLY_RE.match(lower_name):
        day = int(lower_name)
        try:
            datetime(today.year, today.month, day)
        except ValueError:
            return lower_name
        return f"{today.year:04d}-{today.month:02d}-{day:02d}"

    # Cannot convert: hand back the lower-cased input text.
    return lower_name



# Keep only the text of the entities spaCy labelled as DATE.
date_entities = [span.text for span in doc.ents if span.label_ == 'DATE']
print("Date entities:", date_entities)


# Convert every DATE string, preserving the original order.
replaced_dates = list(map(replace_date, date_entities))
print(replaced_dates)

Date entities: ['September 27 2021', 'today', 'seven-day', '15', '17 last week', '19 the week before', 'today', 'early November', 'the next few weeks', 'today', 'today', 'the end of October', 'about six weeks', 'next year', 'May', 'today', 'the coming days', 'seven-day', '15', '17 last week', '19 the week before', 'the past six months', 'the past seven weeks', 'March', 'early October', 'seven days']
6
months
7
weeks
['2021-09-27', '2024-04-09', '2024-04-02 TO 2024-04-09', '2024-04-15', '2024-04-02 TO 2024-04-09', '2024-03-19', '2024-04-09', '2024-11-01 TO 2024-11-05', '2024-04-16 TO 2024-04-30', '2024-04-09', '2024-04-09', '2024-10-01 TO 2024-10-31', '2024-02-27 TO 2024-04-09', '2025-01-01 TO 2025-12-31', '2024-05-01 TO 2024-05-31', '2024-04-09', '2024-04-09 TO 2024-04-13', '2024-04-02 TO 2024-04-09', '2024-04-15', '2024-04-02 TO 2024-04-09', '2024-03-19', '2023-10-12 TO 2024-04-09', '2024-02-20 TO 2024-04-09', '2024-03-01 TO 2024-03-31', '2024-10-01 TO 2024-10-05', '2024-04-02 TO 2024

In [None]:


import csv
import io

print("######################################################")

# Resolve each DATE entity once from its full text (e.g. 'September 27 2021')
# and attach the result to every token the entity spans. Converting tokens one
# by one would treat 'September', '27' and '2021' as three unrelated dates.
resolved_date_by_token = {}
for ent in doc.ents:
    if ent.label_ == 'DATE':
        resolved = replace_date(ent.text)
        for ent_token in ent:
            resolved_date_by_token[ent_token.i] = resolved

# Build one CSV row per token. csv.writer quotes fields that contain a comma,
# a quote or a newline, which plain string concatenation would turn into
# broken rows.
# Columns: text, lemma, pos, tag, dep, shape, is_alpha, is_stop, resolved_date
# The last column is always present (empty for non-DATE tokens) so every row
# has the same number of fields.
token_rows = io.StringIO()
token_writer = csv.writer(token_rows, lineterminator='\n')
for token in doc:
    token_writer.writerow([
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
        resolved_date_by_token.get(token.i, ''),
    ])
allTokens = token_rows.getvalue()

# The with-block guarantees the file is flushed and closed; newline='' keeps
# the '\n' row terminators exactly as written.
with open('/content/drive/MyDrive/Colab Notebooks/TextMining/lab07/deptokens2.csv',
          'w', encoding='utf-8', newline='') as file1:
    file1.write(allTokens)
print(allTokens)

######################################################
Covid,Covid,PROPN,NNP,nmod,Xxxxx,True,False
19,19,NUM,CD,nummod,dd,False,False
Delta,Delta,PROPN,NNP,dobj,Xxxxx,True,False
outbreak,outbreak,NOUN,NN,ROOT,xxxx,True,False
:,:,PUNCT,:,punct,:,False,False
Alert,alert,ADJ,JJ,amod,Xxxxx,True,False
level,level,NOUN,NN,compound,xxxx,True,False
review,review,NOUN,NN,appos,xxxx,True,False
on,on,ADP,IN,prep,xx,True,True
way,way,NOUN,NN,pobj,xxx,True,False
with,with,ADP,IN,prep,xxxx,True,True
new,new,ADJ,JJ,amod,xxx,True,False
vaccine,vaccine,NOUN,NN,compound,xxxx,True,False
passport,passport,NOUN,NN,compound,xxxx,True,False
-,-,PUNCT,HYPH,punct,-,False,False
Deputy,Deputy,PROPN,NNP,compound,Xxxxx,True,False
Prime,Prime,PROPN,NNP,compound,Xxxxx,True,False
Minister,Minister,PROPN,NNP,compound,Xxxxx,True,False
Grant,Grant,PROPN,NNP,compound,Xxxxx,True,False
Robertson,Robertson,PROPN,NNP,pobj,Xxxxx,True,False
.,.,PUNCT,.,punct,.,False,False

,
,SPACE,_SP,dep,
,False,False
September,September,PRO