In [494]:
import spacy
from spacy import displacy

import re
import pandas as pd
from textacy import extract

from collections import defaultdict 
from fuzzywuzzy import fuzz
import time
import uuid

import os
import json

from datetime import datetime

In [495]:
# Notebook-wide display option: never elide columns when a frame is shown.
pd.set_option('display.max_columns', None)
# Stamp the run with the local wall-clock time.
print(time.localtime())

# Output folder for this notebook, and the parsed-summaries workbook it reads.
fpath = "D://projects//_external_files//surveyor//rw_disaster_preprocessed//"
disaster_summary_file = "D://projects//_external_files//surveyor//rw_disaster_preprocessed//disaster_summaries_json_parsed_fb7929b5d8944d72a9a06653ac0822d2.xlsx"

# Load the workbook and turn every missing cell into '' so the text helpers
# below always receive strings.
df_rw_disaster_sum = pd.read_excel(disaster_summary_file).fillna('')

time.struct_time(tm_year=2023, tm_mon=12, tm_mday=10, tm_hour=15, tm_min=35, tm_sec=36, tm_wday=6, tm_yday=344, tm_isdst=0)


In [496]:
def remove_references(text):
    """Remove ``([label](url))`` citation blocks from ``text``.

    A citation is an opening parenthesis, any run of characters other than
    ``)``, two closing parentheses, and at most one trailing whitespace
    character. A block that is missing its second ``)`` is left untouched.

    Args:
        text: paragraph to clean.

    Returns:
        ``text`` without its citation blocks. A value that is not text
        (``None``, NaN, a number, ...) is handed back unchanged.
    """
    pattern = r'\([^)]+\)\)\s?'  # Matches anything inside a ([xxxx](url)) pattern
    try:
        return re.sub(pattern, '', text)
    except TypeError:
        # re.sub rejects non-string input with TypeError; keep the original
        # best-effort pass-through for those cells, but no longer swallow
        # unrelated errors (KeyboardInterrupt, MemoryError, ...).
        return text

def secondary_pass_text_scrub(text):
    """Second-pass normalisation of a summary paragraph.

    Steps, in this order:
      1. spelled-out numbers ("one".."twenty", "dozen") become digits; any
         punctuation attached to the word is kept ("two," -> "2,");
      2. standalone cardinal directions are lower-cased and compound forms
         ("North-East", "north east") are joined ("northeast");
      3. "local time" / "local-time" / "localtime" become "localtime";
      4. remaining "[...]" spans are removed;
      5. "per cent" -> "percent"; newlines -> spaces; "34km" / "34 km" ->
         "34 kilometers"; "..." -> ". "; whitespace runs -> one space;
      6. hyphens are rewritten (see ``standardize_hyphens``);
      7. surrounding whitespace is stripped.

    Args:
        text: paragraph to clean.

    Returns:
        The cleaned string. A non-string value (e.g. ``None`` or NaN from an
        empty cell) is returned unchanged.
    """

    spelled_numbers = {
        'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
        'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
        'eighteen': 18, 'nineteen': 19, 'twenty': 20, 'dozen': 12,
    }

    def convert_spelled_nums_to_digit(token):
        """Return ``token`` with a spelled-out number replaced by its digits."""
        # Split into leading punctuation, the word itself, trailing punctuation,
        # so "(five)" becomes "(5)" instead of losing its parentheses.
        parts = re.fullmatch(r'([^a-zA-Z0-9]*)([a-zA-Z]+)([^a-zA-Z0-9]*)', token)
        if parts is None:
            return token
        lead, word, trail = parts.groups()
        digit = spelled_numbers.get(word.lower())
        if digit is None:
            return token
        return f"{lead}{digit}{trail}"

    def standardize_hyphens(text):
        """Drop leading non-alphanumerics and rewrite hyphens between alphanumerics."""
        # Remove any run of characters other than ASCII letters/digits at the start
        text = re.sub(r'^[^a-zA-Z0-9]+', '', text)

        # hyphen between two letters -> underscore, e.g. re-open -> re_open
        text = re.sub(r'([a-zA-Z])\-([a-zA-Z])', r'\1_\2', text)

        # hyphen between a letter and a digit -> underscore, e.g. COVID-19 -> COVID_19
        text = re.sub(r'([a-zA-Z])\-([\d])', r'\1_\2', text)

        # hyphen between two digits -> ' to ', e.g. 2-3 -> 2 to 3
        # NOTE(review): this also rewrites ISO dates (2023-10-11 -> 2023 to 10 to 11).
        text = re.sub(r'([\d])\-([\d])', r'\1 to \2', text)

        return text

    def standardize_cardinal_directions(text):
        """Lower-case direction words and join two-part compass points."""
        # Standalone words first, whatever their original casing.
        for word in ('north', 'northern', 'south', 'southern',
                     'east', 'eastern', 'west', 'western'):
            text = re.sub(rf'\b({word})\b', word, text, flags=re.IGNORECASE)

        # Then the compounds: "north east", "north-east", "NorthEast" -> "northeast".
        # There is deliberately no trailing \b, so "north eastern" -> "northeastern".
        for ns in ('north', 'south'):
            for ew in ('east', 'west'):
                text = re.sub(rf'\b({ns}{ew}|{ns}[\s-]?{ew})', ns + ew, text, flags=re.IGNORECASE)

        return text

    def standardize_time_indicators(text):
        """Collapse "local time" / "local-time" into the single token "localtime"."""
        return re.sub(r'\b(local[\s-]?time)\b', 'localtime', text, flags=re.IGNORECASE)

    # Empty spreadsheet cells can arrive as None/NaN; leave them untouched
    # instead of failing half-way through the pipeline.
    if not isinstance(text, str):
        return text

    #turn 'four' into 4
    text = ' '.join(convert_spelled_nums_to_digit(t) for t in text.split(" "))
    text = standardize_cardinal_directions(text)
    text = standardize_time_indicators(text)

    # get rid of remaining content within square brackets
    text = re.sub(r'\[.*?\]', '', text)

    # "per cent" to "percent"
    text = re.sub(r'per cent', 'percent', text)

    # "\n" to " "
    text = re.sub(r'\n', ' ', text)

    # change 34km to 34 kilometers
    text = re.sub(r'(\d+)\s?km\b', r'\1 kilometers', text)

    # ellipses -> sentence break (a plain '. '; the former '\. ' replacement
    # inserted a literal backslash into the text)
    text = re.sub(r'\.\.\.', '. ', text)

    # 2+ spaces in a row
    text = re.sub(r'\s{2,}', ' ', text)

    text = standardize_hyphens(text)

    #not doing this because it strips out other scripts' unicode
    #remove all non alpha numeric and punctuation
    #pattern = r'[^a-zA-Z0-9\s\,\.\?\!\-\(\)]'
    #text = re.sub(pattern, '', text)

    # str.strip() returns a new string; the result must be returned
    return text.strip()


def string_remove_parenthetical_content(text):
    """Delete every "(...)" group from ``text`` and squeeze whitespace runs.

    A group runs from "(" to the next ")"; an unmatched "(" is left alone.
    Any run of two or more whitespace characters left behind becomes a
    single space.
    """
    parenthetical = re.compile(r'\([^)]*\)')
    whitespace_run = re.compile(r'\s{2,}')
    without_groups = parenthetical.sub('', text)
    return whitespace_run.sub(' ', without_groups)


In [497]:



# Reference parsing is currently disabled:
#df_rw_disaster_sum[['references','reference_auth_org','reference_date_str','reference_date_iso','reference_url']] = df_rw_disaster_sum['text'].apply(parse_references) 

# Clean the text column: citations are stripped first, then the result goes
# through the second-pass scrub.
df_rw_disaster_sum['text'] = (
    df_rw_disaster_sum['text']
    .apply(remove_references)
    .apply(secondary_pass_text_scrub)
)
# Every row in this dataset is attributed to the same source.
df_rw_disaster_sum['authoring_org'] = 'reliefweb'
# Extra copy of the cleaned text with "(...)" groups removed.
df_rw_disaster_sum['non_parenthetical_text'] = df_rw_disaster_sum['text'].apply(string_remove_parenthetical_content)

def generate_uuid(x):
    """Return a new random (version 4) UUID as 32 lowercase hex characters.

    ``x`` is ignored; it only lets the function be used where a one-argument
    callable is expected.
    """
    fresh_id = uuid.uuid4()
    # Same rendering as UUID.hex: the 128-bit value, zero-padded to 32 digits.
    return '%032x' % fresh_id.int

# Write the cleaned frame to a new workbook under fpath. The random hex
# suffix gives every run its own file name, so earlier exports are not
# overwritten.
output_file = f"{fpath}disaster_summaries_preprocessed_{generate_uuid(1)}.xlsx"
df_rw_disaster_sum.to_excel(output_file, index=False)
# Echo the path so the export can be located from the cell output.
print(output_file)

D://projects//_external_files//surveyor//rw_disaster_preprocessed//disaster_summaries_preprocessed_9fcc0753cbbf4fb7a37ea5a15f872a11.xlsx


In [500]:
# Spot check: show the original paragraph and its fully cleaned counterpart
# for one row position, separated by a blank line.
x = 326

print(
    df_rw_disaster_sum['source_original_text'].tolist()[x],
    '',
    df_rw_disaster_sum['non_parenthetical_text'].tolist()[x],
    sep='\n',
)

As of 30 January, almost 67,000 people are sheltered in 415 evacuation centres, while approximately 225,400 people have been displaced in other areas. Overall, more than 464,700 people have been affected by the eruption. €750,000 in emergency humanitarian funding has been released by DG ECHO to assist those affected. Taal Volcano may erupt in the coming days as seismic activity suggests magma is flowing into the volcano. An Alert Level 3 is in effect signalling a possible eruption within weeks. ([ECHO, 30 Jan 2020](https://reliefweb.int/node/3499483)

As of 30 January, almost 67,000 people are sheltered in 415 evacuation centres, while approximately 225,400 people have been displaced in other areas. Overall, more than 464,700 people have been affected by the eruption. €750,000 in emergency humanitarian funding has been released by DG ECHO to assist those affected. Taal Volcano may erupt in the coming days as seismic activity suggests magma is flowing into the volcano. An Alert Level 3 

## END

In [438]:
def parse_summary_json(j):
    """Append one row per paragraph of a disaster record to ``df_rw_disaster_sum``.

    The record's ``fields.description`` is split on blank lines ("\\n\\n");
    each stripped paragraph becomes one row, stored in both the 9th and 11th
    row positions, with its paragraph index in the 5th.

    Args:
        j: mapping with a ``fields`` entry providing ``status``, ``name``,
            ``description``, ``url_alias``, ``primary_country.shortname`` and
            ``date.changed``; ``glide`` is optional.

    Returns:
        None. Rows are added to the module-level ``df_rw_disaster_sum``,
        which must already exist with 13 columns in the order used by
        ``row`` below.

    Raises:
        KeyError: if one of the required fields is missing.
    """
    fields = j['fields']

    status = fields['status']
    glide_id = fields.get('glide')  # not every record carries a GLIDE id
    title = fields['name']
    file_url = fields['url_alias']
    primary_country = fields['primary_country']['shortname']
    report_date = fields['date']['changed']

    # Not read from the payload; stored as empty placeholders.
    reference_url = ''
    author_org = ''

    for idx_para, paragraph in enumerate(fields['description'].split("\n\n")):
        paragraph = paragraph.strip()
        # A fresh [] per row, so rows never share one mutable list object.
        row = ['disaster summary', status, file_url, glide_id, idx_para, primary_country, title, [],
               paragraph, reference_url, paragraph, author_org, report_date]
        # NOTE(review): len(df) is only a new label while the index is the
        # default 0..n-1 range; after row filtering it could hit an existing
        # label and overwrite that row — confirm the frame is freshly created.
        df_rw_disaster_sum.loc[len(df_rw_disaster_sum)] = row


def parse_references(text):
    """Extract citation details from the last ``([org, date](url))`` block in ``text``.

    Args:
        text: paragraph that may end with one or more citation blocks.

    Returns:
        pandas.Series with, in this order:
          references    - list of every citation block found (may be empty)
          auth_org      - organisation named in the last block, or ''
          date_str      - date exactly as written in the last block, or ''
          date_iso      - that date as YYYY-MM-DD, or None when there is no
                          citation or the date cannot be parsed
          reference_url - URL of the last block, or ''
    """
    org = ''
    date = ''
    iso_date = None
    url = ''

    pattern = r'\([^)]+\)\)\s?'  # Matches anything inside a ([xxxx](url)) pattern

    ref = re.findall(pattern, text)
    if ref:
        # Only the final citation of the paragraph is broken into parts.
        last_reference = ref[-1]
        matches = re.search(r'\[(.*?),\s(\d+\s\w+\s\d+)\]\((.*?)\)', last_reference)

        if matches:
            org, date, url = matches.groups()

            # Accept both abbreviated ("11 Oct 2023") and full ("11 October
            # 2023") month names. An unparseable date leaves date_iso as None
            # instead of raising and aborting the whole DataFrame.apply.
            for date_format in ("%d %b %Y", "%d %B %Y"):
                try:
                    iso_date = datetime.strptime(date, date_format).date().isoformat()
                    break
                except ValueError:
                    continue
        else:
            print("No match found.")

    return pd.Series({'references':ref,'auth_org':org,'date_str':date,'date_iso':iso_date,'reference_url':url})

def remove_references(text):
    """Return ``text`` with every ``([label](url))`` citation block removed.

    A block is "(" followed by characters other than ")", then "))" and at
    most one trailing whitespace character.
    """
    citation_block = re.compile(r'\([^)]+\)\)\s?')
    return citation_block.sub('', text)

def secondary_pass_text_scrub(text):
    """Second-pass normalisation of a summary paragraph (capitalising variant).

    Steps, in this order:
      1. spelled-out numbers ("one".."twenty", "dozen") become digits; any
         punctuation attached to the word is kept ("two," -> "2,");
      2. standalone cardinal directions are capitalised and compound forms
         ("north-east", "north east") are joined ("Northeast");
      3. "local time" / "local-time" become "localtime";
      4. remaining "[...]" spans are removed;
      5. "per cent" -> "percent"; newlines -> spaces; "34km" / "34 km" ->
         "34 kilometers"; whitespace runs -> one space;
      6. surrounding whitespace is stripped.

    Args:
        text: paragraph to clean (must be a string).

    Returns:
        The cleaned string.
    """

    spelled_numbers = {
        'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
        'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
        'eighteen': 18, 'nineteen': 19, 'twenty': 20, 'dozen': 12,
    }

    def convert_spelled_nums_to_digit(token):
        """Return ``token`` with a spelled-out number replaced by its digits."""
        # Keep leading/trailing punctuation so "(five)" becomes "(5)" rather
        # than losing its parentheses.
        parts = re.fullmatch(r'([^a-zA-Z0-9]*)([a-zA-Z]+)([^a-zA-Z0-9]*)', token)
        if parts is None:
            return token
        lead, word, trail = parts.groups()
        digit = spelled_numbers.get(word.lower())
        if digit is None:
            return token
        return f"{lead}{digit}{trail}"

    def standardize_cardinal_directions(text):
        """Capitalise direction words and join two-part compass points."""
        #first capitalize the standalone words
        for word in ('north', 'northern', 'south', 'southern',
                     'east', 'eastern', 'west', 'western'):
            text = re.sub(rf'\b({word})\b', word.capitalize(), text, flags=re.IGNORECASE)

        # Standardize on the joined, capitalised form ("Southeast") for each variant.
        # There is no trailing \b, so "north eastern" -> "Northeastern".
        for ns in ('north', 'south'):
            for ew in ('east', 'west'):
                text = re.sub(rf'\b({ns}{ew}|{ns}[\s-]?{ew})', (ns + ew).capitalize(), text, flags=re.IGNORECASE)

        return text

    def standardize_time_indicators(text):
        """Collapse "local time" / "local-time" into the single token "localtime"."""
        return re.sub(r'\b(local[\s-]?time)\b', 'localtime', text, flags=re.IGNORECASE)

    #turn 'four' into 4
    text = ' '.join(convert_spelled_nums_to_digit(t) for t in text.split(" "))
    text = standardize_cardinal_directions(text)
    text = standardize_time_indicators(text)

    # get rid of remaining content within square brackets
    text = re.sub(r'\[.*?\]', '', text)

    # "per cent" to "percent"
    text = re.sub(r'per cent', 'percent', text)

    # "\n" to " "
    text = re.sub(r'\n', ' ', text)

    # change 34km to 34 kilometers
    text = re.sub(r'(\d+)\s?km\b', r'\1 kilometers', text)

    # 2+ whitespace characters in a row
    text = re.sub(r'\s{2,}', ' ', text)

    # str.strip() returns a new string; the result must be returned
    return text.strip()


def string_remove_parenthetical_content(text):
    """Strip "(...)" groups from ``text``, then collapse whitespace runs.

    Each group spans from "(" to the nearest ")"; runs of two or more
    whitespace characters that remain are replaced by one space.
    """
    return re.sub(r'\s{2,}', ' ', re.sub(r'\([^)]*\)', '', text))


In [414]:
#prep receiving df: an empty frame whose column order matches the row built
#by parse_summary_json
df_rw_disaster_sum = pd.DataFrame(columns = ['record_type','status','source_url','glide_id','idx_para','source_level_country','source_title','source_desc',
                                                        'source_original_text','reference_url','text','authoring_org','reported_date'])

f = "D://projects//_external_files//reliefweb_disaster_reports//51754_2023-11-16_reliefweb_disaster_afghanistan.json"

# Read the JSON as UTF-8 explicitly; without it open() uses the platform
# default encoding, which can garble non-ASCII characters in the text.
with open(f, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# One row per paragraph of the report description.
parse_summary_json(json_data)

# Split the trailing citation into its parts (assigned to the target columns
# by position), then clean the text itself.
df_rw_disaster_sum[['references','reference_auth_org','reference_date_str','reference_date_iso','reference_url']] = df_rw_disaster_sum['text'].apply(parse_references) 
df_rw_disaster_sum['text'] = df_rw_disaster_sum['text'].apply(remove_references) 
df_rw_disaster_sum['text'] = df_rw_disaster_sum['text'].apply(secondary_pass_text_scrub) 
df_rw_disaster_sum['authoring_org'] = 'reliefweb'
df_rw_disaster_sum['non_parenthetical_text'] = df_rw_disaster_sum['text'].apply(string_remove_parenthetical_content)

            

In [415]:
# Keep only rows whose reference_auth_org is non-empty, i.e. paragraphs for
# which a citation was parsed. This rebinds df_rw_disaster_sum, so the
# dropped rows are no longer available to later cells.
df_rw_disaster_sum = df_rw_disaster_sum[df_rw_disaster_sum['reference_auth_org'] != '']


In [418]:
# Spot check: show the original paragraph and its fully cleaned counterpart
# for one row position, separated by a blank line.
x = 4

print(
    df_rw_disaster_sum['source_original_text'].tolist()[x],
    '',
    df_rw_disaster_sum['non_parenthetical_text'].tolist()[x],
    sep='\n',
)



A new 6.3 M earthquake at a depth of 9 km, followed by two aftershocks of 5 M and 4.1 M, hit Herat Province on 11 October at 00:41 UTC (05:11 local time). The 6.3 M earthquake was located in Zinda Jan District, 27 km northwest of Herat City and at 14 km south-east of the 6.3 M earthquake that occurred on 7 October that caused more than 2,400 fatalities, as of 10 October. According to USGS, approximately 4,000 people were exposed to severe shaking and 1,273 million people were exposed to strong and very strong shaking. Media reports, as of 11 October, 80 injured people that have been hospitalized while rescue operation are still ongoing throughout the region for the 11 October earthquake. ([ECHO, 11 Oct 2023](https://reliefweb.int/node/4004884))

A new 6.3 M earthquake at a depth of 9 kilometers, followed by 2 aftershocks of 5 M and 4.1 M, hit Herat Province on 11 October at 00:41 UTC . The 6.3 M earthquake was located in Zinda Jan District, 27 kilometers Northwest of Herat City and at 

In [329]:
# Dump the current frame to a scratch workbook for manual inspection.
# The recorded run failed with PermissionError (see the output below) —
# presumably the file was open in another program; TODO confirm.
df_rw_disaster_sum.to_excel("c:/temp/afghan_test.xlsx", index=False)


PermissionError: [Errno 13] Permission denied: 'c:/temp/afghan_test.xlsx'

## Clean up numbers

In [369]:
# Hand-picked sample paragraphs for experimenting with number extraction.
text = """On 7 October 2023 at around 11.00 local time, a 6.3 magnitude earthquake struck 40km west of Herat City in Herat Province, western Afghanistan. Several aftershocks have occurred since, 
with the initial quake felt in neighbouring Badghis and Farah provinces. Initial assessments indicate that as many as 100 people have been killed across eight villages in Zindajan Province, 
Herat Province – Mahal Wardkah (20), Dasht Hows (15), Bahadorzai (14), Zoryan (13), Koshkak (12), Sar Boland (11), Sanjab (8), and Hilalzai (7), with a further 500 people injured. """

# NOTE: this first text2 is immediately overwritten by the assignment below;
# only the second sample reaches nlp().
text2 = """
To date, 1,023 people have been killed and 1,663 people injured across eleven villages of Zindajan district, Herat Province, where 100 percent of homes are estimated to have been completely destroyed. 
A further 516 people (203 men and 213 women) are reported to be missing from the district. In total, 11,585 people (1,655 families) are assessed to have been affected to date across south-east Zindajan district (1,320 families), 
and Injil (150 families), Gulran (95 families), southeast Kohsan (60 families) and Southeast Kushk (Robat-e-Sagani) districts (30 families). """


text2 = """
More than 3 quarters of homes destroyed are located in 2 districts – Zinjadin (1,353) and Injil districts (586). Additionally, 21,300 buildings are estimated to have sustained damage.
"""

# Imports repeated mid-notebook so this cell also runs on its own.
import spacy
import textacy
# Load the small English pipeline.
nlp = spacy.load('en_core_web_sm')

# Parse the sample; `doc` is reused (and rebound) by the cells that follow.
doc = nlp(text2)



In [423]:
# Collect (text, label, entity id) for every named entity in the current
# `doc`; the bare name on the last line displays the list.
entities = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
entities

[('As many as 100', 'CARDINAL', '')]

In [424]:
from textacy import extract

# Noun chunks of the current `doc` with determiners dropped
# (drop_determiners=True), materialised as a list.
noun_chunks = list(extract.noun_chunks(doc,drop_determiners=True))
# Print each chunk as the list of its tokens' lemmas.
for i in noun_chunks:
    print([w.lemma_ for w in i])

In [358]:
# Evaluates to the bound list.index method (see the output below); the
# method is not called. NOTE(review): looks like leftover exploration.
noun_chunks.index


<function list.index(value, start=0, stop=9223372036854775807, /)>

In [359]:
# For every noun chunk print its text, its root token, that root's
# dependency label and the text of the root's head token.
# The loop variable `chunk` stays bound after the loop and is read again by
# a later cell.
for chunk in doc.noun_chunks:
    print(f"{chunk.text}, root: {chunk.root.text}, dep: {chunk.root.dep_},   root_head: {chunk.root.head.text}")
    


More than 3 quarters, root: quarters, dep: nsubjpass,   root_head: located
homes, root: homes, dep: pobj,   root_head: of
2 districts, root: districts, dep: pobj,   root_head: in
21,300 buildings, root: buildings, dep: nsubjpass,   root_head: estimated
sustained damage, root: damage, dep: dobj,   root_head: have


In [360]:
# Print each sentence of the current `doc` as segmented by the pipeline.
for sent in doc.sents:
    print(sent)


More than 3 quarters of homes destroyed are located in 2 districts – Zinjadin (1,353) and Injil districts (586).
Additionally, 21,300 buildings are estimated to have sustained damage.



In [361]:
# Print the lemma of every verb, sentence by sentence. For the verb "kill"
# also show how the token attaches to the parse tree.
for sent in doc.sents:
    for t in sent:
        if t.pos_ == 'VERB':
            print(t.lemma_)
            if t.lemma_ == 'kill':
                # A spaCy Token has no `.root` attribute (that belongs to
                # Span), so report the token's own dependency label and head.
                print(f"{t.text}, dep: {t.dep_},   head: {t.head.text}")

destroy
locate
estimate
have
sustain


In [362]:
# Exploration: walk the dependency tree around every "kill" token to find
# the verbs it is connected to.
# TODO: people_killed / people_injured are never updated below; the counts
# printed at the end are always 0 until the extraction is implemented.
people_killed = 0
people_injured = 0
tokens_of_interest = []

for token in doc:
    # Check for tokens related to "killed"
    if token.lemma_ in ["kill"]:
        # Collect the verbs above and directly below this token. The list is
        # accumulated across all matches instead of being reset per match,
        # which previously kept only the last match's verbs.
        for ancestor in token.ancestors:
            print(f"ancestors: {ancestor.pos_}, {ancestor.text}")
            if ancestor.pos_ == 'VERB':
                tokens_of_interest.append(ancestor)
        for child in token.children:
            print(f"children: {child.pos_}, {child.text}")
            if child.pos_ == 'VERB':
                tokens_of_interest.append(child)


# De-duplicate while keeping first-seen order (a set gives no stable order).
tokens_of_interest = list(dict.fromkeys(tokens_of_interest))
print(tokens_of_interest)


for t in tokens_of_interest:
    print(f"now examining {t}")
    for ancestor in t.ancestors:
        print(f"ancestors: {ancestor.pos_}, {ancestor.text}")
    for child in t.children:
        print(f"children: {child.pos_}, {child.text}")



print(f"People killed: {people_killed}")
print(f"People injured: {people_injured}")


[]
People killed: 0
People injured: 0


In [422]:
# Parse a short quantifier phrase and print, per token: its text, dependency
# label, head text, head part-of-speech and list of children.
# NOTE: this rebinds the notebook-level `doc`.
doc = nlp("As many as 100.")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_, [child for child in token.children])
    print()

As advmod 100 NUM []

many amod 100 NUM []

as quantmod 100 NUM []

100 ROOT 100 NUM [As, many, as, .]

. punct 100 NUM []



In [364]:
# Token offsets of `chunk`, i.e. whatever the most recently executed
# `for chunk in ...` loop left bound.
# NOTE(review): depends on cell execution order.
print(f"start {chunk.start} --  end {chunk.end}")



start 33 --  end 35


In [365]:
# List the attributes of the first item in `entities`. The output below shows
# plain tuple methods only, so the item has no .start/.end — which is why
# the print underneath is commented out.
for e in entities[0:1]:
    print(dir(e))
    #print(f"start {e.start} --  end {e.end}")


['__add__', '__class__', '__class_getitem__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'count', 'index']


In [366]:
# Same entities as tuples of (text, start token index, end token index,
# label), taken from the Span objects in doc.ents.
entity_spans = [(ent.text, ent.start, ent.end, ent.label_) for ent in doc.ents]

print(entity_spans)

[('1,023', 3, 4, 'CARDINAL'), ('1,663', 12, 13, 'CARDINAL'), ('eleven', 16, 17, 'CARDINAL'), ('Zindajan', 19, 20, 'GPE'), ('Herat Province', 22, 24, 'GPE'), ('100 per cent', 26, 29, 'MONEY')]


In [433]:
# Build one long string: a synthetic seed sentence followed by every cleaned
# paragraph, each separated by a single space.
# NOTE: the loop rebinds the notebook-level `x` to the last paragraph.
text = 'there is a 87 percent likelihood. '
for x in df_rw_disaster_sum['non_parenthetical_text'].tolist():
    text = text + ' ' + x



# Parse the combined text (rebinds `doc`) and render the entity highlights.
doc = nlp(text)

from spacy import displacy
displacy.render(doc, style="ent")

In [434]:
# For every entity labelled CARDINAL print the entity itself, then (indented)
# each of its tokens whose part-of-speech is NUM. An entity without NUM
# tokens prints with nothing under it.
for e in doc.ents:
    if e.label_ == 'CARDINAL':
        print(e)
        for t in e:
            if t.pos_ == 'NUM':
                print(f"   {t}")

6.3
   6.3
as many as 100
   100
8
   8
500
   500
1,023
   1,023
1,663
   1,663
11
   11
516
   516
11,585
   11,585
6.3
   6.3
12,110
   12,110
5
   5
1,294
   1,294
1,688
   1,688
485
   485
5.1
   5.1
13.30
   13.30
2
   2
5
   5
4.1
   4.1
6.3
   6.3
6.3
   6.3
more than 2,400
   2,400
approximately 4,000
   4,000
1,273 million
   1,273
   million
80
   80
6.3
   6.3
1
   1
140
   140
at least 2,400
   2,400
several hundred
   hundred
1,714
   1,714
11,066
   11,066
4
   4
15
   15
3
   3
38
   38
1,384
   1,384
1,853
   1,853
3,067
   3,067
2,499
   2,499
363
   363
250
   250
2
   2
21,300
   21,300
6.3
   6.3
2
   2
more than 150
   150
43,395
   43,395
More than 3,330
   3,330
2,137
   2,137
1,697
   1,697
6.0
   6.0
more than 21,500
   21,500
around 154,000
   154,000
2
   2
7
   7
1,480
   1,480
1,950
   1,950
513
   513
nearly 43,400
   43,400
6
   6
More than half
more than 3,330
   3,330
21,300
   21,300
48,347
   48,347
275,256
   275,256
382
   382
10,002
   10,002
20,4

In [420]:
# Render the dependency parse of the current `doc` with displaCy.
displacy.render(doc, style="dep")