In [1]:
import spacy
from spacy import displacy

import re
import pandas as pd
from textacy import extract

from collections import defaultdict 

In [2]:
nlp = spacy.load("en_core_web_md")

In [3]:
def convert_spelled_nums_to_digit(token):
    mappings = {
        'one' : 1,'two' : 2,'three' : 3,'four' : 4,'five' : 5,'six' : 6,'seven' : 7,'eight' : 8,'nine' : 9,'ten' : 10
    }

    if mappings.get(token) is not None:
        return mappings[token]
    else:
        return token

assert convert_spelled_nums_to_digit("one") == 1
assert convert_spelled_nums_to_digit("onet") == 'onet'


In [4]:
STOP_ENTS = ['WASH','PSS','GTC','PFA','NFI','IYCF']

In [7]:
f = 'Herat_Earthquake_Flash_Update_13Oct2023_FINAL.txt'
f = 'unrwa_gaza_sitrep_27_nov_10_2023_eng.txt'
f = 'Herat Earthquake_Flash Update #1_7Oct2023_FINAL.txt'
file = f'D://projects//_external_files//surveyor//{f}'

#df_entities[df_entities['sent_idx'] == idx]

with open(file) as f:
    text = f.read()
    
def string_preprocess(text):
    text = text.replace("\n"," ")
    text = text.replace("-","_") #replace so tokenization doesn't separate

    #turn 'four' into 4
    text = ' '.join([str(convert_spelled_nums_to_digit(t)) for t in text.split(" ")])


    # remove content in parentheses
    #processed_string = re.sub(r'\([^)]*\)', '', input_string)

    #remove all non alpha numeric and punctuation
    pattern = r'[^a-zA-Z0-9\s\,\.\?\!\-\(\)]'
    text = re.sub(pattern, '', text)

    pattern = r'(\d+)\s+million'
    text = re.sub(r'(\d+)\s+million', r'\1,000,000', text)

    
    #remove commas that serve as thousands separators
    #Hack... fix this so I don't have to run it 3x
    text = re.sub(r'(\d+),(\d+)', r'\1\2', text)
    text = re.sub(r'(\d+),(\d+)', r'\1\2', text)
    text = re.sub(r'(\d+),(\d+)', r'\1\2', text)

    text = text.replace("\s+","\s")

    return text
date_of_report = '2023-10-13'
source = 'UNOCHA'
text = string_preprocess(text)
full_doc = nlp(text)

def string_remove_parenthetical_content(text):
    # Use regular expression to remove content inside parentheses
    text = re.sub(r'\([^)]*\)', '', text)

    return text
    
non_parens_text = string_remove_parenthetical_content(text)

In [8]:
admin_info = "D:\\projects\\_external_files\\country_info\\afg_adminboundaries_tabulardata_2.xlsx"

def build_admin_reference(admin_info):
    admin_df = pd.read_excel(admin_info, sheet_name="ADM2")
    return admin_df

admin_df = build_admin_reference(admin_info)

admins = []
admins.extend(admin_df['ADM2_EN'].tolist())
admins.extend(admin_df['ADM1_EN'].tolist())
admins.extend(admin_df['ADM0_EN'].tolist())
admins = list(set(admins))
admins.append('Herat')
admin_dict = {}
for ele in admins:
    admin_dict[ele]=1

In [9]:
def build_base_dataframe(text, filename='', date_of_report='', source=''):
    string_sent = [s.strip() for s in text.split('.')]

    dataframe_data=[]
    i=0
    for s in string_sent:
        x = [date_of_report,source,filename,i,s]
        i+=1
        dataframe_data.append(x)

    

    df = pd.DataFrame(dataframe_data,columns=['report_date','source','filename','sent_idx','string_sentence'])
    df['spacy_doc'] = df['string_sentence'].apply(lambda x: nlp(x))
    df['lower_lemmas'] = df['spacy_doc'].apply(lambda x: [w.lemma_.lower() for w in x])
    
    df['string_sent_wo_parens'] = df['string_sentence'].apply(string_remove_parenthetical_content)
    df['spacy_wo_parens'] = df['string_sent_wo_parens'].apply(lambda x: nlp(x))
    df['wo_parens_lower_lemmas'] = df['spacy_wo_parens'].apply(lambda x: [w.lemma_.lower() for w in x])
        

    df['locations'] = df['spacy_doc'].apply(lambda doc: [e for e in doc.ents if e.label_ == 'GPE'])
    df['dates'] = df['spacy_doc'].apply(lambda doc: [e for e in doc.ents if e.label_ == 'DATE'])
    df['svot'] = df['spacy_wo_parens'].apply(lambda doc: list(extract.subject_verb_object_triples(doc)))

    

    return df

df = build_base_dataframe(text, filename=file, date_of_report=date_of_report, source=source)


In [10]:
#keyword_indicators
indicators = {
    'i_people' : ['people','person','child','man','woman']
    ,'i_civilian' : ['civilian']
    ,'i_killed' : ['dead','fatal','die','kill','deceased']
    ,'i_injured' : ['injure']
    ,'i_damage' : ['damage','destroy','collapse']
    ,'i_health_infrastructure' : ['hospital','surgery']
    ,'i_education_infrastructure' : ['school','university']
    ,'i_cash_xfer' : ['xx']
    ,'i_wash' : ['sanitation','water','sewer','drain','drainage']
    ,'i_shelter' : ['shelter','tent','camp','blanket']
    ,'i_food' : ['food','cook','stove','feed','feed','nutrient','meal']
    ,'i_health' : ['health','medical','medicine']
    ,'i_gender_vuln' : ['dignity','gender','pregnant','lactate','lactating']
    ,'i_protection' : ['trauma','mental']
    ,'i_response_capacity' : ['personnel']
    ,'i_other_infrastructure' : ['communicate','radio','internet']

    ,'i_problem' : ['challenge']
    ,'i_demand_side' : ['need','demand','gap','priority', 'receive'] # note receive implies both supply and demand
    ,'i_supply_side' : ['response','contribute','provide','source','address','deploy','receive'] # note receive implies both supply and demand

    ,'i_assessments' : ['assess','assessment']
}

#some words, particularly verbs, must be in their conjucated form to be useful
#eg. "sourcing" vs "source" has too much potential overlap
#verb_tense_indicators = {
#    'i_supply_side_v' : ['sourcing','providing']
#}


In [11]:
def find_and_add_indicator(df, indicators):
    ind_counter = []
    for ind in indicators:
  
        df[ind] = df['lower_lemmas'].apply(lambda x: 1 if len([w for w in x if w in indicators[ind]])>0 else 0)
        ind_counter.append(ind)
        #print(ind_counter)
    df['i_count'] = df[ind_counter].sum(axis=1)

    return df

df = find_and_add_indicator(df, indicators)

def declare_primary_record_type(row):

    if row['i_count'] == 0:
        return 'background'
    elif row['i_supply_side']:
        return 'response_details'
    elif row['i_demand_side']:
        return 'demand_side'
    elif row[['i_damage','i_health_infrastructure','i_education_infrastructure']].sum() > 0:
        return 'damage_to_homes_and_infrastructure'
    else:
        return 'other'




df['record_type'] = df.apply(declare_primary_record_type, axis=1)

## Now That Indicators Are set - extract more specific details

In [12]:
def obtain_killed_numeric_value(doc):

    key_values = []
    
    def check_flags(lst):
        for l in lst:
            if l == -1:
                return False
        return True

    #doc = doc.tolist()[0]
    attribute = -1
    noun = -1
    count = -1

    for t in doc:
        if str(t).isdigit():
            count = t
        if t in indicators['i_people']:
            noun = t
        if t in indicators['i_killed']:
            attribute = t

        if check_flags([noun,attribute,count]):

            noun_att_cnt = (noun,attribute,count)
            key_values.append(noun_att_cnt)

            noun = -1
            attribute = -1
            count = -1

    return key_values
            
    
def obtain_injured_numeric_value(doc):

    key_values = []
    
    def check_flags(lst):
        for l in lst:
            if l == -1:
                return False
        return True

    #doc = doc.tolist()[0]
    attribute = -1
    noun = -1
    count = -1

    for t in doc:
        if str(t).isdigit():
            count = t
        if t in indicators['i_people']:
            noun = t
        if t in indicators['i_injured']:
            attribute = t

        if check_flags([noun,attribute,count]):

            noun_att_cnt = (noun,attribute,count)
            key_values.append(noun_att_cnt)

            noun = -1
            attribute = -1
            count = -1

    return key_values

def obtain_counted_noun_chunks(doc):
    counted_things = []
    for x in list(extract.noun_chunks(doc)):
        for token in x:
            if str(token).isdigit():
                counted_things.append(x)
                continue
    if len(counted_things) > 0:
        return counted_things
    else:
        return ''

def obtain_all_entities(doc):
    #stop_ents = STOP_ENTS
    ents = list(extract.entities(doc))
    if len(ents) < 1:
        return None
    entities = defaultdict(list) 
    for e in ents:
        #if e.text not in stop_ents:
        entities[e.label_].append(e)

    return entities   
    

print(obtain_killed_numeric_value(df['wo_parens_lower_lemmas'][df['i_killed'] == 1].tolist()[0]))
print(obtain_injured_numeric_value(df['wo_parens_lower_lemmas'][df['i_injured'] == 1].tolist()[0]))
        

[('people', 'kill', '100')]
[('people', 'injure', '500')]


In [13]:
df['num_killed'] = df['wo_parens_lower_lemmas'][df['i_killed'] == 1].apply(obtain_killed_numeric_value)
df['num_injured'] = df['wo_parens_lower_lemmas'][df['i_injured'] == 1].apply(obtain_injured_numeric_value)
df['num_others'] = df['spacy_wo_parens'].apply(obtain_counted_noun_chunks)
df['noun_chunks'] = df['spacy_wo_parens'].apply(lambda doc: list(extract.noun_chunks(doc)))
df['entities'] = df['spacy_wo_parens'].apply(obtain_all_entities)

In [14]:
df[df['i_killed'] == 1]

Unnamed: 0,report_date,source,filename,sent_idx,string_sentence,spacy_doc,lower_lemmas,string_sent_wo_parens,spacy_wo_parens,wo_parens_lower_lemmas,...,i_demand_side,i_supply_side,i_assessments,i_count,record_type,num_killed,num_injured,num_others,noun_chunks,entities
4,2023-10-13,UNOCHA,D://projects//_external_files//surveyor//Herat...,4,Initial assessments indicate that as many as 1...,"(Initial, assessments, indicate, that, as, man...","[initial, assessment, indicate, that, as, many...",Initial assessments indicate that as many as 1...,"(Initial, assessments, indicate, that, as, man...","[initial, assessment, indicate, that, as, many...",...,0,0,1,4,other,"[(people, kill, 100)]","[(people, injure, 500)]","[(as, many, as, 100, , people), (8, villages)...","[(Initial, assessments), (as, many, as, 100, ...","{'CARDINAL': [(as, many, as, 100), (8), (500),..."


In [15]:
df.to_csv("c://temp//proc.csv")

PermissionError: [Errno 13] Permission denied: 'c:/temp/proc.csv'

# Main DF Built... now do more processing

In [16]:

def extract_entities(row):
    entities = row['entities']
    if entities is None:
        return ''
    en=[]
    for label in entities:
        for e in entities.get(label):
            ent = ' '.join([w.text for w in e]).strip()
            en.append([label,ent])
            
    return en
    #else:
    #    return ''


def extract_ncs(row):
    #data type, list of spans
    xs = row['noun_chunks']
    if xs is None:
        return ''
    en=[]

    for e in xs:
        ent = ' '.join([w.text for w in e]).strip()
        en.append(['NOUN_CHUNK',ent])
    return en


def extract_numeric_key_values(row):
    #data type, list of spans
    xs = row['num_others']
    if xs is None:
        return ''
    return_list=[]

    for e in xs:
        prefix = ''
        numeric = ''
        suffix = ''

        for token in e:
            if token.is_alpha == False:
                numeric = token.text
            elif numeric == '': #alpha but numeric not set yet, this is prefix
                prefix = prefix + ' ' + token.text
            else:
                suffix = suffix + ' ' + token.text
        
        return_list.append([prefix.strip(),numeric,suffix.strip()])   
        
    return return_list


def split_key_value_in_df(field,delim=','):

    s = pd.Series({'prefix' : field, 'left_label' : field, 'right_label' : field})
    
    if isinstance(field, list):
        fields = field
    elif isinstance(field, str):
        fields = field.split(delim)
    else:
        print(field)
    
     
    if len(fields) == 2:
        s = pd.Series({'prefix' : '', 'left_label' : fields[0], 'right_label' : fields[1]})
    elif len(fields) == 3:
        s = pd.Series({'prefix' : fields[0], 'left_label' : fields[1], 'right_label' : fields[2]})



    return s

def split_key_value_in_df_orig(field,left_label="d",right_label="f",delim=','):

    s = pd.Series({left_label : field, right_label : field})
    
    if isinstance(field, list):
     
        if len(field) == 2:
            s = pd.Series({left_label : field[0], right_label : field[1]})

    elif isinstance(field, str):
        fields = field.split(delim)
        if len(fields) == 2:
            s = pd.Series({left_label : fields[0], right_label : fields[1]})

    return s
        
    

In [17]:
df[['filename','sent_idx','string_sentence','noun_chunks']][df['noun_chunks'].isna() == False]

Unnamed: 0,filename,sent_idx,string_sentence,noun_chunks
0,D://projects//_external_files//surveyor//Herat...,0,Highlights On 7 October 2023 at around 11,"[(Highlights), (7, October)]"
1,D://projects//_external_files//surveyor//Herat...,1,"00 local time, a 6","[(00, local, time)]"
2,D://projects//_external_files//surveyor//Herat...,2,3 magnitude earthquake struck 40km west of He...,"[(3, , magnitude, earthquake), (Herat, , Cit..."
3,D://projects//_external_files//surveyor//Herat...,3,"Several aftershocks have occurred since, with...","[(Several, , aftershocks), (initial, , quake..."
4,D://projects//_external_files//surveyor//Herat...,4,Initial assessments indicate that as many as 1...,"[(Initial, assessments), (as, many, as, 100, ..."
5,D://projects//_external_files//surveyor//Herat...,5,"Additionally, 465 houses are reported to have...","[(465, , houses)]"
6,D://projects//_external_files//surveyor//Herat...,6,"In total, 4200 people (600 families) are asses...","[(4200, people), (date), (1400, IDPs)]"
7,D://projects//_external_files//surveyor//Herat...,7,Mahal Wadakah is understood to be the worst a...,"[(Mahal, Wadakah), (worst, affected, village)]"
8,D://projects//_external_files//surveyor//Herat...,8,"Additionally, an estimated 300 families (2100 ...","[(estimated, 300, families), (Herat, City), (t..."
9,D://projects//_external_files//surveyor//Herat...,9,Partners and local authorities anticipate the ...,"[(Partners), (local, authorities), (number), (..."


In [18]:
#use this as a repeatable-ish pattern for expanding on all the qualitative fields
df_entities = df[['filename','sent_idx','string_sentence','entities']][df['entities'].isna() == False].copy()
df_entities['tmp'] = df_entities.apply(extract_entities, axis=1)
df_entities = df_entities.drop(columns=['entities'])
df_entities = df_entities.explode('tmp')
df_entities['rec_type'] = 'ENTITY'
df_entities[['rec_prefix','rec_key','rec_value']] = df_entities.apply(lambda x: split_key_value_in_df(x.tmp), axis=1)

#now noun_chunks
df_nouns = df[['filename','sent_idx','string_sentence','noun_chunks']][df['noun_chunks'].isna() == False].copy()
df_nouns['tmp'] = df_nouns.apply(extract_ncs, axis=1)
#df_nouns = df_nouns.drop(columns=['noun_chunks'])
df_nouns = df_nouns.explode('tmp')
df_nouns['rec_type'] = 'NOUN_SEQUENCE'
df_nouns = df_nouns[df_nouns['tmp'].isna() == False].copy()
df_nouns[['rec_prefix','rec_key','rec_value']] = df_nouns.apply(lambda x: split_key_value_in_df(x.tmp), axis=1)


In [19]:
df_quants = df[['filename','sent_idx','string_sentence','num_others']][df['num_others'] != ''].copy()
df_quants['tmp'] = df_quants.apply(extract_numeric_key_values, axis=1)
df_quants = df_quants.drop(columns=['num_others'])
df_quants = df_quants.explode('tmp')
df_quants['rec_type'] = 'QUANTIFIED_NOUN'
df_quants[['rec_prefix','rec_key','rec_value']] = df_quants.apply(lambda x: split_key_value_in_df(x.tmp), axis=1)
df_quants


#num_others

Unnamed: 0,filename,sent_idx,string_sentence,tmp,rec_type,rec_prefix,rec_key,rec_value
0,D://projects//_external_files//surveyor//Herat...,0,Highlights On 7 October 2023 at around 11,"[, 7, October]",QUANTIFIED_NOUN,,7.0,October
1,D://projects//_external_files//surveyor//Herat...,1,"00 local time, a 6","[, 00, local time]",QUANTIFIED_NOUN,,0.0,local time
2,D://projects//_external_files//surveyor//Herat...,2,3 magnitude earthquake struck 40km west of He...,"[, , magnitude earthquake]",QUANTIFIED_NOUN,,,magnitude earthquake
4,D://projects//_external_files//surveyor//Herat...,4,Initial assessments indicate that as many as 1...,"[as many as, , people]",QUANTIFIED_NOUN,as many as,,people
4,D://projects//_external_files//surveyor//Herat...,4,Initial assessments indicate that as many as 1...,"[, 8, villages]",QUANTIFIED_NOUN,,8.0,villages
4,D://projects//_external_files//surveyor//Herat...,4,Initial assessments indicate that as many as 1...,"[further, 500, people]",QUANTIFIED_NOUN,further,500.0,people
4,D://projects//_external_files//surveyor//Herat...,4,Initial assessments indicate that as many as 1...,"[closer to, 320, people]",QUANTIFIED_NOUN,closer to,320.0,people
5,D://projects//_external_files//surveyor//Herat...,5,"Additionally, 465 houses are reported to have...","[, , houses]",QUANTIFIED_NOUN,,,houses
6,D://projects//_external_files//surveyor//Herat...,6,"In total, 4200 people (600 families) are asses...","[, 4200, people]",QUANTIFIED_NOUN,,4200.0,people
6,D://projects//_external_files//surveyor//Herat...,6,"In total, 4200 people (600 families) are asses...","[, 1400, IDPs]",QUANTIFIED_NOUN,,1400.0,IDPs


In [20]:
idx=5
print(list(set(df_entities['string_sentence'][df_entities['sent_idx'] == idx].tolist())))
df_entities[df_entities['sent_idx'] == idx]




['Additionally, 465  houses are reported to have been destroyed and  135 damaged']


Unnamed: 0,filename,sent_idx,string_sentence,tmp,rec_type,rec_prefix,rec_key,rec_value
5,D://projects//_external_files//surveyor//Herat...,5,"Additionally, 465 houses are reported to have...","[CARDINAL, 465]",ENTITY,,CARDINAL,465
5,D://projects//_external_files//surveyor//Herat...,5,"Additionally, 465 houses are reported to have...","[CARDINAL, 135]",ENTITY,,CARDINAL,135


In [21]:
df_nouns[df_nouns['sent_idx'] == idx]

Unnamed: 0,filename,sent_idx,string_sentence,noun_chunks,tmp,rec_type,rec_prefix,rec_key,rec_value
5,D://projects//_external_files//surveyor//Herat...,5,"Additionally, 465 houses are reported to have...","[(465, , houses)]","[NOUN_CHUNK, 465 houses]",NOUN_SEQUENCE,,NOUN_CHUNK,465 houses


In [22]:
df_quants[df_quants['sent_idx'] == idx]

Unnamed: 0,filename,sent_idx,string_sentence,tmp,rec_type,rec_prefix,rec_key,rec_value
5,D://projects//_external_files//surveyor//Herat...,5,"Additionally, 465 houses are reported to have...","[, , houses]",QUANTIFIED_NOUN,,,houses


# End

In [305]:
#df_org_details = df[['filename','sent_idx','string_sentence','spacy_doc','entities']].copy()
#df_org_details['organization'] = df['entities'].apply(extract_orgs)
#df_org_details = df_org_details.drop(columns=['entities'])
#df_org_details = df_org_details.explode('organization')

#unique_orgs = list(set(df_org_details['organization'].tolist()))

#df_org_details['spacy_doc'][df_org_details['organization'] == 'UNHCR'].tolist()


In [306]:
def extract_kwic(doc, keyword):
    triple = list(extract.keyword_in_context(doc, keyword, window_width = 80,))

    prefix = ''
    keyword = ''
    suffix = ''
    #print(triple)
    if len(triple) == 1:
        prefix = triple[0][0]
        keyword = triple[0][1]
        suffix = triple[0][2]
    return pd.Series({'prefix':prefix,'keyword':keyword,'suffix':suffix})
    

keyword = 'UNHCR'
df_org_details[['prefix','keyword','suffix']] = df_org_details['spacy_doc'].apply(lambda x: extract_kwic(x, keyword))

In [307]:
df_org_details[df_org_details['keyword'] != '']

Unnamed: 0,filename,sent_idx,string_sentence,spacy_doc,entities,prefix,keyword,suffix
32,D://projects//_external_files//surveyor//Herat...,32,"ESNFI Cluster partners IOM, UNHCR, and UNICEF ...","(ESNFI, Cluster, partners, IOM, ,, UNHCR, ,, a...","{'ORG': [(ESNFI, Cluster), (IOM), (UNHCR), (UN...","ESNFI Cluster partners IOM,",UNHCR,", and UNICEF have delivered emergency relief ..."
36,D://projects//_external_files//surveyor//Herat...,36,"In coordination with the Shelter Cluster, UNHC...","(In, coordination, with, the, Shelter, Cluster...","{'ORG': [(Shelter, Cluster), (UNHCR), (Health,...","In coordination with the Shelter Cluster,",UNHCR,contributed 100 refugee housing units and 100...
38,D://projects//_external_files//surveyor//Herat...,38,UNHCR and its partners are expected to start t...,"(UNHCR, and, its, partners, are, expected, to,...","{'ORG': [(UNHCR)], 'DATE': [(tomorrow)], 'FAC'...",,UNHCR,and its partners are expected to start the as...
87,D://projects//_external_files//surveyor//Herat...,87,UNHCR has distributed solar lamps and hygiene ...,"(UNHCR, has, distributed, solar, lamps, and, h...",{'ORG': [(UNHCR)]},,UNHCR,"has distributed solar lamps and hygiene kits,..."


In [308]:
kwic = list(extract.keyword_in_context(full_doc, 'UNHCR', window_width = 80,))
kwic

[('nvolved and supporting community level assessments. ESNFI Cluster partners IOM, ',
  'UNHCR',
  ', and UNICEF  have delivered emergency relief assistance, including emergency sh'),
 ('er clothing kits, and solar modules.  In coordination with the Shelter Cluster, ',
  'UNHCR',
  ' contributed 100 refugee housing units and 100 blankets to  the Health Cluster, '),
 ('nse. The units are to be set up at the health centres to  accommodate patients. ',
  'UNHCR',
  ' and its partners are expected to start the assembly of the units tomorrow, at  '),
 (' PFA, psychosocial counselling, psychiatry  services, and medicine to patients. ',
  'UNHCR',
  ' has distributed solar lamps and hygiene kits, among other critical supplies, an')]