In [1]:
import spacy
from spacy import displacy

import re
import pandas as pd
from textacy import extract

from collections import defaultdict 
from fuzzywuzzy import fuzz
import time
import uuid


In [2]:
pd.set_option('display.max_columns', None)
print(time.localtime())

time.struct_time(tm_year=2023, tm_mon=12, tm_mday=2, tm_hour=5, tm_min=55, tm_sec=30, tm_wday=5, tm_yday=336, tm_isdst=0)


In [3]:
#files
pcode_file = "D://projects//_external_files//cod_files//combined_locations//locations.csv"
situation_reports = "D://projects//_external_files//surveyor//rw_siturep_preprocessed//reliefweb_situation_reports.xlsx"
situation_reports = "D://projects//_external_files//surveyor//rw_disaster_preprocessed//disaster_summaries_2f96ea6d16c942018c0ac2469aab62a3.xlsx"

#situation_reports = "D:\projects\_external_files\reliefweb_disaster_reports"

## Load geolocation_services


In [4]:
df_location = pd.read_csv(pcode_file)

def get_pcode_from_location(loc, country_prefix='XX', lang_code='all'):

    if country_prefix != 'XX': #if the country prefix is set, limit search to that
        df_loc = df_location[df_location['pcode_prefix'] == country_prefix]
    else:
        df_loc = df_location

    if lang_code != 'all': #secondary filter - especially important to remove dupes with diff langs share the same script
        df_loc = df_loc[df_loc['lang_code'] == lang_code]
        
    matches = df_loc['pcode'][df_loc['location_name'].str.lower() == loc.lower()].tolist()

    #if the match fails, try again on the normalized name
    if len(matches) == 0:
        #remove common variations in names that can cause misses
        n_loc = re.sub(r'[^a-zA-Z]', '', loc)

        #this will cause problems for non-English.. so if then len is 0, exit
        if len(n_loc) == 0:
            return []
            
        matches = df_loc['pcode'][df_loc['location_normalized'].str.lower() == n_loc.lower()].tolist()
        

    #now check results
    if len(matches) > 1:
        #print(f"more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) {matches}")
        #print(f"returning the lowest granularity match. {min(matches, key=len)}")
        #print("if the pcodes are all the same granularity.... you get the first element.")
        return min(matches, key=len)
            
        return matches
    elif len(matches) == 1:
        return matches

    else:
        #couldn't find a match, do a fuzzy search
        compare_list = list(set(df_loc['location_name'].tolist()))
        possible_matches=[]
        for i in compare_list:
            if fuzz.ratio(loc,i) > 70:
                possible_matches.append(i)
                print (f"No exact match to '{loc}'. see if these alternative spellings are correct: {possible_matches}")

    
    return []

assert get_pcode_from_location('istanbul')[0] == 'TUR034'

def get_adm_lvl_from_pcode(pcode):
    return list(set(df_location['adm_lvl'][df_location['pcode'] == pcode].tolist()))
    
def get_name_in_lang(pcode, lang='en'):
    return list(set(df_location['location_name'][(df_location['pcode'] == pcode) & (df_location['lang_code'] == lang)].tolist()))

def get_descendents_of(pcode, lang='en', include_self=True):
    if include_self==True:
        return df_location[df_location['pcode'].str.contains(pcode) & (df_location['lang_code'] == lang)]
    else:
        return df_location[df_location['pcode'].str.contains(pcode) & (df_location['lang_code'] == lang)\
        & (df_location['pcode'] != pcode)]

def get_admin_chain(pcode, lang='en'):
    split_pcode = df_location['split_pcode'][df_location['pcode'] == pcode].tolist()[0]
    levels = split_pcode.split(".")
    pc =''
    admin_chain = []
    #rebuild the pcode one level at a time
    for i in levels:
        pc = pc + i
        admin_chain.append(df_location['location_name'][(df_location['pcode'] == pc) & (df_location['lang_code'] == lang)].tolist()[0])

    return admin_chain

def get_all_locations(lang_code='all'):

    #return all unique location names
    if lang_code == 'all':
        return list(set(df_location['location_name'].to_list()))
    else:
        return list(set(df_location['location_name'][df_location['lang_code'] == lang_code].to_list()))




## Load Preprocessing Routines

In [5]:
def string_preprocess(text):
    
    def convert_spelled_nums_to_digit(token):
        clean_token = re.sub(r'[^a-zA-Z]', '', token).lower()
        
        mappings = {
            'one' : 1,'two' : 2,'three' : 3,'four' : 4,'five' : 5,'six' : 6,'seven' : 7,'eight' : 8,'nine' : 9, 'ten' : 10
            ,'eleven' : 11, 'twelve' : 12, 'thirteen':13, 'fourteen':14, 'fifteen':15, 'sixteen':16, 'seventeen':17
            ,'eighteen':18, 'nineteen':19, 'twenty':20, 'dozen':12
        }
    
        if mappings.get(clean_token) is not None:
            return mappings[clean_token]
        else:
            return token
        
    text = text.replace("\n"," ")
    text = text.replace("-","_") #replace so tokenization doesn't separate

    #turn 'four' into 4
    text = ' '.join([str(convert_spelled_nums_to_digit(t)) for t in text.split(" ")])


    # remove content in parentheses
    #processed_string = re.sub(r'\([^)]*\)', '', input_string)

    #remove all non alpha numeric and punctuation
    pattern = r'[^a-zA-Z0-9\s\,\.\?\!\-\(\)]'
    text = re.sub(pattern, '', text)

    pattern = r'(\d+)\s+million'
    text = re.sub(r'(\d+)\s+million', r'\1,000,000', text)

    
    #remove commas that serve as thousands separators
    #Hack... fix this so I don't have to run it 3x
    text = re.sub(r'(\d+),(\d+)', r'\1\2', text)
    text = re.sub(r'(\d+),(\d+)', r'\1\2', text)
    text = re.sub(r'(\d+),(\d+)', r'\1\2', text)
    text = text.replace("\s+","\s")
    return text

def string_remove_parenthetical_content(text):
    # Use regular expression to remove content inside parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    return text
    




## Load NLP routines

In [6]:
nlp = spacy.load("en_core_web_md")

# Create patterns and add to the entity ruler to better find locations

all_locs = get_all_locations(lang_code='en')
gpes = []

STOP_LOCS = ['of','can']
all_locs = [e for e in all_locs if e.lower() not in STOP_LOCS]

# create pattern rules for locations based on the COD files
for l in all_locs:
    token_sequence=[]
    for token in l.split('\s+'):
        token_sequence.append({"LOWER":token.lower()})
    x = {'label':'GPE', 'pattern': token_sequence, 'id':get_pcode_from_location(l, lang_code='en')[0]}
    gpes.append(x)
    #print(get_pcode_from_location(l, lang_code='en'))

ruler = nlp.add_pipe('entity_ruler', before='ner')
ruler.add_patterns(gpes)

In [7]:
#keyword_indicators
indicators = {
    'i_people' : ['people','person','child','man','woman','civilian','colleague','fatality']
    ,'i_killed' : ['dead','fatal','die','kill','deceased','fatality','fatality'] #think about how to incorporate 2 co-existing terms "648 people who lost their lives"
    ,'i_injured' : ['injure','wound','wounded']
    ,'i_damage' : ['damage','destroy','collapse']
    ,'i_health_infrastructure' : ['hospital','surgery']
    ,'i_education_infrastructure' : ['school','university']
    ,'i_cash_xfer' : ['xx']
    ,'i_wash' : ['sanitation','water','sewer','drain','drainage']
    ,'i_shelter' : ['shelter','tent','camp','blanket']
    ,'i_food' : ['food','cook','stove','feed','feed','nutrient','meal']
    ,'i_health' : ['health','medical','medicine']
    ,'i_gender_vuln' : ['dignity','gender','pregnant','lactate','lactating']
    ,'i_protection' : ['trauma','mental']
    ,'i_response_capacity' : ['personnel']
    ,'i_other_infrastructure' : ['communicate','radio','internet','telecommunication','electric','line']
    ,'i_money' : ['grant','loan','finance','appeal','chf','fund']
    ,'i_other' : ['biometric']
    ,'i_problem' : ['challenge']
    ,'i_demand_side' : ['need','demand','gap','priority', 'receive'] # note receive implies both supply and demand
    ,'i_supply_side' : ['response','contribute','provide','source','address','deploy','receive'] # note receive implies both supply and demand

    ,'i_assessments' : ['assess','assessment']
}

In [24]:
def get_future_tense_verb(doc):
    def is_future_tense(token):
        #Check if a token is indicative of future tense.
        return (
            token.tag_ == "MD" and token.text.lower() == "will"
            or (token.dep_ == "aux" and token.head.lemma_ == "will")
        )

    for t in doc:
        if is_future_tense(t):
            return f"{t.text} {t.head}"

def find_and_add_indicator(df, indicators):
    ind_counter = []
    for ind in indicators:
  
        df[ind] = df['lower_lemmas'].apply(lambda x: 1 if len([w for w in x if w in indicators[ind]])>0 else 0)
        ind_counter.append(ind)
        #print(ind_counter)
    df['i_count'] = df[ind_counter].sum(axis=1)

    return df


def declare_primary_record_type(row):

    if row['i_count'] == 0:
        return 'background'
    elif row['i_supply_side']:
        return 'response_details'
    elif row['i_demand_side']:
        return 'demand_side'
    elif row[['i_damage','i_health_infrastructure','i_education_infrastructure']].sum() > 0:
        return 'damage_to_homes_and_infrastructure'
    else:
        return 'other'

def obtain_killed_numeric_value(doc):

    key_values = []
    just_count = []
    
    def check_flags(lst):
        for l in lst:
            if l == -1:
                return False
        return True

    def reset_indicators():
        return -1, -1, -1

    noun, attribute, count = reset_indicators()
            
    for t in doc:
        if (str(t).isdigit()) & (t.ent_type_ not in ['DATE','TIME']):
            count = t
            print(f"  c {t}")
        if (t.lemma_ in indicators['i_people']) or (t.ent_type_ == 'NORP'):
            noun = t
            print(f"   peop {t}")
        if t.lemma_ in indicators['i_killed']:
            attribute = t
            print(f"    kil {t}")
        if check_flags([noun,attribute,count]):

            noun_att_cnt = (noun,attribute,count)
            key_values.append(noun_att_cnt)
            just_count.append(count)

            noun, attribute, count = reset_indicators()

    #if more than 1 figure is returned, typically those will be
    #contextualizing numbers, just return the first
    return just_count[0]


def obtain_injured_numeric_value(doc):

    key_values = []
    just_count = []
    
    def check_flags(lst):
        for l in lst:
            if l == -1:
                return False
        return True

    def reset_indicators():
        return -1, -1, -1

    noun, attribute, count = reset_indicators()
            
    for t in doc:
        if (str(t).isdigit()) & (t.ent_type_ not in ['DATE','TIME']):
            count = t
            print(f"  c {t}")
        if (t.lemma_ in indicators['i_people']) or (t.ent_type_ == 'NORP'):
            noun = t
            print(f"   peop {t}")
        if t.lemma_ in indicators['i_injured']:
            attribute = t
            print(f"    kil {t}")
        if check_flags([noun,attribute,count]):

            noun_att_cnt = (noun,attribute,count)
            key_values.append(noun_att_cnt)
            just_count.append(count)

            noun, attribute, count = reset_indicators()

    #if more than 1 figure is returned, typically those will be
    #contextualizing numbers, just return the first
    return just_count[0]


def OLD_obtain_killed_numeric_value(doc):

    key_values = []
    just_count = []
    
    def check_flags(lst):
        for l in lst:
            if l == -1:
                return False
        return True

    #doc = doc.tolist()[0]
    attribute = -1
    noun = -1
    count = -1

    for t in doc:
        if (str(t).isdigit()) & (t.ent_type_ not in ['DATE','TIME']):
            count = t
        if (t.lemma_ in indicators['i_people']) or (t.ent_type_ == 'NORP'):
            noun = t
        if t.lemma_ in indicators['i_killed']:
            attribute = t

        if check_flags([noun,attribute,count]):

            noun_att_cnt = (noun,attribute,count)
            key_values.append(noun_att_cnt)
            just_count.append(count)

            noun = -1
            attribute = -1
            count = -1

    #changing to return only the count
    return just_count
    #return key_values
            
    
def obtain_injured_numeric_value(doc):

    key_values = []
    just_count = []
    
    def check_flags(lst):
        for l in lst:
            if l == -1:
                return False
        return True

    #doc = doc.tolist()[0]
    attribute = -1
    noun = -1
    count = -1

    for t in doc:
        if (str(t).isdigit()) & (t.ent_type_ not in ['DATE','TIME']):
            count = t
        if (t.lemma_ in indicators['i_people']) or (t.ent_type_ == 'NORP'):
            noun = t
        if t.lemma_ in indicators['i_injured']:
            attribute = t

        if check_flags([noun,attribute,count]):

            noun_att_cnt = (noun,attribute,count)
            key_values.append(noun_att_cnt)
            just_count.append(count)

            noun = -1
            attribute = -1
            count = -1

    #changing to return only the count
    return just_count
    #return key_values

def obtain_counted_noun_chunks(doc):
    counted_things = []
    for x in list(extract.noun_chunks(doc)):
        for token in x:
            if str(token).isdigit():
                counted_things.append(x)
                continue
    if len(counted_things) > 0:
        return counted_things
    else:
        return ''



def obtain_all_entities(doc):

    STOP_ENTS = ['WASH','PSS','GTC','PFA','NFI','IYCF']
    STOP_ENT_TYPE = ['CARDINAL','ORDINAL']
    #stop_ents = STOP_ENTS
    ents = list(extract.entities(doc))
    if len(ents) < 1:
        return None
    entities = defaultdict(list) 
    for e in ents:
        #if e.text not in stop_ents:
        if e.label_ not in STOP_ENT_TYPE:
            entities[e.label_].append(e)

    return entities   

def extract_entities(row):
    entities = row['entities']
    if entities is None:
        return ''
    en=[]
    for label in entities:
        for e in entities.get(label):
            ent = ' '.join([w.text for w in e]).strip()
            en.append([label,ent])
            
    return en


def extract_ncs(row):
    
    #data type, list of spans
    xs = row['noun_chunks']
    if xs is None:
        return ''
    en=[]

    for e in xs:
        ent = ' '.join([w.text for w in e]).strip()
        en.append(['NOUN_CHUNK',ent])
    return en


def extract_numeric_key_values(row):
    #data type, list of spans
    xs = row['num_others']
    if xs is None:
        return ''
    return_list=[]

    for e in xs:
        prefix = ''
        numeric = ''
        suffix = ''

        for token in e:
            if token.is_alpha == False:
                numeric = token.text
            elif numeric == '': #alpha but numeric not set yet, this is prefix
                prefix = prefix + ' ' + token.text
            else:
                suffix = suffix + ' ' + token.text
        
        return_list.append([prefix.strip(),numeric,suffix.strip()])   
        
    return return_list


def split_key_value_in_df(field,delim=','):

    s = pd.Series({'prefix' : field, 'left_label' : field, 'right_label' : field})
    
    if isinstance(field, list):
        fields = field
    elif isinstance(field, str):
        fields = field.split(delim)
    else:
        print(field)
    
     
    if len(fields) == 2:
        s = pd.Series({'prefix' : '', 'left_label' : fields[0], 'right_label' : fields[1]})
    elif len(fields) == 3:
        s = pd.Series({'prefix' : fields[0], 'left_label' : fields[1], 'right_label' : fields[2]})



    return s

def split_key_value_in_df_orig(field,left_label="d",right_label="f",delim=','):

    s = pd.Series({left_label : field, right_label : field})
    
    if isinstance(field, list):
     
        if len(field) == 2:
            s = pd.Series({left_label : field[0], right_label : field[1]})

    elif isinstance(field, str):
        fields = field.split(delim)
        if len(fields) == 2:
            s = pd.Series({left_label : fields[0], right_label : fields[1]})

    return s

## Now build base DF

In [9]:
df = pd.read_excel(situation_reports)

## Naive sentence split below is a bad idea....
df['string_sentence'] = df['text'].astype(str).apply(lambda x: string_preprocess(x).split('.'))
df = df.explode('string_sentence')
df['string_sentence'] = df['string_sentence'].apply(lambda x: x.strip() + '.')
df = df.reset_index(drop=True)
df

Unnamed: 0,record_type,status,source_url,glide_id,source_level_country,source_title,source_desc,source_original_text,reference_url,text,authoring_org,reported_date,string_sentence
0,disaster summary,ongoing,,EP-2013-000175-COD,DR Congo,DR Congo: Cholera and Measles Outbreaks - Jan ...,[],While the last major **cholera** outbreak in D...,https://reliefweb.int/disaster/ep-2013-000175-cod,While the last major **cholera** outbreak in D...,reliefweb,2023-10-16T12:15:16+00:00,While the last major cholera outbreak in DR Co...
1,disaster summary,ongoing,,EP-2013-000175-COD,DR Congo,DR Congo: Cholera and Measles Outbreaks - Jan ...,[],While the last major **cholera** outbreak in D...,https://reliefweb.int/disaster/ep-2013-000175-cod,While the last major **cholera** outbreak in D...,reliefweb,2023-10-16T12:15:16+00:00,"intdisasterep2011000076cod), cholera is endemi..."
2,disaster summary,ongoing,,EP-2013-000175-COD,DR Congo,DR Congo: Cholera and Measles Outbreaks - Jan ...,[],While the last major **cholera** outbreak in D...,https://reliefweb.int/disaster/ep-2013-000175-cod,While the last major **cholera** outbreak in D...,reliefweb,2023-10-16T12:15:16+00:00,"Over the course of 2013, ongoing violence and ..."
3,disaster summary,ongoing,,EP-2013-000175-COD,DR Congo,DR Congo: Cholera and Measles Outbreaks - Jan ...,[],While the last major **cholera** outbreak in D...,https://reliefweb.int/disaster/ep-2013-000175-cod,While the last major **cholera** outbreak in D...,reliefweb,2023-10-16T12:15:16+00:00,"Between January and September, more than 21000..."
4,disaster summary,ongoing,,EP-2013-000175-COD,DR Congo,DR Congo: Cholera and Measles Outbreaks - Jan ...,[],While the last major **cholera** outbreak in D...,https://reliefweb.int/disaster/ep-2013-000175-cod,While the last major **cholera** outbreak in D...,reliefweb,2023-10-16T12:15:16+00:00,"During the same time frame, more than 74299 ca..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15501,disaster summary,ongoing,,FL-2023-000236-DOM,Dominican Rep.,Dominican Republic: Floods and Landslides - No...,[],"As of 21 November, severe rains and subsequent...",https://reliefweb.int/disaster/fl-2023-000236-dom,"As of 21 November, severe rains and subsequent...",reliefweb,2023-11-28T19:32:06+00:00,.
15502,disaster summary,ongoing,,FL-2023-000236-DOM,Dominican Rep.,Dominican Republic: Floods and Landslides - No...,[],"According to [information from 22 November], 7...",https://reliefweb.int/disaster/fl-2023-000236-dom,"According to [information from 22 November], 7...",reliefweb,2023-11-28T19:32:06+00:00,"According to information from 22 November, 741..."
15503,disaster summary,ongoing,,FL-2023-000236-DOM,Dominican Rep.,Dominican Republic: Floods and Landslides - No...,[],"According to [information from 22 November], 7...",https://reliefweb.int/disaster/fl-2023-000236-dom,"According to [information from 22 November], 7...",reliefweb,2023-11-28T19:32:06+00:00,At least 37060 people were displaced to safe a...
15504,disaster summary,ongoing,,FL-2023-000236-DOM,Dominican Rep.,Dominican Republic: Floods and Landslides - No...,[],"According to [information from 22 November], 7...",https://reliefweb.int/disaster/fl-2023-000236-dom,"According to [information from 22 November], 7...",reliefweb,2023-11-28T19:32:06+00:00,"A total of 4 shelters were opened, housing 158..."


In [10]:
idx = df.sample(1).index[0]
print(df.loc[idx]['source_original_text']) #.tolist()[0]
print()
print(df.loc[idx]['string_sentence']) #.tolist()[0]


Over the last 24 hours, 49 deaths were reported, the majority in Sindh, taking the death toll to 777 since mid-June, and 59,665 houses were damaged, taking the total damaged houses to 176,436, mostly in Sindh and Balochistan. According to National Disaster Management Authority (NDMA), as of 21 August, around 1,868,098 people were rescued, and 317,896 individuals are living in relief camps across the country. In Sindh, as of 20 August, 1,356,863 people are affected, with 309,944 households affected and around 495,381 are displaced due to floods in the province. ([ECHO, 22 Aug 2022](https://reliefweb.int/node/3878589/))

Over the last 24 hours, 49 deaths were reported, the majority in Sindh, taking the death toll to 777 since midJune, and 59665 houses were damaged, taking the total damaged houses to 176436, mostly in Sindh and Balochistan.


In [11]:
#b build out initial dataframe
df['spacy_doc'] = df['string_sentence'].apply(lambda x: nlp(x))
df['lower_lemmas'] = df['spacy_doc'].apply(lambda x: [w.lemma_.lower() for w in x])

df['string_sent_wo_parens'] = df['string_sentence'].apply(string_remove_parenthetical_content)
df['spacy_wo_parens'] = df['string_sent_wo_parens'].apply(lambda x: nlp(x))
df['wo_parens_lower_lemmas'] = df['spacy_wo_parens'].apply(lambda x: [w.lemma_.lower() for w in x])
df['locations'] = df['spacy_doc'].apply(lambda doc: [e.text for e in doc.ents if e.label_ == 'GPE'])
df['dates'] = df['spacy_doc'].apply(lambda doc: [e.text for e in doc.ents if e.label_ == 'DATE'])
df['svot'] = df['spacy_wo_parens'].apply(lambda doc: list(extract.subject_verb_object_triples(doc)))
df['future_verbs'] = df['spacy_doc'].apply(get_future_tense_verb)

    


In [12]:
df = find_and_add_indicator(df, indicators)
df['record_type'] = df.apply(declare_primary_record_type, axis=1)

In [13]:
df['num_killed'] = df['spacy_wo_parens'][df['i_killed'] == 1].apply(obtain_killed_numeric_value)
df['num_injured'] = df['spacy_wo_parens'][df['i_injured'] == 1].apply(obtain_injured_numeric_value)
#df['num_killed'] = df['wo_parens_lower_lemmas'][df['i_killed'] == 1].apply(obtain_killed_numeric_value)
#df['num_injured'] = df['wo_parens_lower_lemmas'][df['i_injured'] == 1].apply(obtain_injured_numeric_value)

df['num_others'] = df['spacy_wo_parens'].apply(obtain_counted_noun_chunks)

stop_noun_chunks = ['which','these','that','it','this']
df['noun_chunks'] = df['spacy_wo_parens'].apply(lambda doc: [i for i in list(extract.noun_chunks(doc)) if i.text.lower() not in stop_noun_chunks])
df['entities'] = df['spacy_wo_parens'].apply(obtain_all_entities)

In [14]:
def generate_uuid(x):
    foo = uuid.uuid4().hex
    return foo
    
df['sent_idx'] = df['string_sentence'].apply(generate_uuid)

In [15]:
#df.to_csv("c://temp//foo.csv")

In [16]:
#use this as a repeatable-ish pattern for expanding on all the qualitative fields
df_entities = df[['source_url','sent_idx','string_sentence','entities']][df['entities'].isna() == False].copy()
df_entities['tmp'] = df_entities.apply(extract_entities, axis=1)
df_entities = df_entities.drop(columns=['entities'])
df_entities = df_entities.explode('tmp')
df_entities['rec_type'] = 'ENTITY'
df_entities = df_entities[df_entities['tmp'].isna() == False].copy() # added as a final cleanup to resolve stop ents
df_entities[['rec_prefix','rec_key','rec_value']] = df_entities.apply(lambda x: split_key_value_in_df(x.tmp), axis=1)

#now noun_chunks
df_nouns = df[['source_url','sent_idx','string_sentence','noun_chunks']][df['noun_chunks'].isna() == False].copy()
df_nouns['tmp'] = df_nouns.apply(extract_ncs, axis=1)
df_nouns = df_nouns.drop(columns=['noun_chunks'])
df_nouns = df_nouns.explode('tmp')
df_nouns['rec_type'] = 'NOUN_SEQUENCE'
df_nouns = df_nouns[df_nouns['tmp'].isna() == False].copy()
df_nouns[['rec_prefix','rec_key','rec_value']] = df_nouns.apply(lambda x: split_key_value_in_df(x.tmp), axis=1)

#quantitative values
df_quants = df[['source_url','sent_idx','string_sentence','num_others']][df['num_others'] != ''].copy()
df_quants['tmp'] = df_quants.apply(extract_numeric_key_values, axis=1)
df_quants = df_quants.drop(columns=['num_others'])
df_quants = df_quants.explode('tmp')
df_quants['rec_type'] = 'QUANTIFIED_NOUN'
df_quants[['rec_prefix','rec_key','rec_value']] = df_quants.apply(lambda x: split_key_value_in_df(x.tmp), axis=1)


In [17]:
df_attributes = pd.concat([df_quants, df_nouns,df_entities])
df_attributes = df_attributes.drop(columns=['tmp'])
df_attributes


Unnamed: 0,source_url,sent_idx,string_sentence,rec_type,rec_prefix,rec_key,rec_value
3,,ace705f240ef41a6a0eeef639e889796,"Between January and September, more than 21000...",QUANTIFIED_NOUN,more than,21000,cases
3,,ace705f240ef41a6a0eeef639e889796,"Between January and September, more than 21000...",QUANTIFIED_NOUN,,376,deaths
4,,9359a8da82764f6a85432c89334211a8,"During the same time frame, more than 74299 ca...",QUANTIFIED_NOUN,more than,74299,cases
4,,9359a8da82764f6a85432c89334211a8,"During the same time frame, more than 74299 ca...",QUANTIFIED_NOUN,,1160,deaths
6,,d128deec6de54662bc659ddc45385314,"In 2013, Katanga was the mostaffected province...",QUANTIFIED_NOUN,,13726,cholera cases
...,...,...,...,...,...,...,...
15499,,abb40de965af414ba9d8db6f0360952a,The UN is also helping the Ministry of Health ...,ENTITY,,ORG,UN
15499,,abb40de965af414ba9d8db6f0360952a,The UN is also helping the Ministry of Health ...,ENTITY,,ORG,Ministry of Health
15499,,abb40de965af414ba9d8db6f0360952a,The UN is also helping the Ministry of Health ...,ENTITY,,NORP,Haitian
15500,,91f045ac8562476298374735a7d7ab99,There are concerns that damages to water manag...,ENTITY,,DATE,2023


In [18]:
#Join base and attribute df
df_joined = df.merge(df_attributes[['sent_idx','rec_type','rec_prefix','rec_key','rec_value']], left_on='sent_idx', right_on='sent_idx', how='left').copy()
df_joined.explode('locations')
df_joined['locations'] = df_joined['locations'].apply(lambda x: x[0] if len(x)==1 else '')
df_joined.explode('dates')
df_joined['dates'] = df_joined['dates'].apply(lambda x: x[0] if len(x)==1 else '')


df_joined.explode('svot')
df_joined.shape

(101995, 54)

In [19]:
sidx = df_joined['sent_idx'].sample().tolist()[0]
print(df_joined['string_sentence'][df_joined['sent_idx'] == sidx].tolist()[0])
df_joined[['glide_id','source_level_country','string_sentence','locations','sent_idx','rec_type','rec_prefix','rec_key','rec_value']][df_joined['sent_idx'] == sidx]

On 24 April, flash floods killed 4 persons and damaged 53 houses (fully) and 212 houses (partially) in Dire Dawa on 25 April, river overflow damaged social infrastructure and affected livestock in Jinka town, SNNP and on 2526 April, flash floods affected 34507 households and displaced 15195 households in Erer, Sitti, Nogob and Korahe zones, Somali region.


Unnamed: 0,glide_id,source_level_country,string_sentence,locations,sent_idx,rec_type,rec_prefix,rec_key,rec_value
15923,FL-2020-000126-ETH,Ethiopia,"On 24 April, flash floods killed 4 persons and...",,9fd4c79f6837427194db05edc2cbeddb,QUANTIFIED_NOUN,,24,April
15924,FL-2020-000126-ETH,Ethiopia,"On 24 April, flash floods killed 4 persons and...",,9fd4c79f6837427194db05edc2cbeddb,QUANTIFIED_NOUN,,4,persons
15925,FL-2020-000126-ETH,Ethiopia,"On 24 April, flash floods killed 4 persons and...",,9fd4c79f6837427194db05edc2cbeddb,QUANTIFIED_NOUN,,53,houses
15926,FL-2020-000126-ETH,Ethiopia,"On 24 April, flash floods killed 4 persons and...",,9fd4c79f6837427194db05edc2cbeddb,QUANTIFIED_NOUN,,212,houses
15927,FL-2020-000126-ETH,Ethiopia,"On 24 April, flash floods killed 4 persons and...",,9fd4c79f6837427194db05edc2cbeddb,QUANTIFIED_NOUN,,25,April
15928,FL-2020-000126-ETH,Ethiopia,"On 24 April, flash floods killed 4 persons and...",,9fd4c79f6837427194db05edc2cbeddb,QUANTIFIED_NOUN,,2526,April
15929,FL-2020-000126-ETH,Ethiopia,"On 24 April, flash floods killed 4 persons and...",,9fd4c79f6837427194db05edc2cbeddb,QUANTIFIED_NOUN,,34507,households
15930,FL-2020-000126-ETH,Ethiopia,"On 24 April, flash floods killed 4 persons and...",,9fd4c79f6837427194db05edc2cbeddb,QUANTIFIED_NOUN,displaced,15195,households
15931,FL-2020-000126-ETH,Ethiopia,"On 24 April, flash floods killed 4 persons and...",,9fd4c79f6837427194db05edc2cbeddb,NOUN_SEQUENCE,,NOUN_CHUNK,24 April
15932,FL-2020-000126-ETH,Ethiopia,"On 24 April, flash floods killed 4 persons and...",,9fd4c79f6837427194db05edc2cbeddb,NOUN_SEQUENCE,,NOUN_CHUNK,flash floods


In [20]:
df_foo = df_joined[['sent_idx','rec_type','rec_value']][df_joined['sent_idx'] == '17c67b978f934355b9e2ebacb1b76ba0']
df_foo

Unnamed: 0,sent_idx,rec_type,rec_value


In [21]:
def filter_duplicates(group):
    noun_sequence_values = group[group['rec_type'] == 'NOUN_SEQUENCE']['rec_value']
    entity_values = group[group['rec_type'] == 'ENTITY']['rec_value']
    mask = ~((group['rec_type'] == 'NOUN_SEQUENCE') & (noun_sequence_values.isin(entity_values)))
    return group[mask]

# Apply the filtering operation grouped by 'sent_idx'
df_joined = df_joined.groupby('sent_idx', group_keys=False).apply(filter_duplicates)

In [22]:
output_file = f"D://projects//_external_files//surveyor//files_for_dashboarding//disaster_reports_{generate_uuid(1)}.xlsx"
df_joined.to_excel(output_file)

In [23]:
print(time.localtime())

time.struct_time(tm_year=2023, tm_mon=12, tm_mday=2, tm_hour=6, tm_min=1, tm_sec=17, tm_wday=5, tm_yday=336, tm_isdst=0)
