In [193]:
import spacy
from spacy import displacy

import re
import pandas as pd
from textacy import extract

from collections import defaultdict 
from fuzzywuzzy import fuzz
import time
import uuid

import os
import json

from datetime import datetime

In [194]:
# Render every DataFrame column instead of truncating wide frames.
pd.options.display.max_columns = None
# Print the local wall-clock time at which this cell ran.
print(time.localtime())

# Small English spaCy pipeline used by the parsing cells in this notebook.
nlp = spacy.load("en_core_web_sm")

time.struct_time(tm_year=2023, tm_mon=12, tm_mday=10, tm_hour=8, tm_min=11, tm_sec=8, tm_wday=6, tm_yday=344, tm_isdst=0)


In [195]:
# NOTE(review): machine-specific absolute path -- consider a configurable data directory.
disaster_summary_preprocessed_file = "D://projects//_external_files//surveyor//rw_disaster_preprocessed//disaster_summaries_preprocessed_69079bdfb35643eea3b17fe452c63d29.xlsx"

# Read the workbook and replace missing cells with '' in one step.
df = pd.read_excel(disaster_summary_preprocessed_file).fillna('')

In [196]:
# Quick check: parse a two-sentence string and print each sentence span's text.
doc = nlp("This is a sentence. This is another sentence.")

# NOTE(review): `sentences` is initialised here but never appended to in this cell.
sentences = []
for sent in doc.sents:
    print(sent.text)

This is a sentence.
This is another sentence.


In [197]:
def expand_to_sentence_level(doc):
    """Return one new ``Doc`` per sentence of ``doc``.

    Each returned ``Doc`` is built from the sentence's token texts and shares
    the parent document's vocab.

    NOTE(review): this definition is shadowed by the second
    ``expand_to_sentence_level`` defined right after it in the same cell, so
    it is not the version that runs when the cell is executed top to bottom.
    """
    return [
        spacy.tokens.Doc(doc.vocab, words=[tok.text for tok in sent])
        for sent in doc.sents
    ]


def expand_to_sentence_level(doc, min_chars=20):
    """Split a parsed paragraph into independently parsed sentence ``Doc`` objects.

    Sentences whose text is ``min_chars`` characters or shorter are dropped.
    The remaining sentence texts are re-parsed with the notebook-level ``nlp``
    pipeline so that each one becomes a stand-alone ``Doc``.

    Parameters
    ----------
    doc : spaCy ``Doc``
        Parsed paragraph; only ``doc.sents`` and each sentence's ``.text``
        are read.
    min_chars : int, default 20
        A sentence is kept only when ``len(sentence_text) > min_chars``.

    Returns
    -------
    list
        One ``Doc`` per kept sentence. The list is never empty: when no
        sentence qualifies, it holds a single ``Doc`` parsed from the
        placeholder text "No content to return.".
    """
    sentence_texts = [sent.text for sent in doc.sents if len(sent.text) > min_chars]
    if not sentence_texts:
        # Keep the result non-empty so every input still yields at least one entry.
        sentence_texts = ["No content to return."]
    # Parse all kept sentences as one batch instead of one nlp() call each.
    return list(nlp.pipe(sentence_texts))

# Number the rows of one group 0, 1, 2, ... in their existing order.
def generate_sent_id(group, new_column_name='idx_sent'):
    """Return a copy of ``group`` with a zero-based running row number added.

    The input frame is left untouched: pandas does not support user functions
    that mutate the object handed to them by ``groupby(...).apply``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to number (typically one group from a ``groupby``).
    new_column_name : hashable, default 'idx_sent'
        Label of the column that receives the numbers. An existing column
        with this label is overwritten in the returned copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``group`` where ``new_column_name`` holds ``0 .. len(group) - 1``.
    """
    numbered = group.copy()
    numbered[new_column_name] = range(len(numbered))
    return numbered

In [198]:
# Display the column labels of the loaded frame.
df.columns

Index(['record_type', 'status', 'source_url', 'glide_id', 'idx_para',
       'source_level_country', 'source_title', 'source_desc',
       'source_original_text', 'reference_url', 'text', 'authoring_org',
       'reported_date', 'references', 'reference_auth_org',
       'reference_date_str', 'reference_date_iso', 'para_id',
       'non_parenthetical_text'],
      dtype='object')

In [199]:
# Focus on records whose status is 'ongoing' for now.
df_sents = df[df['status'] == 'ongoing'].copy()

# Parse each paragraph, split it into per-sentence Docs, then give every
# sentence its own row.
df_sents['spacy_para_no_paren'] = df_sents['non_parenthetical_text'].apply(nlp)
df_sents['spacy_sent_no_paren'] = df_sents['spacy_para_no_paren'].apply(expand_to_sentence_level)
# explode() repeats the parent row's index label for every sentence, so
# rebuild a unique positional index straight away.
df_sents = df_sents.explode('spacy_sent_no_paren').reset_index(drop=True)

# Number the sentences 0..n-1 within each (para_id, idx_para) group, in row
# order. cumcount() does this directly and does not depend on the grouping
# columns being passed through a user function by groupby.apply.
df_sents['idx_sent'] = df_sents.groupby(['para_id', 'idx_para']).cumcount()
df_sents = df_sents[['glide_id','para_id','idx_para','idx_sent','source_original_text','spacy_sent_no_paren','reference_date_iso']]



## Data Structure Completed


In [200]:
# Keyword indicators: indicator name -> keywords that signal it.
# Duplicate keywords within a list have been removed; membership is unchanged.
indicators = {
    'i_people': ['people', 'person', 'child', 'man', 'woman', 'civilian', 'colleague', 'fatality', 'individual'],
    # TODO: think about how to incorporate 2 co-existing terms, e.g. "648 people who lost their lives"
    'i_killed': ['dead', 'fatal', 'die', 'kill', 'deceased', 'fatality', 'death', 'deaths'],
    'i_injured': ['injure', 'wound', 'wounded', 'injured'],
    'i_damage': ['damage', 'destroy', 'collapse'],
    'i_health_infrastructure': ['hospital', 'surgery'],
    'i_education_infrastructure': ['school', 'university'],
    # NOTE(review): 'xx' looks like a placeholder -- confirm the intended keywords.
    'i_cash_xfer': ['xx'],
    'i_wash': ['sanitation', 'water', 'sewer', 'drain', 'drainage'],
    'i_shelter': ['shelter', 'tent', 'camp', 'blanket'],
    'i_food': ['food', 'cook', 'stove', 'feed', 'nutrient', 'meal'],
    'i_health': ['health', 'medical', 'medicine'],
    'i_gender_vuln': ['dignity', 'gender', 'pregnant', 'lactate', 'lactating'],
    'i_protection': ['trauma', 'mental'],
    'i_response_capacity': ['personnel'],
    'i_other_infrastructure': ['communicate', 'radio', 'internet', 'telecommunication', 'electric', 'line'],
    'i_money': ['grant', 'loan', 'finance', 'appeal', 'chf', 'fund'],
    'i_other': ['biometric'],
    'i_problem': ['challenge'],
    # Note: 'receive' implies both supply and demand, so it appears in both lists.
    'i_demand_side': ['need', 'demand', 'gap', 'priority', 'receive'],
    'i_supply_side': ['response', 'contribute', 'provide', 'source', 'address', 'deploy', 'receive'],
    'i_assessments': ['assess', 'assessment'],
}

In [201]:
def extract_numeric_value(doc, indicator, indicator_terms=None):
    """Return the first numeric token that shares a sentence with an indicator keyword.

    Each sentence of ``doc`` is scanned token by token, remembering the most
    recent numeric token (``pos_ == 'NUM'`` and not part of a DATE or TIME
    entity) and whether a token whose lemma is one of the indicator's
    keywords has been seen. As soon as a single sentence has supplied both,
    the remembered numeric token is returned. The two may occur in either
    order, and nothing carries over from one sentence to the next.

    When several figures would qualify, only the first is returned; later
    ones are typically contextualising numbers.

    Parameters
    ----------
    doc : spaCy ``Doc`` (or any object exposing ``.sents``)
        Only ``pos_``, ``ent_type_`` and ``lemma_`` are read from each token.
    indicator : str
        Key into the keyword mapping, e.g. ``'i_killed'`` or ``'i_injured'``.
    indicator_terms : mapping of str -> iterable of str, optional
        Keyword mapping to use. Defaults to the notebook-level ``indicators``.

    Returns
    -------
    token or None
        The matching numeric *token* (not a parsed number), or ``None`` when
        no sentence contains both a number and a keyword.

    Raises
    ------
    KeyError
        If ``indicator`` is not a key of the keyword mapping.
    """
    lexicon = indicators if indicator_terms is None else indicator_terms
    # Resolve the keyword list once; a set makes the per-token lookup cheap.
    keywords = set(lexicon[indicator])

    for sent in doc.sents:
        # None (not an int sentinel) marks "not seen yet", so tokens are
        # never compared against a number.
        count = None
        attribute = None
        for t in sent:
            if t.pos_ == 'NUM' and t.ent_type_ not in ('DATE', 'TIME'):
                count = t
            if t.lemma_ in keywords:
                attribute = t
            if count is not None and attribute is not None:
                # First complete (keyword, number) pair wins.
                return count
    return None

In [202]:
# Run the extractor over every sentence Doc, once per indicator of interest;
# the indicator key is forwarded as the extractor's second positional argument.
df_sents['num_killed'] = df_sents['spacy_sent_no_paren'].apply(extract_numeric_value, args=('i_killed',))
df_sents['num_injured'] = df_sents['spacy_sent_no_paren'].apply(extract_numeric_value, args=('i_injured',))


In [203]:
#df_sents['spacy_sent_no_paren'].tolist()

In [205]:
# Write the sentence-level table to CSV without the index column.
# NOTE(review): hard-coded, machine-specific output path -- consider making it configurable.
df_sents.to_csv("c://temp//foo.csv", index=False)