In [1]:
import spacy
from spacy import displacy

import re
import pandas as pd
from textacy import extract

from collections import defaultdict 

In [2]:
nlp = spacy.load("en_core_web_md")

In [3]:
def convert_spelled_nums_to_digit(token):
    mappings = {
        'one' : 1,'two' : 2,'three' : 3,'four' : 4,'five' : 5,'six' : 6,'seven' : 7,'eight' : 8,'nine' : 9,'ten' : 10
    }

    if mappings.get(token) is not None:
        return mappings[token]
    else:
        return token

assert convert_spelled_nums_to_digit("one") == 1
assert convert_spelled_nums_to_digit("onet") == 'onet'


In [4]:
STOP_ENTS = ['WASH','PSS','GTC','PFA','NFI','IYCF']

In [5]:
f = '2023-10-13_Afghanistan_Herat_Earthquake_Flash_Update_13Oct2023_FINAL.txt'
f = '2023-10-07_Afghanistan_Herat Earthquake_Flash Update #1_7Oct2023_FINAL.txt'
f = '2023-10-27_Afghanistan_Herat Earthquake Sit Rep - 27.10.23 Final.txt'
#f = '2022-04-10_floodania_handout.txt'

file = f'D://projects//_external_files//surveyor//02_converted_to_text//{f}'

(date_of_report, country) = f.split("_")[0:2]


with open(file, encoding="utf-8") as f:
    text = f.read()
    
def string_preprocess(text):
    text = text.replace("\n"," ")
    text = text.replace("-","_") #replace so tokenization doesn't separate

    #turn 'four' into 4
    text = ' '.join([str(convert_spelled_nums_to_digit(t)) for t in text.split(" ")])


    # remove content in parentheses
    #processed_string = re.sub(r'\([^)]*\)', '', input_string)

    #remove all non alpha numeric and punctuation
    pattern = r'[^a-zA-Z0-9\s\,\.\?\!\-\(\)]'
    text = re.sub(pattern, '', text)

    pattern = r'(\d+)\s+million'
    text = re.sub(r'(\d+)\s+million', r'\1,000,000', text)

    
    #remove commas that serve as thousands separators
    #Hack... fix this so I don't have to run it 3x
    text = re.sub(r'(\d+),(\d+)', r'\1\2', text)
    text = re.sub(r'(\d+),(\d+)', r'\1\2', text)
    text = re.sub(r'(\d+),(\d+)', r'\1\2', text)

    text = text.replace("\s+","\s")

    return text
#date_of_report = '2023-10-13'
source = 'UNOCHA'
text = string_preprocess(text)
full_doc = nlp(text)

def string_remove_parenthetical_content(text):
    # Use regular expression to remove content inside parentheses
    text = re.sub(r'\([^)]*\)', '', text)

    return text
    
non_parens_text = string_remove_parenthetical_content(text)

In [6]:
admin_info = "D:\\projects\\_external_files\\country_info\\afg_adminboundaries_tabulardata_2.xlsx"

def build_admin_reference(admin_info):
    admin_df = pd.read_excel(admin_info, sheet_name="ADM2")
    return admin_df

admin_df = build_admin_reference(admin_info)

admins = []
admins.extend(admin_df['ADM2_EN'].tolist())
admins.extend(admin_df['ADM1_EN'].tolist())
admins.extend(admin_df['ADM0_EN'].tolist())
admins = list(set(admins))
admins.append('Herat')
admin_dict = {}
for ele in admins:
    admin_dict[ele]=1

# NLP Functions

In [7]:
def get_future_tense_verb(doc):
    def is_future_tense(token):
        #Check if a token is indicative of future tense.
        return (
            token.tag_ == "MD" and token.text.lower() == "will"
            or (token.dep_ == "aux" and token.head.lemma_ == "will")
        )

    for t in doc:
        if is_future_tense(t):
            return f"{t.text} {t.head}"

In [8]:
def build_base_dataframe(text, filename='', date_of_report='', country='', source=''):
    string_sent = [s.strip() for s in text.split('.')]

    dataframe_data=[]
    i=0
    for s in string_sent:
        x = [date_of_report,source,country,filename,i,s]
        i+=1
        dataframe_data.append(x)

    

    df = pd.DataFrame(dataframe_data,columns=['report_date','source','country','filename','sent_idx','string_sentence'])
    df['spacy_doc'] = df['string_sentence'].apply(lambda x: nlp(x))
    df['lower_lemmas'] = df['spacy_doc'].apply(lambda x: [w.lemma_.lower() for w in x])
    
    df['string_sent_wo_parens'] = df['string_sentence'].apply(string_remove_parenthetical_content)
    df['spacy_wo_parens'] = df['string_sent_wo_parens'].apply(lambda x: nlp(x))
    df['wo_parens_lower_lemmas'] = df['spacy_wo_parens'].apply(lambda x: [w.lemma_.lower() for w in x])
        

    df['locations'] = df['spacy_doc'].apply(lambda doc: [e.text for e in doc.ents if e.label_ == 'GPE'])
    df['dates'] = df['spacy_doc'].apply(lambda doc: [e.text for e in doc.ents if e.label_ == 'DATE'])
    df['svot'] = df['spacy_wo_parens'].apply(lambda doc: list(extract.subject_verb_object_triples(doc)))
    df['future_verbs'] = df['spacy_doc'].apply(get_future_tense_verb)

    

    return df

df = build_base_dataframe(text, filename=file, date_of_report=date_of_report, country=country, source=source)


In [9]:
#keyword_indicators
indicators = {
    'i_people' : ['people','person','child','man','woman']
    ,'i_civilian' : ['civilian']
    ,'i_killed' : ['dead','fatal','die','kill','deceased'] #think about how to incorporate 2 co-existing terms "648 people who lost their lives"
    ,'i_injured' : ['injure','wound']
    ,'i_damage' : ['damage','destroy','collapse']
    ,'i_health_infrastructure' : ['hospital','surgery']
    ,'i_education_infrastructure' : ['school','university']
    ,'i_cash_xfer' : ['xx']
    ,'i_wash' : ['sanitation','water','sewer','drain','drainage']
    ,'i_shelter' : ['shelter','tent','camp','blanket']
    ,'i_food' : ['food','cook','stove','feed','feed','nutrient','meal']
    ,'i_health' : ['health','medical','medicine']
    ,'i_gender_vuln' : ['dignity','gender','pregnant','lactate','lactating']
    ,'i_protection' : ['trauma','mental']
    ,'i_response_capacity' : ['personnel']
    ,'i_other_infrastructure' : ['communicate','radio','internet','telecommunication','electric','line']
    ,'i_money' : ['grant','loan','finance','appeal','chf','fund']
    ,'i_other' : ['biometric']
    ,'i_problem' : ['challenge']
    ,'i_demand_side' : ['need','demand','gap','priority', 'receive'] # note receive implies both supply and demand
    ,'i_supply_side' : ['response','contribute','provide','source','address','deploy','receive'] # note receive implies both supply and demand

    ,'i_assessments' : ['assess','assessment']
}

#some words, particularly verbs, must be in their conjucated form to be useful
#eg. "sourcing" vs "source" has too much potential overlap
#verb_tense_indicators = {
#    'i_supply_side_v' : ['sourcing','providing']
#}


In [10]:
def find_and_add_indicator(df, indicators):
    ind_counter = []
    for ind in indicators:
  
        df[ind] = df['lower_lemmas'].apply(lambda x: 1 if len([w for w in x if w in indicators[ind]])>0 else 0)
        ind_counter.append(ind)
        #print(ind_counter)
    df['i_count'] = df[ind_counter].sum(axis=1)

    return df

df = find_and_add_indicator(df, indicators)

def declare_primary_record_type(row):

    if row['i_count'] == 0:
        return 'background'
    elif row['i_supply_side']:
        return 'response_details'
    elif row['i_demand_side']:
        return 'demand_side'
    elif row[['i_damage','i_health_infrastructure','i_education_infrastructure']].sum() > 0:
        return 'damage_to_homes_and_infrastructure'
    else:
        return 'other'




df['record_type'] = df.apply(declare_primary_record_type, axis=1)

## Now That Indicators Are set - extract more specific details

In [11]:
def obtain_killed_numeric_value(doc):

    key_values = []
    just_count = []
    
    def check_flags(lst):
        for l in lst:
            if l == -1:
                return False
        return True

    #doc = doc.tolist()[0]
    attribute = -1
    noun = -1
    count = -1

    for t in doc:
        if str(t).isdigit():
            count = t
        if t in indicators['i_people']:
            noun = t
        if t in indicators['i_killed']:
            attribute = t

        if check_flags([noun,attribute,count]):

            noun_att_cnt = (noun,attribute,count)
            key_values.append(noun_att_cnt)
            just_count.append(count)

            noun = -1
            attribute = -1
            count = -1

    #changing to return only the count
    return just_count
    #return key_values
            
    
def obtain_injured_numeric_value(doc):

    key_values = []
    just_count = []
    
    def check_flags(lst):
        for l in lst:
            if l == -1:
                return False
        return True

    #doc = doc.tolist()[0]
    attribute = -1
    noun = -1
    count = -1

    for t in doc:
        if str(t).isdigit():
            count = t
        if t in indicators['i_people']:
            noun = t
        if t in indicators['i_injured']:
            attribute = t

        if check_flags([noun,attribute,count]):

            noun_att_cnt = (noun,attribute,count)
            key_values.append(noun_att_cnt)
            just_count.append(count)

            noun = -1
            attribute = -1
            count = -1

    #changing to return only the count
    return just_count
    #return key_values

def obtain_counted_noun_chunks(doc):
    counted_things = []
    for x in list(extract.noun_chunks(doc)):
        for token in x:
            if str(token).isdigit():
                counted_things.append(x)
                continue
    if len(counted_things) > 0:
        return counted_things
    else:
        return ''



def obtain_all_entities(doc):
    #stop_ents = STOP_ENTS
    ents = list(extract.entities(doc))
    if len(ents) < 1:
        return None
    entities = defaultdict(list) 
    for e in ents:
        #if e.text not in stop_ents:
        entities[e.label_].append(e)

    return entities   
    

#print(obtain_killed_numeric_value(df['wo_parens_lower_lemmas'][df['i_killed'] == 1].tolist()[0]))
#print(obtain_injured_numeric_value(df['wo_parens_lower_lemmas'][df['i_injured'] == 1].tolist()[0]))
        

In [12]:
df['num_killed'] = df['wo_parens_lower_lemmas'][df['i_killed'] == 1].apply(obtain_killed_numeric_value)
df['num_injured'] = df['wo_parens_lower_lemmas'][df['i_injured'] == 1].apply(obtain_injured_numeric_value)
df['num_others'] = df['spacy_wo_parens'].apply(obtain_counted_noun_chunks)
df['noun_chunks'] = df['spacy_wo_parens'].apply(lambda doc: list(extract.noun_chunks(doc)))
df['entities'] = df['spacy_wo_parens'].apply(obtain_all_entities)

In [13]:
df[df['i_injured'] == 1]

Unnamed: 0,report_date,source,country,filename,sent_idx,string_sentence,spacy_doc,lower_lemmas,string_sent_wo_parens,spacy_wo_parens,...,i_demand_side,i_supply_side,i_assessments,i_count,record_type,num_killed,num_injured,num_others,noun_chunks,entities
20,2023-10-27,UNOCHA,Afghanistan,D://projects//_external_files//surveyor//02_co...,20,Latest assessments show that the first 2 earth...,"(Latest, assessments, show, that, the, first, ...","[late, assessment, show, that, the, first, 2, ...",Latest assessments show that the first 2 earth...,"(Latest, assessments, show, that, the, first, ...",...,0,0,1,4,other,[1480],[1950],"[(first, 2, earthquakes), (11, October), (1480...","[(Latest, assessments), (first, 2, earthquakes...","{'ORDINAL': [(first)], 'CARDINAL': [(2), (1480..."


In [16]:
df['locations'][df['i_killed'] == 1]

20    []
Name: locations, dtype: object

In [284]:
#df.to_csv("c://temp//proc.csv")

# Main DF Built... now do more processing

In [285]:

def extract_entities(row):
    entities = row['entities']
    if entities is None:
        return ''
    en=[]
    for label in entities:
        for e in entities.get(label):
            ent = ' '.join([w.text for w in e]).strip()
            en.append([label,ent])
            
    return en
    #else:
    #    return ''


def extract_ncs(row):
    #data type, list of spans
    xs = row['noun_chunks']
    if xs is None:
        return ''
    en=[]

    for e in xs:
        ent = ' '.join([w.text for w in e]).strip()
        en.append(['NOUN_CHUNK',ent])
    return en


def extract_numeric_key_values(row):
    #data type, list of spans
    xs = row['num_others']
    if xs is None:
        return ''
    return_list=[]

    for e in xs:
        prefix = ''
        numeric = ''
        suffix = ''

        for token in e:
            if token.is_alpha == False:
                numeric = token.text
            elif numeric == '': #alpha but numeric not set yet, this is prefix
                prefix = prefix + ' ' + token.text
            else:
                suffix = suffix + ' ' + token.text
        
        return_list.append([prefix.strip(),numeric,suffix.strip()])   
        
    return return_list


def split_key_value_in_df(field,delim=','):

    s = pd.Series({'prefix' : field, 'left_label' : field, 'right_label' : field})
    
    if isinstance(field, list):
        fields = field
    elif isinstance(field, str):
        fields = field.split(delim)
    else:
        print(field)
    
     
    if len(fields) == 2:
        s = pd.Series({'prefix' : '', 'left_label' : fields[0], 'right_label' : fields[1]})
    elif len(fields) == 3:
        s = pd.Series({'prefix' : fields[0], 'left_label' : fields[1], 'right_label' : fields[2]})



    return s

def split_key_value_in_df_orig(field,left_label="d",right_label="f",delim=','):

    s = pd.Series({left_label : field, right_label : field})
    
    if isinstance(field, list):
     
        if len(field) == 2:
            s = pd.Series({left_label : field[0], right_label : field[1]})

    elif isinstance(field, str):
        fields = field.split(delim)
        if len(fields) == 2:
            s = pd.Series({left_label : fields[0], right_label : fields[1]})

    return s
        
    

In [286]:
df[['filename','sent_idx','string_sentence','noun_chunks']][df['noun_chunks'].isna() == False]

Unnamed: 0,filename,sent_idx,string_sentence,noun_chunks
0,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,0,"In December 2021, Tropical Cyclone Ethan struck the central provinces of Floodania, causing extensive damage and a humanitarian crisis","[(December), (Tropical, Cyclone, Ethan), (central, provinces), (Floodania), (extensive, damage), (humanitarian, crisis)]"
1,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,1,"In Floodania, Cyclone Ethan affected 1","[(Floodania), (Cyclone, Ethan)]"
2,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,2,85000000 people including 648 people who lost their lives and 1600 people who were injured,"[(85000000, people), (648, people), (who), (their, lives), (1600, people), (who)]"
3,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,3,240000 homes were damaged or destroyed and 400000 people were displaced,"[(240000, homes), (400000, people)]"
4,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,4,"Six weeks later, tropical Cyclone Gina made landfall in central Floodania, destroying villages, and further impeding the countrys ability to respond to the existing crisis","[(tropical, Cyclone, Gina), (landfall), (central, Floodania), (villages), (countrys, ability), (existing, crisis)]"
...,...,...,...,...
157,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,157,"05 each, though they can cost as low as USD 0","[(they), (USD)]"
158,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,158,02 if you purchase at least 100000 at a time,"[(you), (time)]"
159,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,159,Most vendors who are willing to participate in voucher schemes request that they get an additional 3 on top of market prices in Zibu,"[(Most, vendors), (who), (voucher, schemes), (they), (top), (market, prices), (Zibu)]"
160,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,160,"However, vendors in Matanda typically request a monthly fee of 30000 Pesa for participating in a voucher scheme","[(vendors), (Matanda), (monthly, fee), (30000, Pesa), (voucher, scheme)]"


In [287]:
#use this as a repeatable-ish pattern for expanding on all the qualitative fields
df_entities = df[['filename','sent_idx','string_sentence','entities']][df['entities'].isna() == False].copy()
df_entities['tmp'] = df_entities.apply(extract_entities, axis=1)
df_entities = df_entities.drop(columns=['entities'])
df_entities = df_entities.explode('tmp')
df_entities['rec_type'] = 'ENTITY'
df_entities[['rec_prefix','rec_key','rec_value']] = df_entities.apply(lambda x: split_key_value_in_df(x.tmp), axis=1)

#now noun_chunks
df_nouns = df[['filename','sent_idx','string_sentence','noun_chunks']][df['noun_chunks'].isna() == False].copy()
df_nouns['tmp'] = df_nouns.apply(extract_ncs, axis=1)
df_nouns = df_nouns.drop(columns=['noun_chunks'])
df_nouns = df_nouns.explode('tmp')
df_nouns['rec_type'] = 'NOUN_SEQUENCE'
df_nouns = df_nouns[df_nouns['tmp'].isna() == False].copy()
df_nouns[['rec_prefix','rec_key','rec_value']] = df_nouns.apply(lambda x: split_key_value_in_df(x.tmp), axis=1)

#quantitative values
df_quants = df[['filename','sent_idx','string_sentence','num_others']][df['num_others'] != ''].copy()
df_quants['tmp'] = df_quants.apply(extract_numeric_key_values, axis=1)
df_quants = df_quants.drop(columns=['num_others'])
df_quants = df_quants.explode('tmp')
df_quants['rec_type'] = 'QUANTIFIED_NOUN'
df_quants[['rec_prefix','rec_key','rec_value']] = df_quants.apply(lambda x: split_key_value_in_df(x.tmp), axis=1)


In [288]:
df_attributes = pd.concat([df_quants, df_nouns,df_entities])
df_attributes = df_attributes.drop(columns=['tmp'])
df_attributes


Unnamed: 0,filename,sent_idx,string_sentence,rec_type,rec_prefix,rec_key,rec_value
2,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,2,85000000 people including 648 people who lost their lives and 1600 people who were injured,QUANTIFIED_NOUN,,85000000,people
2,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,2,85000000 people including 648 people who lost their lives and 1600 people who were injured,QUANTIFIED_NOUN,,648,people
2,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,2,85000000 people including 648 people who lost their lives and 1600 people who were injured,QUANTIFIED_NOUN,,1600,people
3,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,3,240000 homes were damaged or destroyed and 400000 people were displaced,QUANTIFIED_NOUN,,240000,homes
3,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,3,240000 homes were damaged or destroyed and 400000 people were displaced,QUANTIFIED_NOUN,,400000,people
...,...,...,...,...,...,...,...
159,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,159,Most vendors who are willing to participate in voucher schemes request that they get an additional 3 on top of market prices in Zibu,ENTITY,,GPE,Zibu
160,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,160,"However, vendors in Matanda typically request a monthly fee of 30000 Pesa for participating in a voucher scheme",ENTITY,,LOC,Matanda
160,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,160,"However, vendors in Matanda typically request a monthly fee of 30000 Pesa for participating in a voucher scheme",ENTITY,,DATE,monthly
160,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,160,"However, vendors in Matanda typically request a monthly fee of 30000 Pesa for participating in a voucher scheme",ENTITY,,CARDINAL,30000


In [289]:
df_joined.columns

Index(['report_date', 'source', 'country', 'filename', 'sent_idx',
       'string_sentence', 'spacy_doc', 'lower_lemmas', 'string_sent_wo_parens',
       'spacy_wo_parens', 'wo_parens_lower_lemmas', 'locations', 'dates',
       'svot', 'future_verbs', 'i_people', 'i_civilian', 'i_killed',
       'i_injured', 'i_damage', 'i_health_infrastructure',
       'i_education_infrastructure', 'i_cash_xfer', 'i_wash', 'i_shelter',
       'i_food', 'i_health', 'i_gender_vuln', 'i_protection',
       'i_response_capacity', 'i_other_infrastructure', 'i_problem',
       'i_demand_side', 'i_supply_side', 'i_assessments', 'i_count',
       'record_type', 'num_killed', 'num_injured', 'num_others', 'noun_chunks',
       'entities', 'rec_type', 'rec_prefix', 'rec_key', 'rec_value'],
      dtype='object')

In [290]:
#Join base and attribute df
df_joined = df.merge(df_attributes[['sent_idx','rec_type','rec_prefix','rec_key','rec_value']], left_on='sent_idx', right_on='sent_idx', how='left').copy()
df_joined.explode('locations')
df_joined['locations'] = df_joined['locations'].apply(lambda x: x[0] if len(x)==1 else '')
df_joined.explode('dates')
df_joined['dates'] = df_joined['dates'].apply(lambda x: x[0] if len(x)==1 else '')


df_joined.explode('svot')
df_joined.shape
df_joined.to_csv("c://temp//output.csv", index=False)

In [291]:
idx=5
print(list(set(df_entities['string_sentence'][df_entities['sent_idx'] == idx].tolist())))
df_attributes[df_attributes['sent_idx'] == idx]




['Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2']


Unnamed: 0,filename,sent_idx,string_sentence,rec_type,rec_prefix,rec_key,rec_value
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2",QUANTIFIED_NOUN,,220,kilometres
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2",NOUN_SEQUENCE,,NOUN_CHUNK,Gina
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2",NOUN_SEQUENCE,,NOUN_CHUNK,strongest cyclone
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2",NOUN_SEQUENCE,,NOUN_CHUNK,continent
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2",NOUN_SEQUENCE,,NOUN_CHUNK,wind speeds
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2",NOUN_SEQUENCE,,NOUN_CHUNK,220 kilometres
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2",NOUN_SEQUENCE,,NOUN_CHUNK,hour
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2",NOUN_SEQUENCE,,NOUN_CHUNK,floods
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2",NOUN_SEQUENCE,,NOUN_CHUNK,height
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2",ENTITY,,PERSON,Gina


In [292]:
df_nouns[df_nouns['sent_idx'] == idx]

Unnamed: 0,filename,sent_idx,string_sentence,tmp,rec_type,rec_prefix,rec_key,rec_value
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2","[NOUN_CHUNK, Gina]",NOUN_SEQUENCE,,NOUN_CHUNK,Gina
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2","[NOUN_CHUNK, strongest cyclone]",NOUN_SEQUENCE,,NOUN_CHUNK,strongest cyclone
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2","[NOUN_CHUNK, continent]",NOUN_SEQUENCE,,NOUN_CHUNK,continent
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2","[NOUN_CHUNK, wind speeds]",NOUN_SEQUENCE,,NOUN_CHUNK,wind speeds
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2","[NOUN_CHUNK, 220 kilometres]",NOUN_SEQUENCE,,NOUN_CHUNK,220 kilometres
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2","[NOUN_CHUNK, hour]",NOUN_SEQUENCE,,NOUN_CHUNK,hour
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2","[NOUN_CHUNK, floods]",NOUN_SEQUENCE,,NOUN_CHUNK,floods
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2","[NOUN_CHUNK, height]",NOUN_SEQUENCE,,NOUN_CHUNK,height


In [293]:
df_quants[df_quants['sent_idx'] == idx]

Unnamed: 0,filename,sent_idx,string_sentence,tmp,rec_type,rec_prefix,rec_key,rec_value
5,D://projects//_external_files//surveyor//02_converted_to_text//2022-04-10_floodania_handout.txt,5,"Gina was the strongest cyclone ever recorded on the continent, with wind speeds peaking at 220 kilometres per hour and floods with a height of 2","[, 220, kilometres]",QUANTIFIED_NOUN,,220,kilometres


# End

In [None]:
#df_org_details = df[['filename','sent_idx','string_sentence','spacy_doc','entities']].copy()
#df_org_details['organization'] = df['entities'].apply(extract_orgs)
#df_org_details = df_org_details.drop(columns=['entities'])
#df_org_details = df_org_details.explode('organization')

#unique_orgs = list(set(df_org_details['organization'].tolist()))

#df_org_details['spacy_doc'][df_org_details['organization'] == 'UNHCR'].tolist()


In [None]:
def extract_kwic(doc, keyword):
    triple = list(extract.keyword_in_context(doc, keyword, window_width = 80,))

    prefix = ''
    keyword = ''
    suffix = ''
    #print(triple)
    if len(triple) == 1:
        prefix = triple[0][0]
        keyword = triple[0][1]
        suffix = triple[0][2]
    return pd.Series({'prefix':prefix,'keyword':keyword,'suffix':suffix})
    

keyword = 'UNHCR'
df_org_details[['prefix','keyword','suffix']] = df_org_details['spacy_doc'].apply(lambda x: extract_kwic(x, keyword))

In [None]:
df_org_details[df_org_details['keyword'] != '']

In [None]:
kwic = list(extract.keyword_in_context(full_doc, 'UNHCR', window_width = 80,))
kwic

In [219]:
df_joined.columns

Index(['report_date', 'source', 'country', 'filename', 'sent_idx',
       'string_sentence', 'spacy_doc', 'lower_lemmas', 'string_sent_wo_parens',
       'spacy_wo_parens', 'wo_parens_lower_lemmas', 'locations', 'dates',
       'svot', 'future_verbs', 'i_people', 'i_civilian', 'i_killed',
       'i_injured', 'i_damage', 'i_health_infrastructure',
       'i_education_infrastructure', 'i_cash_xfer', 'i_wash', 'i_shelter',
       'i_food', 'i_health', 'i_gender_vuln', 'i_protection',
       'i_response_capacity', 'i_other_infrastructure', 'i_problem',
       'i_demand_side', 'i_supply_side', 'i_assessments', 'i_count',
       'record_type', 'num_killed', 'num_injured', 'num_others', 'noun_chunks',
       'entities', 'rec_type', 'rec_prefix', 'rec_key', 'rec_value'],
      dtype='object')

In [222]:
#metadata
df_joined[['report_date','source','country','filename','sent_idx','string_sentence']].sample(5)

Unnamed: 0,report_date,source,country,filename,sent_idx,string_sentence
154,2023-10-27,UNOCHA,Afghanistan,D://projects//_external_files//surveyor//02_converted_to_text//2023-10-27_Afghanistan_Herat Earthquake Sit Rep - 27.10.23 Final.txt,22,"More than half of all earthquakeaffected people are in Injil, with the majority of the more than 3330 destroyed homes assessed to date located in Zindajan"
264,2023-10-27,UNOCHA,Afghanistan,D://projects//_external_files//surveyor//02_converted_to_text//2023-10-27_Afghanistan_Herat Earthquake Sit Rep - 27.10.23 Final.txt,42,"8M 500K Needs According to the Herat Provincial Education Department (PED), 295 educational facilities, including 125 schools and madrassas (Islamic religious learning institutions), have been affected by the earthquakes"
672,2023-10-27,UNOCHA,Afghanistan,D://projects//_external_files//surveyor//02_converted_to_text//2023-10-27_Afghanistan_Herat Earthquake Sit Rep - 27.10.23 Final.txt,92,"To avoid any tension between displaced families and host communities, WFP has begun to disseminate key messages on food assistance through radios twice a day, and through loudspeakers carried by mobile stations in the districts of Gulran, Koshan and Zindajan"
558,2023-10-27,UNOCHA,Afghanistan,D://projects//_external_files//surveyor//02_converted_to_text//2023-10-27_Afghanistan_Herat Earthquake Sit Rep - 27.10.23 Final.txt,81,"Combined, this means that 14824 households, equivalent to 103768 individuals residing across 130 villages in the districts of Gulran, Injil, Kushk and Zindajan, have been reached with food assistance"
1239,2023-10-27,UNOCHA,Afghanistan,D://projects//_external_files//surveyor//02_converted_to_text//2023-10-27_Afghanistan_Herat Earthquake Sit Rep - 27.10.23 Final.txt,163,"AADA, CARE, HealthNet TPO, MOVE provided wellbeing support to 2948 women and girls in villages of Zindajan district at the GTC and the Regional Hospital, and villages at Herat and KushkRabateSangai districts"


In [244]:
df_joined[['locations','dates','i_people','i_killed','i_education_infrastructure','i_health_infrastructure','i_cash_xfer','i_food','i_health','record_type']][(df_joined['locations'] != '') | (df_joined['dates'] != '')].sample(5)

Unnamed: 0,locations,dates,i_people,i_killed,i_education_infrastructure,i_health_infrastructure,i_cash_xfer,i_food,i_health,record_type
827,,24 October,0,0,0,0,0,0,1,response_details
2,Afghanistan,,0,0,0,0,0,0,0,background
195,Afghanistan,,0,0,0,0,0,0,0,response_details
1514,Herat,,0,0,0,0,0,0,0,background
28,,winter,0,0,0,0,0,0,0,response_details


In [259]:
#metadata
df_joined[['svot','future_verbs','string_sentence']].sample(5)

Unnamed: 0,svot,future_verbs,string_sentence
914,[],,"In Herat City, 18 health facilities were deployed"
946,[],,Unavailability of funding for continuity of support and services
686,"[([WFP], [will, conduct], [verification])]",will conduct,WFP will conduct a biometric verification of households previously registered in SCOPE under regular programming
312,"[([Response, UNICEF], [has, secured], [grant])]",,Response UNICEF has secured a 500000 grant from ECHO to create TLS in areas affected by the earthquakes
1111,[],,6K CP Key Figures PEOPLE IN NEED PEOPLE TARGETED PEOPLE REACHED FUNDING REQUIRED FUNDING RECEIVED 88K 69K 49K 3M 1


In [264]:
df_joined[['rec_type', 'rec_prefix', 'rec_key', 'rec_value']].sample(20).sort_values(by='rec_key')

Unnamed: 0,rec_type,rec_prefix,rec_key,rec_value
1366,QUANTIFIED_NOUN,,20,people
416,QUANTIFIED_NOUN,,2666,sanitary napkins
1220,QUANTIFIED_NOUN,,3461,dignity kits
748,ENTITY,,DATE,winter
142,ENTITY,,GPE,Injil
465,NOUN_SEQUENCE,,NOUN_CHUNK,emergency shelter aid
1135,NOUN_SEQUENCE,,NOUN_CHUNK,crisisaffected areas
599,NOUN_SEQUENCE,,NOUN_CHUNK,6 most affected communities
579,NOUN_SEQUENCE,,NOUN_CHUNK,250 kg straw
266,NOUN_SEQUENCE,,NOUN_CHUNK,Herat Provincial Education Department


In [221]:
pd.set_option('display.max_colwidth', None)