# Begin the actual NLP work

In [1]:
import spacy
from spacy import displacy

import re
import pandas as pd
from textacy import extract

from collections import defaultdict 
from fuzzywuzzy import fuzz
import time
import uuid

import os
import json

from datetime import datetime

In [2]:
sitrep_preprocessed_file = "D://projects//_external_files//surveyor//rw_sitrep_preprocessed//sitrep_preprocessed_b41b8e78f66d4e669917ea831f438b73.xlsx"
pcode_file = "D://projects//_external_files//cod_files//combined_locations//locations.csv"

In [3]:
pd.set_option('display.max_columns', None)
print(time.localtime())

time.struct_time(tm_year=2023, tm_mon=12, tm_mday=17, tm_hour=6, tm_min=11, tm_sec=53, tm_wday=6, tm_yday=351, tm_isdst=0)


## Load Location Services

In [4]:
df_location = pd.read_csv(pcode_file)

def get_pcode_from_location(loc, country_prefix='XX', lang_code='all'):

    if country_prefix != 'XX': #if the country prefix is set, limit search to that
        df_loc = df_location[df_location['pcode_prefix'] == country_prefix]
    else:
        df_loc = df_location

    if lang_code != 'all': #secondary filter - especially important to remove dupes with diff langs share the same script
        df_loc = df_loc[df_loc['lang_code'] == lang_code]
        
    matches = df_loc['pcode'][df_loc['location_name'].str.lower() == loc.lower()].tolist()

    #if the match fails, try again on the normalized name
    if len(matches) == 0:
        #remove common variations in names that can cause misses
        n_loc = re.sub(r'[^a-zA-Z]', '', loc)

        #this will cause problems for non-English.. so if then len is 0, exit
        if len(n_loc) == 0:
            return []
            
        matches = df_loc['pcode'][df_loc['location_normalized'].str.lower() == n_loc.lower()].tolist()
        

    #now check results
    if len(matches) > 1:
        #print(f"more than 1 matches... likely due to different granularity of entities with the same name (ie. Herat City in Herat Province) {matches}")
        #print(f"returning the lowest granularity match. {min(matches, key=len)}")
        #print("if the pcodes are all the same granularity.... you get the first element.")
        return min(matches, key=len)
            
        return matches[0]
    elif len(matches) == 1:
        return matches[0]

    else:
        #couldn't find a match, do a fuzzy search
        compare_list = list(set(df_loc['location_name'].tolist()))
        possible_matches=[]
        for i in compare_list:
            if fuzz.ratio(loc,i) > 70:
                possible_matches.append(i)
                print (f"No exact match to '{loc}'. see if these alternative spellings are correct: {possible_matches}")
        return None

    
    return None

assert get_pcode_from_location('istanbul') == 'TUR034'

def get_adm_lvl_from_pcode(pcode):
    return list(set(df_location['adm_lvl'][df_location['pcode'] == pcode].tolist()))
    
def get_name_in_lang(pcode, lang='en'):
    return list(set(df_location['location_name'][(df_location['pcode'] == pcode) & (df_location['lang_code'] == lang)].tolist()))

def get_descendents_of(pcode, lang='en', include_self=True):
    if include_self==True:
        return df_location[df_location['pcode'].str.contains(pcode) & (df_location['lang_code'] == lang)]
    else:
        return df_location[df_location['pcode'].str.contains(pcode) & (df_location['lang_code'] == lang)\
        & (df_location['pcode'] != pcode)]

def get_admin_chain(pcode, lang='en'):
    split_pcode = df_location['split_pcode'][df_location['pcode'] == pcode].tolist()[0]
    levels = split_pcode.split(".")
    pc =''
    admin_chain = []
    #rebuild the pcode one level at a time
    for i in levels:
        pc = pc + i
        admin_chain.append(df_location['location_name'][(df_location['pcode'] == pc) & (df_location['lang_code'] == lang)].tolist()[0])

    return admin_chain

def get_all_locations(lang_code='all'):

    #return all unique location names
    if lang_code == 'all':
        return list(set(df_location['location_name'].to_list()))
    else:
        return list(set(df_location['location_name'][df_location['lang_code'] == lang_code].to_list()))
    

In [5]:
nlp = spacy.load("en_core_web_sm")

# Create patterns and add to the entity ruler to better find locations

all_locs = get_all_locations(lang_code='en')
gpes = []

STOP_LOCS = ['of','can']
all_locs = [e for e in all_locs if e.lower() not in STOP_LOCS]

# create pattern rules for locations based on the COD files
for l in all_locs:
    token_sequence=[]
    for token in l.split('\s+'):
        token_sequence.append({"LOWER":token.lower()})
    x = {'label':'COD_GPE', 'pattern': token_sequence, 'id':get_pcode_from_location(l, lang_code='en')[0]}
    gpes.append(x)
    #print(get_pcode_from_location(l, lang_code='en'))

ruler = nlp.add_pipe('entity_ruler', before='ner')
ruler.add_patterns(gpes)

## Build the DF

In [6]:

df = pd.read_excel(sitrep_preprocessed_file)
df = df.fillna('')



In [7]:
# Narrow the scope for easier testing
df = df[(df['glide_id'] == 'EQ-2023-000015-TUR') | (df['glide_id'] == 'EQ-2023-000214-NPL')]
df = df[(df['glide_id'] == 'EQ-2023-000015-TUR')] # | (df['glide_id'] == 'EQ-2023-000214-NPL')]
set(df['glide_id'].tolist())


{'EQ-2023-000015-TUR'}

In [8]:
def expand_to_sentence_level(doc):
    sentences = []
    #print()
    #print(doc)
    for sent in doc.sents:
        #print(sent)
        #create new doc objects for each sentence and append to a list
        doc_from_span = spacy.tokens.Doc(doc.vocab, words=[token.text for token in sent])
        sentences.append(doc_from_span)

    return sentences


def expand_to_sentence_level(doc):
    sentences = []
    for sent in doc.sents:
        sent_text = sent.text
        if len(sent_text) > 20:
            sentences.append(nlp(sent_text)) # horrendously inefficient but...
    if len(sentences) == 0:
        sentences.append(nlp("No content to return."))
    return sentences

# Function to increment by one for each idx_parad
def generate_sent_id(group, new_column_name='idx_sent'):
    group[new_column_name] = range(0, len(group))
    return group

In [9]:
df.columns

Index(['record_type', 'source_url', 'glide_id', 'idx_para',
       'source_level_country', 'source_title', 'source_desc',
       'source_original_text', 'reference_url', 'text', 'authoring_org',
       'reported_date', 'para_id', 'non_parenthetical_text'],
      dtype='object')

In [10]:
#focus on ongoing for nowd
df_sents = df.copy()
df_sents['spacy_para_no_paren'] = df_sents['non_parenthetical_text'].apply(lambda x: nlp(x))
df_sents['spacy_sent_no_paren'] = df_sents['spacy_para_no_paren'].apply(expand_to_sentence_level)
df_sents = df_sents.explode('spacy_sent_no_paren')

# Apply the function to the DataFrame using groupby on 'idx_para'
df_sents = df_sents.groupby(['para_id','idx_para']).apply(generate_sent_id).reset_index(drop=True)

#to limit the fields but this just seems to cause problems
#df_sents = df_sents[['glide_id','source_level_country','authoring_org','para_id','idx_para','idx_sent','source_original_text','spacy_sent_no_paren','reported_date']]


## Data Structure Completed

In [11]:
#keyword_indicators
indicators = {
    'i_people' : ['people','person','child','man','woman','civilian','colleague','fatality','individual']
    ,'i_killed' : ['dead','fatal','die','kill','deceased','fatality','fatality','death','deaths'] #think about how to incorporate 2 co-existing terms "648 people who lost their lives"
    ,'i_injured' : ['injure','wound','wounded','injured']
    ,'i_damage' : ['damage','destroy','collapse','damaged']
    ,'i_infrastructure' : ['hospital','school','university','dam','bridge','road','highway']
    ,'i_cva' : ['xx']
    ,'i_wash' : ['sanitation','water','sewer','drain','drainage']
    ,'i_shelter' : ['shelter','tent','camp','blanket']
    ,'i_food' : ['food','cook','stove','feed','feed','nutrient','meal']
    ,'i_logistic' : ['logistic','logistics','road']
    ,'i_health' : ['health','medical','medicine','surgery']
    ,'i_gender_pss' : ['dignity','gender','pregnant','lactate','lactating']
    ,'i_protection' : ['trauma','mental','disable','disability']
    #,'i_response_capacity' : ['personnel']
    ,'i_response' : ['personnel']
    ,'i_other_infrastructure' : ['communicate','radio','internet','telecommunication','electric','line']
    ,'i_money' : ['grant','loan','finance','appeal','chf','fund']
    ,'i_other' : ['biometric']
    ,'i_problem' : ['challenge','gap','need_to']
    ,'i_demand_side' : ['need','demand','gap','priority', 'receive','shortage'] # note receive implies both supply and demand
    ,'i_supply_side' : ['response','contribute','provide','source','address','deploy','receive'] # note receive implies both supply and demand
    ,'i_tense_future' : ['xx'] #will populate this from future-tense indicator function

    ,'i_assessments' : ['assess','assessment']
}
file = "D://projects//_external_files//surveyor//word_indicators.xlsx"

def augment_indicators(indicators, file):
    df = pd.read_excel(file)
    for c in df.columns:
        if c[0:2] == 'i_':
            w_list = df['word'][df[c] == 1].tolist()
            try:
                indicators[c].extend(w_list)
            except:
                indicators[c] = w_list

    return indicators

indicators = augment_indicators(indicators, file)


## Data Structure Created

In [15]:
def extract_gpe_entities(doc, adm_lvl='0'):
    #values for adm_lvl = 0,1,2,3,
    # -1 = self, -99 = chain
    entities = []
    admins = []
    #COD_GPE
    ents = list(extract.entities(doc))
    if len(ents) < 1:
        return None
    else:
        for e in ents:
            if e.label_ == 'COD_GPE':
                entities.append(e)

        # -1 means return the actual gpes
        if adm_lvl == -1:
            return entities

        for e in entities:
            pcode = get_pcode_from_location(e.text)
            if (pcode is not None):
                if (len(pcode) != 0):
                    #if the desired level is lower than the actual reference
                    #ignore
                    try:
                        if adm_lvl == -99:
                            admins.append(get_admin_chain(pcode))
                        else:
                            admins.append(get_admin_chain(pcode)[adm_lvl])
                    except:
                        pass

    if adm_lvl != -99:
        admins = list(set(admins))
    if len(admins) == 0:
        return None
    else:
        #changing to return the full list, then can explode later
        return admins #[0] 

df_sents['identified_gpes'] = df_sents['spacy_sent_no_paren'].apply(lambda x: extract_gpe_entities(x, adm_lvl=-1))
df_sents['identified_country'] = df_sents['spacy_sent_no_paren'].apply(lambda x: extract_gpe_entities(x, adm_lvl=0))
df_sents['identified_adm_01'] = df_sents['spacy_sent_no_paren'].apply(lambda x: extract_gpe_entities(x, adm_lvl=1))
df_sents['identified_adm_02'] = df_sents['spacy_sent_no_paren'].apply(lambda x: extract_gpe_entities(x, adm_lvl=2))
df_sents['identified_adm_chain'] = df_sents['spacy_sent_no_paren'].apply(lambda x: extract_gpe_entities(x, adm_lvl=-99))

In [17]:
def find_and_add_indicator(df, indicators):
    ind_counter = []
    for ind in indicators:
  
        df[ind] = df['lower_lemmas'].apply(lambda x: 1 if len([w for w in x if w in indicators[ind]])>0 else 0)
        ind_counter.append(ind)
        #print(ind_counter)
    df['i_count'] = df[ind_counter].sum(axis=1)

    return df

In [18]:
df_sents['lower_lemmas'] = df_sents['spacy_sent_no_paren'].apply(lambda x: [w.lemma_.lower() for w in x])
df_sents = find_and_add_indicator(df_sents, indicators)

In [80]:
df_sents.to_excel("c://temp//training.xlsx")

## Layer on additional interpretations

In [19]:
df_sents.glide_id.unique()

array(['EQ-2023-000015-TUR'], dtype=object)

In [20]:
def get_future_tense_verb(doc):
    def is_future_tense(token):
        #Check if a token is indicative of future tense.
        return (
            token.tag_ == "MD" and token.text.lower() == "will"
            or (token.dep_ == "aux" and token.head.lemma_ == "will")
            or (token.pos_ == 'VERB' and token.head.text == "going" and  "Inf" in token.morph.get("VerbForm"))
        )

    for t in doc:
        if is_future_tense(t):
            return f"{t.text} {t.head}"

    

def declare_primary_record_type(row):

    if row['i_count'] == 0:
        return 'background'
    elif row['i_supply_side']:
        return 'response_details'
    elif row['i_demand_side']:
        return 'demand_side'
    elif row[['i_damage','i_health_infrastructure','i_education_infrastructure']].sum() > 0:
        return 'damage_to_homes_and_infrastructure'
    else:
        return 'other'


df_sents['svot'] = df_sents['spacy_sent_no_paren'].apply(lambda doc: list(extract.subject_verb_object_triples(doc)))
df_sents['future_verbs'] = df_sents['spacy_sent_no_paren'].apply(get_future_tense_verb)
df_sents['i_tense_future'] = df_sents['future_verbs'].apply(lambda x: 1 if x is not None else 0)
#df_sents['collected_indicators'] = df_sents.apply(get_indicators)

In [21]:
### group all the expressed indicators
def get_indicator_columns(df):
    inds=[]
    for c in df.columns:
        if c[0:2] == 'i_':
            inds.append(c)

    return inds

indicator_columns = get_indicator_columns(df_sents)

def find_matching_columns(row):
    return row.index[row.eq(1)].tolist()

# Create a new column containing lists of matching column names for each row
df_sents['collected_indicators'] = df_sents[indicator_columns].apply(find_matching_columns, axis=1)
df_sents.sample(2)

Unnamed: 0,record_type,source_url,glide_id,idx_para,source_level_country,source_title,source_desc,source_original_text,reference_url,text,authoring_org,reported_date,para_id,non_parenthetical_text,spacy_para_no_paren,spacy_sent_no_paren,idx_sent,identified_gpes,identified_country,identified_adm_01,identified_adm_02,identified_adm_chain,lower_lemmas,i_people,i_killed,i_injured,i_damage,i_infrastructure,i_cva,i_wash,i_shelter,i_food,i_logistic,i_health,i_gender_pss,i_protection,i_response,i_other_infrastructure,i_money,i_other,i_problem,i_demand_side,i_supply_side,i_tense_future,i_assessments,i_commodity_market,i_displacement,i_authority,i_statement_certainty,i_severity,i_change_increase,i_change_decrease,i_change_steady,i_geography,i_violence,i_count,svot,future_verbs,collected_indicators
4798,situation report,https://api.reliefweb.int/v1/reports/3937778,EQ-2023-000015-TUR,6,Syria,Syrian Arab Republic: Earthquakes Situation Re...,contributions; coordination; education; food a...,The situation in the affected areas remains di...,https://reliefweb.int/attachments/d075d00c-6d8...,The situation in the affected areas remains di...,OCHA,2023-02-26,rwsitrep_https://reliefweb.int/attachments/d07...,The situation in the affected areas remains di...,"(The, situation, in, the, affected, areas, rem...","(Partners, continue, to, scale, up, the, respo...",1,,,,,,"[partner, continue, to, scale, up, the, respon...",0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,4,"[([Partners], [continue], [to, scale, up, the,...",,"[i_response, i_supply_side, i_authority, i_sev..."
4977,situation report,https://api.reliefweb.int/v1/reports/3934180,EQ-2023-000015-TUR,1,Türkiye,Türkiye/Syria Earthquake - Situation update #1...,logistics and telecommunications,A 7.8 magnitude earthquake hit Türkiye and Syr...,https://reliefweb.int/attachments/de9be69a-35e...,A 7.8 magnitude earthquake hit Türkiye and Syr...,Logistics Cluster,2023-02-10,rwsitrep_https://reliefweb.int/attachments/de9...,A 7.8 magnitude earthquake hit Türkiye and Syr...,"(A, 7.8, magnitude, earthquake, hit, Türkiye, ...","(Aftershocks, followed, the, earthquake, and, ...",1,[],,,,,"[aftershock, follow, the, earthquake, and, a, ...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,"[([Aftershocks], [followed], [earthquake]), ([...",,"[i_damage, i_geography]"


## Now do analysis


In [24]:
indicators

{'i_people': ['people',
  'person',
  'child',
  'man',
  'woman',
  'civilian',
  'colleague',
  'fatality',
  'individual',
  'internally',
  'internally_displace',
  'displace',
  'person',
  'displace_person',
  'displace_people',
  'people',
  'people_affect',
  'number_people',
  'displace_people',
  'people_need',
  'affect_people',
  'displacement',
  'child',
  'woman',
  'woman_child',
  'include_child',
  'child_woman',
  'child_protection',
  'unaccompanied',
  'people_receive',
  'injure',
  'kill_injure',
  'settler',
  'people_injure',
  'die',
  'idp',
  'idp_site',
  'host',
  'idp_shelter',
  'return',
  'idp_camp',
  'total',
  'total_number',
  'total_people',
  'total_case',
  'community',
  'host_community',
  'affect_community',
  'member',
  'refugee_host',
  'refugee',
  'asylum',
  'seeker',
  'asylum_seeker',
  'refugee_camp',
  'migrant',
  'person',
  'family_person',
  'person_affect',
  'total_family',
  'family',
  'patient',
  'evacuate',
  'protection'

In [37]:
#df_sents[['spacy_sent_no_paren','future_verbs','collected_indicators']][(df_sents['future_verbs'].isna() == False)]

In [None]:
df_sents[['spacy_sent_no_paren','future_verbs','collected_indicators']][(df_sents['i_displacement'] == 1) & (df_sents['future_verbs'].isna() == False)].

In [23]:
indicator = 'i_displacement'

#df_sents[['reported_date','source_original_text','spacy_sent_no_paren','collected_indicators']][(df_sents[indicator] == 1) & (df_sents['future_verbs'].isna() == False)].sort_values(by='reported_date').to_excel(f"c://temp//{indicator}_future.xlsx")
#df_sents[['reported_date','source_original_text','spacy_sent_no_paren','collected_indicators']][(df_sents[indicator] == 1)]
df_sents.to_excel("c://temp//all.xlsx")

In [86]:
def get_indicators(df):
    inds=[]
    for c in df.columns:
        if c[0:2] == 'i_':
            if df[c].tolist()[0] == 1:
                inds.append(c)

    return inds

def get_verb_tense_indicator(doc):
    for token in doc:
        print(f"{token.lemma_} -- {token.pos_} -- {token.morph}")


In [None]:
## test
df_focus = df_sents.sample(10)

for index, row in df_focus[['spacy_sent_no_paren','collected_indicators']].iterrows():
    print(row[0])
    print(row[1])
    print()    

In [None]:
indicators['i_gender_pss']

In [None]:
df_focus  = df_sents.sample(1)

s = df_focus['spacy_sent_no_paren'].tolist()[0]
idx = df_focus['spacy_sent_no_paren'].index
print(idx)
print()
print(re.sub("\n", " ", s.text))
print(get_indicators(df_focus))
print()

#get_verb_tense_indicator(s)


In [None]:
# causal factors

"due to" -- "ADP ADP"

In [40]:
text = 'The greatest increases in population densities were in Mersin , Niğde , and Adana '
doc = nlp(text)
from spacy import displacy
displacy.render(doc, style='ent')

In [None]:
def get_clean_date(date):
    date_object = datetime.strptime(date, "%d %b %Y")
    iso_date = date_object.date().isoformat()