# Process (Filter) Data Extracted From Quotebank For Disaster

**Imports**

In [1]:
import pandas as pd
import re # Used for verifying and running small tests on regex patterns
import datetime

from disaster_extr_constants import *
from disaster_extr_helpers import *

**Define parameters for dataset to be filtered**

In [331]:
# Dataset selection: the year and disaster type to filter, plus the positive tag set
YEAR = 2015
disaster_type = 'storm'
disaster_YEAR_pos = storm_tags_2015_pos
# NOTE(review): disaster_YEAR_neg is not assigned in this cell, yet the
# negative-tag cell further down reads it. The assignment below is disabled and
# names the 2017 tag set; confirm which negative tag set belongs to this run.
#disaster_YEAR_neg = storm_tags_2017_neg
file_path = ''.join(['data/', str(YEAR), '_', disaster_type, '_climate_processed_csv.bz2'])

**Load EMDAT for date retrieval and extracted quote set (which still needs to be filtered)**

In [332]:
# EM-DAT table, indexed by disaster number, with the start/end columns parsed as dates
data = 'data/emdat_processed.csv'
parse_dates = ['StartDate', 'EndDate']
df_emdat = pd.read_csv(
    data,
    index_col="Dis No",
    parse_dates=parse_dates,
)

**The extracted quotes cover a larger time interval than the disaster itself, to account for the evolution of the discussion around climate change. Specifically, quotes are extracted in the interval [start - 21 days, end + 21 days]. Note that if the disaster lasted more than 30 days, the interval is [start - 21 days, end].**

In [333]:
# Quotes previously extracted for this year / disaster type (bz2-compressed CSV)
disaster_df_quotes = pd.read_csv(
    file_path,
    compression='bz2',
    parse_dates=['date'],
)

**Apply date retrieval (special case for 2020 because StartDate is in 2019)**

In [334]:
# Select the EM-DAT entries and their per-year bounding dates for the configured
# disaster type. 2020 uses its own constant sets.
if disaster_type == 'heat_wave':

    if YEAR == 2020:
        df_heat_wave = get_df_disaster(df_emdat, HEAT_WAVES_2020, HEAT_WAVES_2020_val)
    else:
        df_heat_wave = get_df_disaster(df_emdat, HEAT_WAVES, HEAT_WAVES_val)
    df_heat_wave_bounds = retrieve_bounding_dates(df_heat_wave)

    disaster_df = df_heat_wave
    disaster_df_bounds = df_heat_wave_bounds

elif disaster_type == 'storm':

    if YEAR == 2020:
        df_storm = get_df_disaster(df_emdat, STORMS_2020, STORMS_2020_val)
    else:
        df_storm = get_df_disaster(df_emdat, STORMS, STORMS_val)
    df_storm_bounds = retrieve_bounding_dates(df_storm)

    disaster_df = df_storm
    disaster_df_bounds = df_storm_bounds

else:
    # Fail loudly instead of silently reusing disaster_df / disaster_df_bounds
    # left over from an earlier run of this cell (or hitting a NameError below).
    raise ValueError(
        "Unsupported disaster_type {!r}; expected 'heat_wave' or 'storm'".format(disaster_type)
    )

# The 2020 bounds are looked up under 2019, because the StartDate falls in 2019.
lookupYEAR = YEAR - 1 if YEAR == 2020 else YEAR

start_YEAR, end_YEAR = disaster_df_bounds.loc[lookupYEAR].MinStartDate, disaster_df_bounds.loc[lookupYEAR].MaxEndDate

**Overview of all disasters for the given disaster type (note: separate table for 2020)**

In [335]:
# Show only the descriptive columns of the selected disasters
disaster_df.loc[:, ['Type', 'Subtype', 'Name', 'Country', 'StartDate', 'EndDate']]

Unnamed: 0_level_0,Type,Subtype,Name,Country,StartDate,EndDate
Dis No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-0470-MEX,Storm,Tropical cyclone,Hurricane Patricia,Mexico,2015-10-22,2015-10-28
2016-0041-FJI,Storm,Tropical cyclone,Cyclone Winston,Fiji,2016-02-20,2016-02-21
2017-0362-USA,Storm,Tropical cyclone,Hurricane Harvey,United States of America (the),2017-08-25,2017-08-29
2018-0341-CHN,Storm,Tropical cyclone,Typhoon Mangkut (Ompong),China,2018-09-10,2018-09-18
2018-0342-USA,Storm,Tropical cyclone,Hurricane Florence,United States of America (the),2018-09-12,2018-09-18
2018-0341-PHL,Storm,Tropical cyclone,Typhoon Mangkut (Ompong),Philippines (the),2018-09-16,2018-09-16
2018-0341-HKG,Storm,Tropical cyclone,Typhoon Mangkut (Ompong),Hong Kong,2018-09-17,2018-09-17
2019-0492-JPN,Storm,Tropical cyclone,Tropical cylone 'Hagibis',Japan,2019-10-12,2019-10-17


**Overview of start and end dates (when a particular disaster contains several entries, pick the earliest StartDate and the latest EndDate)**

In [336]:
# Display the per-year bounds table (MinStartDate / MaxEndDate)
disaster_df_bounds

Unnamed: 0_level_0,MinStartDate,MaxEndDate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2015,2015-10-22,2015-10-28
2016,2016-02-20,2016-02-21
2017,2017-08-25,2017-08-29
2018,2018-09-10,2018-09-18
2019,2019-10-12,2019-10-17


In [337]:
# Report the disaster window exactly as read from the bounds table
print("Disaster [{}] Dates: {} --- {}".format(
    disaster_type,
    *(bound.strftime("%Y-%m-%d") for bound in (start_YEAR, end_YEAR))))

Disaster [storm] Dates: 2015-10-22 --- 2015-10-28


**If duration of disaster is less than a month, add 10 days to end date**

In [338]:
# Pad the disaster window that is used to select quotes.
one_month = datetime.timedelta(days=31)
ten_days = datetime.timedelta(days=10)
two_days = datetime.timedelta(days=2)

# Start again from the unmodified bounds, so that re-running this cell does not
# extend the end date by another ten days on every execution.
start_YEAR = disaster_df_bounds.loc[lookupYEAR].MinStartDate
end_YEAR = disaster_df_bounds.loc[lookupYEAR].MaxEndDate

# Disasters shorter than a month get ten extra days at the end
if end_YEAR - start_YEAR < one_month:
    end_YEAR = end_YEAR + ten_days

# NOTE(review): the start shift below is disabled, but the recorded
# "Used extraction dates" output starts two days before the disaster start, so
# it was presumably produced with this line enabled. Confirm which is intended.
#start_YEAR -= two_days

In [339]:
# Report the (possibly padded) window that is actually used for selection
print("Used extraction dates: {} --- {}".format(
    *(bound.strftime("%Y-%m-%d") for bound in (start_YEAR, end_YEAR))))

Used extraction dates: 2015-10-20 --- 2015-11-07


In [340]:
# Convert both bounds to 'YYYY-MM-DD' strings before they are passed to
# df_time_interval. pd.Timestamp accepts datetimes as well as already-formatted
# date strings, so re-running this cell no longer fails with
# "'str' object has no attribute 'strftime'".
start_YEAR, end_YEAR = (
    pd.Timestamp(bound).strftime("%Y-%m-%d") for bound in (start_YEAR, end_YEAR)
)

**Extract quotes during disaster's time interval**

In [341]:
# Restrict the extracted quotes to the window between start_YEAR and end_YEAR.
# df_time_interval is a project helper; whether the bounds are inclusive is not
# visible from this notebook.
df_disaster_start_end = df_time_interval(
    disaster_df_quotes,
    start_YEAR,
    end_YEAR,
    date_attr='date',
)

In [378]:
# Peek at the first two quotes inside the time window
df_disaster_start_end.head(2)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2015-10-30-057627,"It kept me up all night, the wind. The windows...",Jorge Rodriguez,"['Q27574431', 'Q3295308', 'Q6278572', 'Q627857...",2015-10-30 06:09:05,1,"[['Jorge Rodriguez', '0.5058'], ['None', '0.49...",['http://www.kesq.com/news/strong-winds-blast-...,E
1,2015-10-29-104475,The Baker Polito Administration is fully devot...,Peter Lorenz,['Q1336282'],2015-10-29 10:49:41,3,"[['Peter Lorenz', '0.6971'], ['None', '0.2916'...",['http://insideclimatenews.org/news/29102015/m...,E


**Retrieve corresponding positive tag list and verify regex before filtering**

In [343]:
# Positive tag patterns (regex strings) taken from the `tags` column of the configured tag set
tags_pos_list = disaster_YEAR_pos.tags.values.tolist()

In [344]:
# Display the patterns for a visual check before they are combined
tags_pos_list

['(?=.*\\b([mM]exic(o|ans?)|[tT]ehuantepec|[cC]oasts?|[jJ]alisco)\\b)(?=.*\\b([hH]urricanes?|[sS]torms?)\\b)',
 '(?=.*\\b([mM]exic(o|ans?)|[tT]ehuantepec|[cC]oasts?|[jJ]alisco)\\b)(?=.*\\b([fF]lood(waters?|s|ed|ing)?|[rR]ain(ed|s|fall)?|[lL]andslides?)\\b)',
 '\\b(Patricia)\\b',
 '\\b([cC]ategory 5)\\b',
 '\\b(NOAA)\\b',
 '\\b(National Hurricane Center|NHC)\\b',
 '\\b(Mexican (Red Cross|Army|Navy|Federal Police))\\b']

In [345]:
# Combine all positive tags into one alternation: a text matches the combined
# pattern as soon as it matches any single tag pattern
regex_pattern_pos = r'|'.join(tags_pos_list)

In [346]:
# Try a candidate pattern on a throwaway sentence before adding it to a tag list
sample_text = 'A boring sentence about experiments being performed Experimentally in an experimental setting.'
re.compile(r'\b([eE]xperiment(s|al|ally|)?)\b').sub('FOUND PATTERN', sample_text)

'A boring sentence about FOUND PATTERN being performed FOUND PATTERN in an FOUND PATTERN setting.'

**Apply positive tags filtering (selecting quotes with desired patterns)**

In [347]:
# Keep the quotes selected by the combined positive pattern, ordered by date
filtered_for_pos = (
    extract_quotes(df_disaster_start_end, regex_pattern_pos)
    .sort_values(by='date')
)

  mask = df[field].str.contains(regex)


In [348]:
# Number of quotes kept by the positive filter
len(filtered_for_pos)

236

In [368]:
# Peek at the two earliest positively-filtered quotes (the frame is sorted by date)
filtered_for_pos.head(2)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
3100,2015-10-20-119076,There was an area of low pressure that crossed...,Ronald Gordon,['Q7364903'],2015-10-20 03:16:14,1,"[['Ronald Gordon', '0.8755'], ['None', '0.1245']]",['http://edition.channel5belize.com/archives/1...,E
2227,2015-10-20-066454,"It took, since the hurricane was still in the ...",Bill Nelson,"['Q22639331', 'Q3272899', 'Q358437', 'Q4910326']",2015-10-20 06:54:11,1,"[['Bill Nelson', '0.9047'], ['None', '0.0953']]",['http://news4jax.com/news/ntsb-to-release-el-...,E


**Use the cells below to inspect which quotes are relevant or irrelevant; the positive and negative tag lists can then be adjusted accordingly**

In [369]:
# Spot-check on the quotation field: replace the placeholder pattern as needed
extract_quotes(
    filtered_for_pos,
    r'\b(some pattern)\b',
    complement=False,
).quotation.values

  mask = df[field].str.contains(regex)


array([], dtype=object)

In [370]:
# Spot-check on the urls field: replace the placeholder pattern as needed
extract_quotes(
    filtered_for_pos,
    r'some_pattern',
    field='urls',
    complement=False,
).quotation.values

array([], dtype=object)

**Apply negative tags filtering (removing quotes with undesired patterns from set of positive quotes)**

In [371]:
# Negative tags are optional: the parameters cell may leave disaster_YEAR_neg
# undefined (its assignment is commented out there). In that case fall back to
# an empty tag list instead of failing with a NameError on a fresh kernel.
try:
    tag_neg_list = disaster_YEAR_neg.tags.values.tolist()
except NameError:
    tag_neg_list = []

# Joining an empty list would give '', a pattern that matches every text, so a
# complement filter on it would presumably drop every quote. '(?!)' is an empty
# negative lookahead and never matches, which leaves the positive set unchanged.
regex_pattern_neg = r'|'.join(tag_neg_list) if tag_neg_list else r'(?!)'

In [372]:
# Apply the negative pattern in complement mode to the positively-filtered quotes
filtered_for_pos_then_neg = extract_quotes(
    filtered_for_pos,
    regex_pattern_neg,
    complement=True,
)

In [373]:
# Number of quotes left after the negative filter
len(filtered_for_pos_then_neg)

236

**In a few cases extra processing (based on URLs) was used**

In [374]:
# Switch and input frame for the optional URL-based extra processing step
apply_extra = False
apply_extra_to = filtered_for_pos # or filtered_for_pos_then_neg (the original comment repeated filtered_for_pos, presumably a typo; confirm)

In [375]:
# Optional extra processing; filtered_extra is only defined when apply_extra is True.
# NOTE(review): the helper name is hard-coded to the 2015 storm set and does not
# follow YEAR / disaster_type; presumably it must be swapped by hand for other
# configurations. Confirm.
if apply_extra:
    filtered_extra = storm_tags_2015_extra(apply_extra_to)
    print(len(filtered_extra))

**Double check parameters before writing to disk**

In [299]:
# Show the active parameters once more before anything is written.
# NOTE(review): the output recorded for this cell shows a different year than the
# parameters cell (YEAR = 2015), so it stems from an earlier run. Re-run the
# notebook top to bottom before writing to disk.
YEAR, disaster_type

(2019, 'storm')

In [300]:
# Frame that gets written to disk (filtered_extra exists only if apply_extra was True)
output_df = filtered_for_pos_then_neg # Or filtered_extra or filtered_for_pos

In [376]:
# Number of quotes that will be written
len(output_df)

152

In [377]:
# Final look at the first two rows of the frame to be written
output_df.head(2)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
3450,2019-10-07-012436,Children have also limitations in terms of the...,Jim Clark,"['Q16114343', 'Q23091533', 'Q360002', 'Q556150...",2019-10-07 00:00:00,1,"[['Jim Clark', '0.8462'], ['None', '0.1538']]",['http://news.wfsu.org/post/mental-health-stru...,E
10665,2019-10-07-091917,We are currently monitoring the development of...,,[],2019-10-07 03:47:48,230,"[['None', '0.9076'], ['Warren Gatland', '0.086...",['https://www.the42.ie/ireland-world-cup-typho...,E


In [305]:
# Persist the selected frame via the project helper, tagged as 'filtered'
write_df_to_disk(
    output_df,
    disaster_type,
    YEAR,
    additional_text='filtered',
)