# Imports

In [812]:
import pandas as pd
import bz2
import json
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import matplotlib.gridspec as gridspec
import pickle
import os
from extraction_helpers import *
import datetime
import numpy as np
from tqdm import tqdm

In [237]:
def df_time_interval(df, start, end):
    time_mask = ((df['StartDate']>=start) & (df['StartDate']<=end))
    return df.loc[time_mask]

In [427]:
def get_df_disaster(df, year_to_id_map, set_val=None):
    IDS = sum(year_to_id_map.values(), [])
    mask = df.index.isin(IDS)
    df_disasters = df.loc[mask]
    if set_val:
        for key, value in set_val.items():
            for key_, value_ in value.items():
                df_disasters.at[key, key_] = value_
    return df_disasters

In [762]:
def retrieve_bounding_dates(df_disaster):
    grouped = df_disaster.groupby(df_disaster.StartDate.dt.year)
    aggregated = grouped.agg({'StartDate':'min', 'EndDate':'max'})
    aggregated.index.names = ['Year']
    aggregated.columns = ['MinStartDate', 'MaxEndDate']
    return aggregated

In [None]:
def compute_date_bounds(start, end):
    d = datetime.timedelta(days=21)
    lower = start - d
    duration = end - start 
    if duration.days > 30:
        upper = end
    else: 
        upper = end + d
    return lower.strftime("%Y-%m-%d"), upper.strftime("%Y-%m-%d")

In [546]:
data = 'data/emdat_processed.csv'
parse_dates = ['StartDate', 'EndDate']
df_emdat = pd.read_csv(data, index_col="Dis No", parse_dates = parse_dates)

In [548]:
australia_heat_wave = { 
    'Group': 'Natural',
    'Subgroup':'Meteorological', 
    'Type':'Extreme temperature ', 
    'Subtype':'Heat wave', 
    'Subsubtype': 'NaN',
    'Name':'NaN', 
    'Country':'Australia', 
    'ISO':'AUS', 
    'Region': 'Australia and New Zealand',
    'Continent': 'Oceania',
    'Origin':'NaN', 
    'Magnitude':48.2, 
    'Scale':'°C', 
    'Deaths': 0,
    'Injured':0, 
    'Affected':0,
    'Homeless':0,
    'TotalAffected':0, 
    'Damages':0, 
    'StartDate':'2017-01-30', 
    'EndDate': '2017-02-14',
    'Duration':15
}
# Not necessary, row already added to csv file
#row_series = pd.Series(data=australia_heat_wave, name='2017-9999-AUS')
#df_emdat = df_emdat.append(row_series, ignore_index=False)

In [549]:
df_emdat.loc['2017-9999-AUS']

Group                              Natural
Subgroup                    Meteorological
Type                  Extreme temperature 
Subtype                          Heat wave
Subsubtype                             NaN
Name                                   NaN
Country                          Australia
ISO                                    AUS
Region           Australia and New Zealand
Continent                          Oceania
Origin                                 NaN
Magnitude                             48.2
Scale                                   °C
Deaths                                   0
Injured                                  0
Affected                                 0
Homeless                                 0
TotalAffected                            0
Damages                                  0
StartDate              2017-01-30 00:00:00
EndDate                2017-02-14 00:00:00
Duration                                15
Name: 2017-9999-AUS, dtype: object

In [756]:
STORMS = {
    '2015': ['2015-0470-MEX'],
    '2016': ['2016-0041-FJI'],
    '2017': ['2017-0362-USA'],
    '2018': ['2018-0342-USA', '2018-0341-CHN', '2018-0341-PHL', '2018-0341-HKG'], # two separate storms over same time period
    '2019': ['2019-0492-JPN'],
    '2020': ['2020-0211-LKA', '2020-0211-BGD', '2020-0211-IND']     
}

STORMS_val = {
    '2018-0341-CHN': {'Magnitude': 240},
    '2018-0342-USA': {'Magnitude': 240},
    '2020-0211-BGD': {'Magnitude': 151},
    '2020-0211-LKA': {'Magnitude': 80}
}

HEAT_WAVES = {
    '2015': ['2015-0189-IND'],
    '2016': ['2016-0133-IND'],
    '2017': ['2017-9999-AUS', '2017-0072-AUS'], # heat wave and associated fire
    '2018': ['2018-0226-JPN', '2018-0256-PRK'],
    '2019': ['2019-0366-BEL', '2019-0366-FRA', '2019-0366-NLD', '2019-0366-DEU'], #'2019-0366-AUT' (no value for temp), '2019-0650-GBR' (lasts too long)
    '2020': ['2020-0441-USA'] # Forest fire
}

HEAT_WAVES_val = {
    '2020-0441-USA': {'Magnitude': 4180},
    '2017-0072-AUS': {'Magnitude': 550}
}

In [757]:
df_heat = get_df_disaster(df_emdat, HEAT_WAVES, HEAT_WAVES_val)

In [758]:
df_heat[['Magnitude', 'Scale', 'Name', 'StartDate', 'EndDate','Deaths']]

Unnamed: 0_level_0,Magnitude,Scale,Name,StartDate,EndDate,Deaths
Dis No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-0189-IND,48.0,°C,,2015-05-20,2015-05-31,2248.0
2016-0133-IND,51.0,°C,,2016-04-01,2016-05-20,300.0
2017-9999-AUS,48.2,°C,,2017-01-30,2017-02-14,0.0
2017-0072-AUS,550.0,Km2,Sir Ivan fire,2017-02-09,2017-02-13,0.0
2018-0226-JPN,41.0,°C,,2018-07-01,2018-07-15,119.0
2018-0256-PRK,38.0,°C,,2018-07-11,2018-08-03,0.0
2019-0366-BEL,41.0,°C,,2019-07-19,2019-07-27,400.0
2019-0366-FRA,44.0,°C,,2019-07-21,2019-07-27,868.0
2019-0366-NLD,40.0,°C,,2019-07-22,2019-07-27,400.0
2019-0366-DEU,42.0,°C,,2019-07-24,2019-07-25,0.0


In [763]:
df_heat_bounds = retrieve_bounding_dates(df_heat)

In [764]:
df_heat_bounds

Unnamed: 0_level_0,MinStartDate,MaxEndDate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2015,2015-05-20,2015-05-31
2016,2016-04-01,2016-05-20
2017,2017-01-30,2017-02-14
2018,2018-07-01,2018-08-03
2019,2019-07-19,2019-07-27
2020,2020-08-16,2020-10-01


In [765]:
df_storm = get_df_disaster(df_emdat, STORMS, STORMS_val)

In [766]:
df_storm_bounds = retrieve_bounding_dates(df_storm)

In [728]:
df_storm[['Magnitude', 'Scale', 'Name', 'StartDate', 'EndDate','Deaths']]

Unnamed: 0_level_0,Magnitude,Scale,Name,StartDate,EndDate,Deaths
Dis No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-0470-MEX,270.0,Kph,Hurricane Patricia,2015-10-22,2015-10-28,14.0
2016-0041-FJI,325.0,Kph,Cyclone Winston,2016-02-20,2016-02-21,45.0
2017-0362-USA,215.0,Kph,Hurricane Harvey,2017-08-25,2017-08-29,88.0
2018-0341-CHN,240.0,Kph,Typhoon Mangkut (Ompong),2018-09-10,2018-09-18,0.0
2018-0342-USA,240.0,Kph,Hurricane Florence,2018-09-12,2018-09-18,53.0
2018-0341-PHL,240.0,Kph,Typhoon Mangkut (Ompong),2018-09-16,2018-09-16,84.0
2018-0341-HKG,240.0,Kph,Typhoon Mangkut (Ompong),2018-09-17,2018-09-17,0.0
2019-0492-JPN,160.0,Kph,Tropical cylone 'Hagibis',2019-10-12,2019-10-17,99.0
2020-0211-LKA,80.0,Kph,Cyclone 'Amphan',2020-05-17,2020-05-20,4.0
2020-0211-BGD,151.0,Kph,Cyclone 'Amphan',2020-05-20,2020-05-20,26.0


In [768]:
df_storm_bounds

Unnamed: 0_level_0,MinStartDate,MaxEndDate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2015,2015-10-22,2015-10-28
2016,2016-02-20,2016-02-21
2017,2017-08-25,2017-08-29
2018,2018-09-10,2018-09-18
2019,2019-10-12,2019-10-17
2020,2020-05-17,2020-05-20


In [935]:
YEAR = 2019
nrows = 22000000
chunksize = 100000
compression = 'bz2'
#regex_pattern = r'\b([dD]egrees)\b|\b(celsius)\b|\b(waves?)\b|\b(heat)\b|\b([tT]emperatures?)\b|\b([hH]ot(test)?)\b|\b([wW]arm)\b|\b([wW]eather)\b|\b([cC]limate)\b|\b([cC]limate change)\b|\b([gG]lobal warming)\b|\b([gG]reenhouse)\b'

In [936]:
heat_tags = pd.DataFrame(
    {'tags': [
        r'\b([dD]egrees)\b',
        r'\b(celsius)\b',
        r'\b(waves?)\b',
        r'\b(heat)\b',
        r'\b([tT]emperatures?)\b',
        r'\b([hH]ot(test)?)\b',
        r'\b([wW]arm)\b',
        r'\b([wW]eather)\b']
    }
)  

climate_tags = pd.DataFrame(
    {'tags': [
        r'\b([cC]limate)\b',
        r'\b([cC]limate change)\b',
        r'\b([gG]lobal warming)\b',
        r'\b([gG]reenhouse)\b']
    }
) 

heat_tag_list = heat_tags.tags.values.tolist()
climate_tag_list = climate_tags.tags.values.tolist()
all_tags = heat_tag_list + climate_tag_list
regex_pattern = r'|'.join(all_tags)

In [937]:
nrows // chunksize

220

In [922]:
start_YEAR, end_YEAR = df_heat_bounds.loc[YEAR].MinStartDate, df_heat_bounds.loc[YEAR].MaxEndDate

In [923]:
lower_YEAR, upper_YEAR = compute_date_bounds(start_YEAR, end_YEAR)

In [924]:
#quotes = pd.read_json('data/quotes-'+str(YEAR)+'.json.bz2', lines=True, compression='bz2', chunksize=1000000, nrows=1000000)

In [933]:
chunk_interval_list = []
with pd.read_json('data/quotes-'+str(YEAR)+'.json.bz2',lines=True,compression=compression,chunksize=chunksize) as df_reader:
    for chunk in tqdm(df_reader, total=nrows // chunksize):
        chunk_interval = chunk[(chunk['date'] >= lower_YEAR) & (chunk['date'] <= upper_YEAR)]
        chunk_interval_list.append(chunk_interval[chunk_interval['quotation'].str.contains(regex_pattern)])

KeyboardInterrupt: 

In [938]:
quotes = pd.read_json('data/quotes-'+str(YEAR)+'.json.bz2', lines=True, compression='bz2', chunksize=chunksize, nrows=nrows)

In [939]:
quotes

<pandas.io.json._json.JsonReader at 0x7fb40e613d00>

In [940]:
chunk_interval_list = []
for chunk in tqdm(quotes, total=nrows // chunksize):
    chunk_interval = chunk[(chunk['date'] >= lower_YEAR) & (chunk['date'] <= upper_YEAR)]
    chunk_interval_list.append(chunk_interval[chunk_interval['quotation'].str.contains(regex_pattern)])

 99%|█████████▉| 218/220 [24:15<00:13,  6.67s/it]


In [941]:
len(chunk_interval_list)

218

In [942]:
df_interval = pd.concat(chunk_interval_list)

In [943]:
len(df_interval)

25880

In [946]:
df_interval.to_csv('data/heat_climate_processed.bz2',index=False, compression=compression)

In [944]:
df_interval

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
734,2019-08-14-004573,an [ ideological ] inquisition that harasses a...,,[],2019-08-14 17:00:01,1,"[[None, 0.6228], [Ken Cuccinelli, 0.3772]]",[https://instinctmagazine.com/meet-ken-cuccine...,E
1752,2019-07-18-007660,"brought the Fund into the 21st Century, using ...",Christine Lagarde,[Q484605],2019-07-18 19:19:32,1,"[[Christine Lagarde, 0.7709], [None, 0.2231], ...",[https://www.devex.com/news/who-s-ebola-declar...,E
2316,2019-06-28-012580,Climate change represents an existential threa...,Kamala Harris,[Q10853588],2019-06-28 04:10:00,1,"[[Kamala Harris, 0.6462], [Jay Inslee, 0.1847]...",[https://www.vox.com/policy-and-politics/2019/...,E
2393,2019-08-16-054254,My husband John's favorite four-letter word is...,,[],2019-08-16 06:51:00,3,"[[None, 0.7355], [Rachael Ray, 0.2645]]",[http://www.princegeorgecitizen.com/washington...,E
3854,2019-07-11-020788,Hamburger and hot dog buns at Walmart and othe...,,[],2019-07-11 12:00:30,1,"[[None, 0.5619], [Cory Booker, 0.3997], [Jeffr...",[https://thebiglead.com/2019/07/11/roundup-nin...,E
...,...,...,...,...,...,...,...,...,...
21760194,2019-06-28-013693,cut emissions and fight climate change from th...,,[],2019-06-28 16:39:48,2,"[[None, 0.6513], [Eric Garcetti, 0.3487]]",[http://thehill.com/policy/energy-environment/...,E
21760397,2019-07-18-012446,Eighty-seven degrees can make you just as sick...,Adam O'Connor,[Q4679563],2019-07-18 05:04:16,1,"[[Adam O'Connor, 0.8147], [None, 0.1853]]",[http://journalgazette.net/news/local/20190718...,E
21760773,2019-07-15-018460,Given the critical juncture we are at with cli...,Alice Martin,"[Q31664015, Q4725986]",2019-07-15 09:57:25,1,"[[Alice Martin, 0.9452], [None, 0.0548]]",[http://www.itpro.co.uk/automation/34014/tuc-a...,E
21761480,2019-07-21-009702,I cannot tell if it's the high pressure system...,,[],2019-07-21 23:10:14,1,"[[None, 0.7136], [Padma Lakshmi, 0.2864]]",[https://www.inquisitr.com/5542629/padma-laksh...,E


**To remove unwanted quotes based on regex pattern:**

In [954]:
## Option 1

In [952]:
quote='it makes no sense to keep on letting millions of illegal or legal immigrants flood into the United States, and to keep the tens of millions that are already here.'

In [953]:
quote='check out the huge flood migrant'

In [951]:
import re
word = quote
regex_pos = re.compile(r'flood')
regex_neg = re.compile(r'legal|migrant')
if regex_pos.search(word) and not regex_neg.search(word):
    print("match")


In [955]:
# Option 2 (more efficient)

In [None]:
df.drop(df.filter(regex='my_expression').columns,axis=1)