# Process (Filter) Data Extracted From Quotebank For Climate Change

**Imports**

In [15]:
import pandas as pd
import re # Used for verifying and running small tests on regex patterns
import datetime

from disaster_extr_constants import *
from disaster_extr_helpers import *

**Define the parameters of the dataset to filter, then load it**

In [2]:
# Year and disaster category that identify which extracted file to process.
YEAR = 2017
disaster_type = 'heat_wave'
# Path of the bz2-compressed CSV, assembled from the two parameters above.
file_path = f'data/{YEAR}_{disaster_type}_climate_processed_csv.bz2'

In [3]:
# Load the compressed CSV, parse the 'date' column as datetimes, and order rows chronologically.
disaster_df = (
    pd.read_csv(file_path, parse_dates=['date'], compression='bz2')
    .sort_values(by='date')
)

In [4]:
# Preview the first four rows of the loaded frame.
disaster_df.head(n=4)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
4111,2017-01-09-001120,"The Witch Fire alone burned 197,000 acres,",Christopher Lyon,"['Q5112776', 'Q58323293']",2017-01-09 18:43:40,1,"[['Christopher Lyon', '0.8254'], ['None', '0.1...",['http://eastcountymagazine.org/sdge-makes-it-...,E
7088,2017-01-09-000073,"And I'm crying, cause I'm like, `My first slee...",Millie Bobby Brown,['Q25936414'],2017-01-09 19:02:43,2,"[['Millie Bobby Brown', '0.9293'], ['None', '0...",['http://www.justjared.com/2017/01/09/millie-b...,E
4959,2017-01-09-000915,So I go over and at like 3:00 in the morning t...,Millie Bobby Brown,['Q25936414'],2017-01-09 19:02:43,2,"[['Millie Bobby Brown', '0.9009'], ['None', '0...",['http://www.justjared.com/2017/01/09/millie-b...,E
4416,2017-01-09-000460,"I thought Toots was on fire tonight,",,[],2017-01-09 19:51:25,1,"[['None', '0.4198'], ['Patrick Kane', '0.3733'...",['http://wgnradio.com/2017/01/09/blackhawks-fo...,E


In [5]:
# Collect the values of the `tags` attribute of climate_tags_pos into a plain Python list.
# climate_tags_pos is not defined in this notebook; presumably it is provided by one of the
# `from ... import *` lines in the import cell — TODO confirm which module.
tags_pos_list = climate_tags_pos.tags.values.tolist()

In [6]:
# Display the patterns for a visual check.
# NOTE(review): in the third pattern of this cell's saved output, '|[eE]ffects?|[eE]missions?'
# sits outside the '[gG]reenhouse (...)' sub-group, so a bare "effect(s)" or "emission(s)"
# matches on its own, without "greenhouse" — confirm whether that is intended; if not, fix the
# parentheses where the tags are defined.
tags_pos_list

['\\b([cC]limate ([iI]mpact|[cC]hange|[cC]risis|[mM]odel|[eE]mergency))\\b',
 '\\b([gG]lobal [wW]arming)\\b',
 '\\b([gG]reenhouse ([gG]as(es)?)|[eE]ffects?|[eE]missions?)\\b']

In [7]:
# Combine the individual tag patterns into one regex alternation (pattern1|pattern2|...).
regex_pattern_pos = r'|'.join(tags_pos_list)

In [8]:
# Run the combined pattern through the project helper extract_quotes, then order by date.
# The helper is defined outside this notebook; the warning text in the saved output points at
# `df[field].str.contains(regex)`, so presumably it keeps rows whose text matches — verify.
filtered_for_climate = (
    extract_quotes(disaster_df, regex_pattern_pos)
    .sort_values(by='date')
)

  mask = df[field].str.contains(regex)


In [9]:
# Number of rows that remain after filtering.
filtered_for_climate.shape[0]

1560

In [10]:
# Preview the first four filtered rows.
filtered_for_climate.head(n=4)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
366,2017-01-11-000270,"But without bolder action, our children won't ...",President Barack Obama,['Q76'],2017-01-11 03:07:35,1,"[['President Barack Obama', '0.859'], ['None',...",['http://www.thewrap.com/obama-says-goodbye-ho...,E
8337,2017-01-11-001557,The truth is that climate change shouldn't be ...,John Kerry,"['Q22316', 'Q6242998']",2017-01-11 12:13:56,1,"[['John Kerry', '0.7591'], ['None', '0.224'], ...",['https://www.ibcworldnews.com/2017/01/11/obam...,E
5217,2017-01-12-000261,Columbia Gas of Ohio is proud to be a part of ...,,[],2017-01-12 05:23:47,1,"[['None', '0.926'], ['G. Thompson', '0.074']]",['http://thelantern.com/2017/01/ohio-state-bri...,E
8608,2017-01-12-001651,These Utahns see the effects of climate change...,Sheldon Whitehouse,['Q652066'],2017-01-12 15:00:05,1,"[['Sheldon Whitehouse', '0.8915'], ['None', '0...",['http://universe.byu.edu/2017/01/12/scientist...,E


In [11]:
# Exploratory spot-check: run the already-filtered quotes through extract_quotes with a
# "conference(s)" pattern. The result is only displayed, never assigned.
# The pattern deliberately has no capture group: the warning text in the saved output points
# at `df[field].str.contains(regex)`, and pandas warns when a pattern given to `str.contains`
# contains match groups. Dropping the redundant group matches exactly the same text.
# complement=False is passed through unchanged; its meaning is defined by the helper.
extract_quotes(filtered_for_climate, r'\b[cC]onferences?\b', complement=False).head(4)

  mask = df[field].str.contains(regex)


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
10864,2017-01-30-006094,may have been a slump in the membership of the...,Mr Ward,['Q6929478'],2017-01-30 14:14:19,1,"[['Mr Ward', '0.9045'], ['None', '0.0826'], ['...",['http://www.dailyclimate.org/t/63981600425202...,E
3834,2017-02-15-086160,This disappearance of the hiatus in global war...,William Happer,"['Q7109821', 'Q8010838']",2017-02-15 13:13:33,1,"[['William Happer', '0.606'], ['None', '0.3143...",['http://www.environmentalhealthnews.org/t/-45...,E
2360,2017-02-17-000358,A high-level whistleblower has told this newsp...,,[],2017-02-17 22:46:52,1,"[['None', '0.8326'], ['John Bates', '0.1674']]",['http://tavoix.wordpress.com/2017/02/17/clima...,E
8109,2017-02-20-024963,Fourth Santa Fe Conference on Global & Regiona...,,[],2017-02-20 22:18:41,1,"[['None', '0.8948'], ['Chris Essex', '0.1052']]",['http://www.desmogblog.com/christopher-essex'],E


In [12]:
# Echo the run parameters as a tuple.
(YEAR, disaster_type)

(2017, 'heat_wave')

In [13]:
# Plain assignment: output_df becomes a second name for the same object as
# filtered_for_climate (no copy is made).
output_df = filtered_for_climate

In [14]:
# Preview the first four rows of the frame selected for output.
output_df.head(n=4)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
366,2017-01-11-000270,"But without bolder action, our children won't ...",President Barack Obama,['Q76'],2017-01-11 03:07:35,1,"[['President Barack Obama', '0.859'], ['None',...",['http://www.thewrap.com/obama-says-goodbye-ho...,E
8337,2017-01-11-001557,The truth is that climate change shouldn't be ...,John Kerry,"['Q22316', 'Q6242998']",2017-01-11 12:13:56,1,"[['John Kerry', '0.7591'], ['None', '0.224'], ...",['https://www.ibcworldnews.com/2017/01/11/obam...,E
5217,2017-01-12-000261,Columbia Gas of Ohio is proud to be a part of ...,,[],2017-01-12 05:23:47,1,"[['None', '0.926'], ['G. Thompson', '0.074']]",['http://thelantern.com/2017/01/ohio-state-bri...,E
8608,2017-01-12-001651,These Utahns see the effects of climate change...,Sheldon Whitehouse,['Q652066'],2017-01-12 15:00:05,1,"[['Sheldon Whitehouse', '0.8915'], ['None', '0...",['http://universe.byu.edu/2017/01/12/scientist...,E


In [None]:
# Hand the filtered frame to the project helper write_df_to_disk together with the run
# parameters. The helper is defined outside this notebook; presumably it derives the output
# file name from these arguments — verify against the helper.
write_df_to_disk(
    output_df,
    disaster_type,
    YEAR,
    additional_text='climate_filtered',
)