In [1]:

### -------- IMPORTS -----------------------------------------------------------------------------------------

import pandas as pd
import bz2
import json
import extraction_helpers as eh
import pickle



### ------------------ HELPER FUNCTIONS-----------------------------------------------------------------------

def extract_quotes_keyword(chunk, keyword, not_keywords):
    """Keep the rows of ``chunk`` whose quotation contains ``keyword`` but none of ``not_keywords``.

    Matching is case-sensitive and goes through ``Series.str.contains`` with
    its default settings, so ``keyword`` and every exclusion pattern are
    interpreted as regular expressions. Rows whose quotation is missing are
    never selected (``na=False``).

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame with a ``"quotation"`` column.
    keyword : str
        Pattern a quotation must contain to be kept.
    not_keywords : str, iterable of str, or None
        Pattern(s) that disqualify a quotation. A single string is treated as
        one pattern (not as a sequence of characters); ``None`` means no
        exclusions.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``chunk``, with the original index labels and
        columns.
    """
    # Progress message; printed before normalization so it echoes the caller's argument as given.
    print(f"Processing chunk with {len(chunk)} rows, looking for quotes containing '{keyword}' but not '{not_keywords}'.")

    # Iterating a bare string would test every single character as an
    # exclusion pattern and discard nearly all rows, so wrap it in a list.
    if not_keywords is None:
        not_keywords = []
    elif isinstance(not_keywords, str):
        not_keywords = [not_keywords]

    df = chunk[chunk["quotation"].str.contains(keyword, na=False)]
    # Exclusions are applied to the already-reduced frame, so each pattern
    # only scans the candidate rows rather than the whole chunk.
    for not_keyword in not_keywords:
        df = df[~df["quotation"].str.contains(not_keyword, na=False)]
    return df

def extract_quotes(data, keyword, not_keywords, chunksize = 1000000, compression = 'bz2'):
    """Stream a JSON-lines file chunk by chunk and collect the quotes that match.

    Each chunk is filtered with ``extract_quotes_keyword`` and the per-chunk
    results are combined into one frame.

    Parameters
    ----------
    data : path or file-like
        JSON-lines source, passed straight to ``pandas.read_json``.
    keyword : str
        Pattern a quotation must contain to be kept.
    not_keywords : iterable of str
        Patterns that disqualify a quotation.
    chunksize : int, default 1000000
        Number of lines read per chunk.
    compression : str, default 'bz2'
        Compression of ``data``, passed to ``pandas.read_json``.

    Returns
    -------
    pandas.DataFrame
        All matching rows. The placeholder columns listed below come first,
        followed by any further columns found in the data. Index labels start
        at 1 because the placeholder row (label 0) is dropped.
    """
    # Placeholder row that fixes the leading column order; it is removed again
    # before returning.
    # NOTE(review): the placeholder spells the column 'numOccurences'. If the
    # data uses a different spelling, the result carries both columns and the
    # placeholder one stays empty - confirm against the data schema.
    # NOTE(review): the empty-string placeholders presumably leave the shared
    # columns with object dtype after concatenation - verify if dtypes matter.
    seed = pd.DataFrame({'quoteID':'', 'quotation':'', 'speaker':'',\
        'qids':'', 'date':'', 'probas':'', 'numOccurences':'', 'phase':''}, index = [0])

    with pd.read_json(data, lines=True, compression=compression, chunksize=chunksize) as df_reader:
        matches = [extract_quotes_keyword(chunk, keyword, not_keywords) for chunk in df_reader]

    # Concatenate once at the end: concatenating inside the loop re-copies all
    # previously collected rows for every chunk.
    df = pd.concat([seed, *matches], ignore_index=True)
    return df.iloc[1:,:]


In [2]:
# Load the previously extracted quotes; the pickle holds a mapping keyed by keyword.
# NOTE(review): pickle.load can execute arbitrary code - only load pickles produced by this project.
# The name `filename` is kept for backward compatibility even though it is bound
# to the file object; the `with` block makes sure the handle is closed after loading.
with open("extracted_data_other/output_2020 speakers_False keywords_['flood'].pickle", "rb") as filename:
    dict_df = pickle.load(filename)

Shape of the unfiltered extracted data

In [3]:
# (rows, columns) of the 'flood' frame loaded from the pickle, before any exclusion filter
dict_df['flood'].shape

(5033, 10)

In [4]:
# Filter configuration for the 2020 quotes dump.
keyword = 'flood'
data = 'data/quotes-2020.json.bz2'

# Phrases that contain 'flood' but are treated as not referring to an actual flood.
not_keywords = [
    'flooded by',
    'flooded with',
    'a flood of',
    'floodgate',
    'flood gate',
    'flood plain',
    'floodplain',
]

In [5]:
# Re-run the extraction on the compressed dump with the exclusion list applied.
# This streams the whole file in chunks, so the cell is slow to execute.
df_filter = extract_quotes(
    data,
    keyword=keyword,
    not_keywords=not_keywords,
)

Processing chunk with 1000000 rows, looking for quotes containing 'flood' but not '['flooded by', 'flooded with', 'a flood of', 'floodgate', 'flood gate', 'flood plain', 'floodplain']'.
Processing chunk with 1000000 rows, looking for quotes containing 'flood' but not '['flooded by', 'flooded with', 'a flood of', 'floodgate', 'flood gate', 'flood plain', 'floodplain']'.
Processing chunk with 1000000 rows, looking for quotes containing 'flood' but not '['flooded by', 'flooded with', 'a flood of', 'floodgate', 'flood gate', 'flood plain', 'floodplain']'.
Processing chunk with 1000000 rows, looking for quotes containing 'flood' but not '['flooded by', 'flooded with', 'a flood of', 'floodgate', 'flood gate', 'flood plain', 'floodplain']'.
Processing chunk with 1000000 rows, looking for quotes containing 'flood' but not '['flooded by', 'flooded with', 'a flood of', 'floodgate', 'flood gate', 'flood plain', 'floodplain']'.
Processing chunk with 244449 rows, looking for quotes containing 'floo

Shape of the filtered extracted data

In [6]:
# (rows, columns) remaining after the not_keywords exclusions
df_filter.shape

(4476, 10)

The exclusion list removed 557 of the 5,033 extracted quotes (about 11%), i.e. quotes that contain "flood" but do not relate to actual floods.

In [7]:
# Preview the first 12 retained quotations to spot-check the filter
df_filter['quotation'].head(12)

1     How will they attend to the problems of the pe...
2     Having witnessed significant devastation this ...
3     When the flood goes to 12 metres... Ergon will...
4     Nebraskans in District 23 and across the state...
5     As water recedes over the weekend, the highway...
6     Because of the heavy rain predicted again this...
7     Opening the dams caused soil erosion and cause...
8     Some other church events might be canceled -- ...
9     The 14 volunteer Flood Wardens in the communit...
10    The heartbreak of shame and guilt came floodin...
11    I don't know why we're not flooding the zone w...
12    Recent weather in the Barkly region has floode...
Name: quotation, dtype: object

As we can see, a few quotations still do not talk about actual floods (e.g. rows 10 and 11, which use "flooding" figuratively). These will be filtered in more detail in the last milestone. We also expect these unwanted quotes to be roughly evenly distributed over time, so they should not pose major issues: their rate of occurrence is normally unrelated to external events, as they are simply part of everyday language.