In [None]:

### -------- IMPORTS -----------------------------------------------------------------------------------------

import pandas as pd
import bz2
import json
import extraction_helpers as eh
import pickle



### ------------------ HELPER FUNCTIONS-----------------------------------------------------------------------

def extract_quotes_keyword(chunk, keyword, not_keywords):
    """Return the rows of ``chunk`` whose quotation contains ``keyword`` but none of ``not_keywords``.

    Matching is case-sensitive and relies on ``Series.str.contains`` with its
    default settings, so ``keyword`` and every entry of ``not_keywords`` are
    interpreted as regular expressions. Rows whose quotation is missing are
    never selected by the keyword and never removed by an exclusion pattern.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame holding a ``"quotation"`` column of strings.
    keyword : str
        Pattern a quotation must contain to be kept.
    not_keywords : iterable of str, str or None
        Patterns that disqualify a quotation. A single string is treated as
        one pattern (instead of being iterated character by character) and
        ``None`` means that nothing is excluded.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``chunk``, keeping their original index labels.
    """
    print(f"Processing chunk with {len(chunk)} rows, looking for quotes containing '{keyword}' but not '{not_keywords}'.")

    # Normalise the exclusion list after logging, so the message shows exactly
    # what the caller passed in.
    if not_keywords is None:
        not_keywords = []
    elif isinstance(not_keywords, str):
        # Iterating a bare string would exclude every quote containing any of
        # its characters, which is never what the caller means.
        not_keywords = [not_keywords]

    selected = chunk[chunk["quotation"].str.contains(keyword, na=False)]
    # Narrow the selection one exclusion pattern at a time; each pass only has
    # to scan the rows that survived the previous ones.
    for not_keyword in not_keywords:
        selected = selected[~selected["quotation"].str.contains(not_keyword, na=False)]
    return selected

def extract_quotes(data, keyword, not_keywords, chunksize = 1000000, compression = 'bz2'):
    """Stream a line-delimited JSON file and collect the quotes matching ``keyword``.

    The file is read ``chunksize`` rows at a time; every chunk is filtered
    with ``extract_quotes_keyword`` and the surviving rows are concatenated
    once at the end.

    Parameters
    ----------
    data : str or path-like
        Location of the line-delimited JSON file, passed to ``pd.read_json``.
    keyword : str
        Pattern a quotation must contain to be kept.
    not_keywords : iterable of str
        Patterns that disqualify a quotation.
    chunksize : int, default 1000000
        Number of rows read per chunk.
    compression : str, default 'bz2'
        Compression of ``data``, passed to ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame
        All matching rows with a fresh 0-based index and the columns and
        dtypes of the data itself. It is empty (but still has columns) when
        nothing matches.
    """
    matching_chunks = []
    data_columns = None

    with pd.read_json(data, lines=True, compression=compression, chunksize=chunksize) as df_reader:
        for chunk in df_reader:
            if data_columns is None:
                # Remember the schema so an empty result still has columns.
                data_columns = chunk.columns
            quotes = extract_quotes_keyword(chunk, keyword, not_keywords)
            if not quotes.empty:
                matching_chunks.append(quotes)

    if not matching_chunks:
        if data_columns is None:
            # The file produced no chunk at all: fall back to the column names
            # this function has always used for an empty result.
            # NOTE(review): 'numOccurences' spelling is kept as-is; verify it
            # against the column name actually used in the dataset.
            data_columns = ['quoteID', 'quotation', 'speaker', 'qids',
                            'date', 'probas', 'numOccurences', 'phase']
        return pd.DataFrame(columns=data_columns)

    # One concat at the end avoids re-copying the accumulated rows per chunk
    # and needs no placeholder row (which would force object dtypes).
    return pd.concat(matching_chunks, ignore_index=True)


In [None]:
# Load the previously extracted, unfiltered quotes. The `with` block closes the
# file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code; only open pickle files that
# were produced by this project.
with open("extracted_data_other/output_2020 speakers_False keywords_['flood'].pickle", "rb") as pickle_file:
    dict_df = pickle.load(pickle_file)

Shape of the unfiltered extracted data

In [None]:
# Shape of the entry stored under the 'flood' key of the loaded pickle.
dict_df['flood'].shape

In [None]:
# Expressions that contain "flood" but that we want to exclude from the extraction.
not_keywords = ['flooded by', 'flooded with', 'a flood of', 'floodgate', 'flood gate', 'flood plain', 'floodplain']
# bz2-compressed, line-delimited JSON file to scan.
data = 'data/quotes-2020.json.bz2'
# Pattern a quotation must contain to be kept (matching is case-sensitive).
keyword = 'flood'

In [None]:
# Re-run the extraction on the compressed file, this time dropping the excluded
# expressions. The file is read chunk by chunk and one progress line is printed per chunk.
df_filter = extract_quotes(data, keyword=keyword, not_keywords=not_keywords)

Shape of the filtered extracted data

In [None]:
# Shape after filtering, to compare with the unfiltered shape shown earlier.
df_filter.shape

We were able to filter out about 10% of the data, namely quotes that do not relate to actual floods.

In [None]:
# Preview the first 12 remaining quotations to spot-check the filter.
df_filter['quotation'].head(12)

As we can see, a few quotations still do not talk about actual floods. These will be filtered in more detail in the last milestone; however, we expect these unwanted quotes to be evenly distributed over time, so they should not pose great issues. Indeed, their rate of occurrence is normally not related to any external events, as they are simply part of the language.