# Libraries

In [1]:
import pandas as pd
import numpy as np

# Functions

In [53]:
#Processing on chunk
                
def process_chunk(chunk, vocabulary):
    """Count, for every vocabulary entry, how many quotations of the chunk contain it.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Must expose a 'quotation' column holding the quote texts.
    vocabulary : sequence of str
        Words or expressions to look for. Each entry is matched literally and
        case-sensitively (it is NOT interpreted as a regular expression).

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(vocabulary)``; position ``i`` holds the number
        of rows whose quotation contains ``vocabulary[i]``.
    """
    print(f'Processing chunk with {len(chunk)} rows')
    quotations = chunk['quotation']
    occurences = np.zeros(len(vocabulary))
    for index, word in enumerate(vocabulary):
        # regex=False: characters such as '.', '(' or '+' in a keyword must be matched
        # as-is instead of being parsed as a pattern.
        # na=False: a missing quotation counts as "does not contain the word".
        occurences[index] = quotations.str.contains(word, regex=False, na=False).sum()
    return occurences

#Select quotes containing keywords
def select_quotes_chunk(chunk, keywords):
    """Return the rows of `chunk` whose quotation contains at least one keyword.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Must expose a 'quotation' column holding the quote texts.
    keywords : sequence of str
        Expressions to look for. Each one is matched literally and
        case-sensitively.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index. Empty when `keywords`
        is empty.
    """
    import re  # local import: only needed to escape the keywords below

    print(f'Processing chunk with {len(chunk)} rows')
    if len(keywords) == 0:
        # An empty alternation would be the empty pattern, which matches every quote.
        return chunk.iloc[0:0]
    # Escape each keyword so that regex metacharacters ('.', '+', '(' ...) are literal.
    pattern = '|'.join(re.escape(keyword) for keyword in keywords)
    # na=False keeps the mask strictly boolean: a NaN in the mask makes the
    # boolean indexing below raise.
    mask = chunk['quotation'].str.contains(pattern, na=False)
    return chunk[mask]

#Use the selection function on each chunk of the full dataset 
def select_quotes_one_year(path_to_file, vocabulary, chunksize = 10 ** 4):
    """Select, chunk by chunk, the quotes of one bz2-compressed JSON-lines file.

    Parameters
    ----------
    path_to_file : str
        Path of the '.json.bz2' file (one JSON object per line).
    vocabulary : sequence of str
        Keywords forwarded to `select_quotes_chunk`.
    chunksize : int
        Number of lines read per chunk.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the rows selected in every chunk (row index preserved).
        An empty DataFrame when the file yields no chunk at all.
    """
    selections = []
    with pd.read_json(path_to_file, lines=True, compression='bz2', chunksize=chunksize) as df_reader:
        for chunk in df_reader:
            selections.append(select_quotes_chunk(chunk, vocabulary))
    if not selections:
        # Nothing was read: return an empty frame instead of failing on an unbound name.
        return pd.DataFrame()
    # Concatenate once at the end; re-concatenating inside the loop copies the
    # accumulated frame at every chunk.
    return pd.concat(selections)

#Use the selection function on each chunk of the full dataset 
#Dumps the selected quotes into a new json file
def select_and_dump(path_to_file, vocabulary, chunksize = 10 ** 4, output_dir = 'files/'):
    """Select the quotes of one bz2 JSON-lines file and pickle each chunk's selection.

    Every chunk's selection is written to its own randomly named '.pkl' file in
    `output_dir`, so the full selection never has to be held in memory.

    Parameters
    ----------
    path_to_file : str
        Path of the '.json.bz2' file (one JSON object per line).
    vocabulary : sequence of str
        Keywords forwarded to `select_quotes_chunk`.
    chunksize : int
        Number of lines read per chunk.
    output_dir : str
        Folder receiving the pickle files; created when missing.

    Returns
    -------
    pandas.DataFrame
        The selection of the LAST chunk only (not the whole file); an empty
        DataFrame when the file yields no chunk. The complete selection has to
        be rebuilt from the pickle files.
    """
    import os  # local import: `os` is not imported at the top of the notebook

    os.makedirs(output_dir, exist_ok=True)
    selected_df = pd.DataFrame()
    with pd.read_json(path_to_file, lines=True, compression='bz2', chunksize=chunksize) as df_reader:
        for chunk in df_reader:
            #Dump selected quotes
            selected_df = select_quotes_chunk(chunk, vocabulary)
            pickle_file_name = randomword(10) + '.pkl'
            selected_df.to_pickle(os.path.join(output_dir, pickle_file_name))
    return selected_df


import random, string

def randomword(length):
    """Build a pseudo-random word made of `length` lowercase ASCII letters.

    One letter is drawn with `random.choice` per position; `length` of 0
    gives the empty string.
    """
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

### INPUT PARAMETERS

In [62]:
# Folder holding the compressed Quotebank dumps (relative path).
DATA_FOLDER = '../data/'
QUOTEBANK_2020 = f"{DATA_FOLDER}quotes-2020.json.bz2"
QUOTEBANK_2019 = f"{DATA_FOLDER}quotes-2019.json.bz2"

# Expressions used to select the quotes of interest.
# NOTE(review): several entries appear twice ('gender gap', 'Gender equality',
# 'women oppression') and some look misspelled ('Sexual harrassment',
# "women's opression", 'religious oppresion') -- confirm whether that is intended.
KEYWORDS_LIST = (
    "women's right", 'Equal opportunities', 'Equal rights', 'Equal status',
    'equal pay', 'gender gap', 'Gender discrimination', 'Gender equality',
    'Sexual harrassment', 'Women empowerment', 'women victim', 'women immigration',
    'Women emancipation', "women's participation", 'Western women', 'non-western woman',
    'Muslim women', 'Equal wages', 'Gender equality', 'gender equity',
    'Men and women', 'women and men', 'women oppression', 'niqab ban',
    'struggle of girls', 'struggle of women', 'war against women', 'oppression of girls',
    'oppression of women', 'women oppression', "women's opression", 'liberate women',
    'religious oppresion', 'abuse of women', 'Male oppression', 'Female oppression',
    'Exploitation of women', 'Indigenous women', 'Patriarchal culture', 'gender equality',
    'child care', 'men pay', 'percentage men', 'pay percentage',
    'sexual harassment', 'women girls', 'girls women', 'first time',
    'rates women', 'women according', 'female mayors', 'share women',
    'women movement', 'see women', 'gender stereotypes', 'gender gap',
    'women representation', 'sex discrimination', 'states women', 'lose weight',
    'women rights', 'woman time', 'based gender', 'proportional electoral',
    'female candidates', 'gender-based violence', 'entirely female', 'cities female',
)

## Test on 2020 dataset

In [32]:
%time QOI_2020_DF = select_quotes_one_year(QUOTEBANK_2020,KEYWORDS_LIST,10 ** 4)

Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing

It took 6 min 55 s to select the quotes in the 2020 set.

In [34]:
len(QOI_2020_DF)

19016

In [37]:
QOI_2020_DF.head(15)['quotation']

433     For the first time ever with #SilentWitness I ...
649     I feel like 11 years I've been playing in the ...
918     I'm grateful to our NBCUniversal Content Studi...
1703    The first time I met him, I saw that he's a re...
1961    This case is the first time that procedure has...
1992    This is really, really exciting. I'm so excite...
2124    We are in the midst of one of the greatest gen...
2288    We will certainly see it for the very first ti...
2391    When I was still shooting [ 2016's ] `Suicide ...
2971    For the first time in almost half a century, p...
3350    I thought, `Well, this is a new way to age. Wh...
3539    In this particular moment, we really feel like...
3668    It's an extraordinary privilege to be elected ...
3793    Look, health care is a crisis in this country....
4197    That was around the time I was getting involve...
Name: quotation, dtype: object

Let's now try the variant that dumps each chunk's selection to disk.

In [54]:
%time QOI_2020_DF_Dumped = select_and_dump(QUOTEBANK_2020,KEYWORDS_LIST,10 ** 4)

Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing

It took 7 min 4 s to select and dump.

In [61]:
#loading the dataframes from pickle
import os
# Keep only the '.pkl' dumps (the folder listing may contain other entries) and
# sort them so the concatenation order is the same on every run.
files = sorted(fp for fp in os.listdir('./files') if fp.endswith('.pkl'))
# NOTE: read_pickle can execute arbitrary code; only load files this notebook produced.
qoi_2020_df_from_pickle = pd.concat([pd.read_pickle(os.path.join('files', fp)) for fp in files], ignore_index=True)
# Only the last expression of a cell is displayed, so the former mid-cell .head() was a no-op.
len(qoi_2020_df_from_pickle)

19016

In [63]:
# NOTE(review): select_and_dump writes its pickles to 'files/', the folder that
# already holds the 2020 dumps, so loading that folder afterwards returns both
# years mixed together. The value returned here is only the last chunk's selection.
%time QOI_2019_DF_Dumped = select_and_dump(QUOTEBANK_2019,KEYWORDS_LIST,10 ** 4)

Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing chunk with 10000 rows
Processing

In [65]:
#loading the dataframes from pickle
import os
# Keep only the '.pkl' dumps (the folder listing may contain other entries) and
# sort them so the concatenation order is the same on every run.
files = sorted(fp for fp in os.listdir('./files') if fp.endswith('.pkl'))
# NOTE(review): despite its name, this frame holds every dump found in ./files;
# after the 2019 run that folder contains the 2019 and 2020 selections together.
# NOTE: read_pickle can execute arbitrary code; only load files this notebook produced.
qoi_2020_df_from_pickle = pd.concat([pd.read_pickle(os.path.join('files', fp)) for fp in files], ignore_index=True)
# Only the last expression of a cell is displayed, so the former mid-cell .head() was a no-op.
len(qoi_2020_df_from_pickle)

106233

# Scratch

In [18]:
QUOTEBANK_2020

'data/quotes-2020.json.bz2'

In [24]:
# Scratch check: build the chunked reader, then iterate over it.
# NOTE(review): the recorded run of this cell ended with
# "ValueError: Expected object or value". QUOTEBANK_2020 was
# 'data/quotes-2020.json.bz2' at that point (see the previous cell's output),
# presumably a path that did not resolve -- confirm.
pd.read_json(QUOTEBANK_2020, lines=True, compression='bz2', chunksize=10 ** 4)


# Prints one marker line per chunk read.
for chunk in pd.read_json(QUOTEBANK_2020, lines=True, compression='bz2', chunksize=10 ** 4):
    print('chunk')
            

ValueError: Expected object or value

for the entire dataset

In [14]:
# Paths of the per-year "nytimes" quote files, 2020 down to 2015.
# NOTE(review): GENERATED_DATA_FOLDER is not defined in the cells visible here;
# the recorded run of this cell raised NameError. Define it before this cell.
QUOTES_2020 = GENERATED_DATA_FOLDER + "quotes-2020-nytimes.json.bz2"
QUOTES_2019 = GENERATED_DATA_FOLDER + "quotes-2019-nytimes.json.bz2"
QUOTES_2018 = GENERATED_DATA_FOLDER + "quotes-2018-nytimes.json.bz2"
QUOTES_2017 = GENERATED_DATA_FOLDER + "quotes-2017-nytimes.json.bz2"
QUOTES_2016 = GENERATED_DATA_FOLDER + "quotes-2016-nytimes.json.bz2"
QUOTES_2015 = GENERATED_DATA_FOLDER + "quotes-2015-nytimes.json.bz2"

# Ordered from the most recent year to the oldest.
# NOTE(review): later cells pass `Quotebank_tuple_test`, not this tuple -- confirm which is meant.
Quotebank_tuple = (QUOTES_2020, QUOTES_2019, QUOTES_2018, QUOTES_2017, QUOTES_2016, QUOTES_2015)

NameError: name 'GENERATED_DATA_FOLDER' is not defined

# Counting occurrences on the whole dataset (code test)

In [25]:
def process_chunk(chunk, vocabulary):
    """Count how many quotations of `chunk` contain each vocabulary entry.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Must expose a 'quotation' column holding the quote texts.
    vocabulary : sequence of str
        Words or expressions to look for; each is matched literally and
        case-sensitively, not as a regular expression.

    Returns
    -------
    numpy.ndarray
        Float array aligned with `vocabulary`: entry ``i`` is the number of rows
        whose quotation contains ``vocabulary[i]``.
    """
    print(f'Processing chunk with {len(chunk)} rows')
    quotations = chunk['quotation']
    occurences = np.zeros(len(vocabulary))
    for index, word in enumerate(vocabulary):
        # Literal matching (regex=False) keeps keywords containing '.', '(' or '+'
        # from being parsed as patterns; na=False treats a missing quotation as
        # "no match" instead of propagating NaN.
        occurences[index] = quotations.str.contains(word, regex=False, na=False).sum()
    return occurences

def colab_count_occurences_word(path_to_file, vocabulary, chunksize = 10 ** 4):
    """Sum the per-chunk keyword counts of one bz2-compressed JSON-lines file.

    Same computation as `count_occurences_word`, but the chunked reader is
    iterated directly instead of being used as a context manager
    (presumably for environments where the reader cannot be used in a
    `with` statement -- confirm).

    Returns a float array aligned with `vocabulary`.
    """
    running_total = np.zeros(len(vocabulary))
    chunk_iterator = pd.read_json(path_to_file, lines=True, compression='bz2', chunksize=chunksize)
    for piece in chunk_iterator:
        running_total = np.add(running_total, process_chunk(piece, vocabulary))
    return running_total

def count_occurences_word(path_to_file, vocabulary, chunksize = 10 ** 4):
    """Sum the per-chunk keyword counts of one bz2-compressed JSON-lines file.

    The file is read `chunksize` lines at a time; each chunk is handed to
    `process_chunk` and the resulting arrays are added together.

    Returns a float array aligned with `vocabulary`.
    """
    totals = np.zeros(len(vocabulary))
    with pd.read_json(path_to_file, lines=True, compression='bz2', chunksize=chunksize) as chunk_reader:
        per_chunk_counts = (process_chunk(piece, vocabulary) for piece in chunk_reader)
        # sum() starts from the zero array and adds each chunk's counts in reading order.
        totals = sum(per_chunk_counts, totals)
    return totals

def count_occurences_total(Quotebank_tuple, vocabulary, chunksize = 10 ** 6,
                           years = ('2020','2019','2018','2017','2016','2015')):
    """Accumulate the keyword counts over several quote files.

    Parameters
    ----------
    Quotebank_tuple : sequence of str
        Paths of the files to process, in the same order as `years`.
    vocabulary : sequence of str
        Keywords forwarded to `count_occurences_word`.
    chunksize : int
        Number of lines read per chunk.
    years : sequence of str
        Labels printed in the progress banner, one per file. Files beyond the
        end of `years` get a positional label instead of raising IndexError.

    Returns
    -------
    numpy.ndarray
        Float array aligned with `vocabulary`, summed over all files.

    After each file, the running (cumulative) totals are printed for every word.
    """
    occ = np.zeros(len(vocabulary))
    for i, path_to_file in enumerate(Quotebank_tuple):
        label = years[i] if i < len(years) else f'file #{i}'
        print('||||||||||||||||||||||||||year : ', label,'||||||||||||||||||||||||||')
        occ += count_occurences_word(path_to_file, vocabulary, chunksize = chunksize)
        for index, word in enumerate(vocabulary):
            print(occ[index], 'occurences of the word' , word)
    return occ

In [7]:
# Words counted / selected in the test runs below.
# NOTE(review): 'Islam ', 'Veil ' and 'Victims ' end with a space, presumably to
# avoid matching longer words -- confirm.
vocabulary = (
    'Muslim', 'Islam ', 'Veil ', 'Burqa', 'sharia',
    'ideology', 'Western', 'migrant', 'Immigration', 'Victims ',
    'Protect', 'Save', 'Violence', 'Men', 'Dangerous',
    'extremist', 'attacks', 'terrorism', 'misogyny', 'sexism',
    'Equal wages', 'Gender equality', 'equity', 'gender gap', 'femonationalism',
)

In [8]:
# NOTE(review): Quotebank_tuple_test is not defined in any cell visible here; this
# cell (and the later ones using it) only runs if the kernel already holds that name.
%time count_occurences_word(Quotebank_tuple_test[0], vocabulary, chunksize = 10 ** 6)

Processing chunk with 207527 rows
Wall time: 16.6 s


array([211.,  34.,   3.,   0.,   0.,  96., 167., 367.,  21.,   5.,  45.,
        29.,  24., 157.,  20.,  67., 244., 166.,  20.,  33.,   0.,   0.,
        82.,   1.,   0.])

In [9]:
%time count_occurences_total(Quotebank_tuple_test, vocabulary, chunksize = 10 ** 6)

||||||||||||||||||||||||||year :  2020 ||||||||||||||||||||||||||
Processing chunk with 207527 rows
211.0 occurences of the word Muslim
34.0 occurences of the word Islam 
3.0 occurences of the word Veil 
0.0 occurences of the word Burqa
0.0 occurences of the word sharia
96.0 occurences of the word ideology
167.0 occurences of the word Western
367.0 occurences of the word migrant
21.0 occurences of the word Immigration
5.0 occurences of the word Victims 
45.0 occurences of the word Protect
29.0 occurences of the word Save
24.0 occurences of the word Violence
157.0 occurences of the word Men
20.0 occurences of the word Dangerous
67.0 occurences of the word extremist
244.0 occurences of the word attacks
166.0 occurences of the word terrorism
20.0 occurences of the word misogyny
33.0 occurences of the word sexism
0.0 occurences of the word Equal wages
0.0 occurences of the word Gender equality
82.0 occurences of the word equity
1.0 occurences of the word gender gap
0.0 occurences of the wo

array([1266.,  204.,   18.,    0.,    0.,  576., 1002., 2202.,  126.,
         30.,  270.,  174.,  144.,  942.,  120.,  402., 1464.,  996.,
        120.,  198.,    0.,    0.,  492.,    6.,    0.])

# Whole dataset : selecting and saving the quotes we want to keep

In [26]:



def select_quotes_all_years(Quotebank_tuple, vocabulary, chunksize = 10 ** 6,
                            years = ('2020','2019','2018','2017','2016','2015')):
    """Select the quotes of interest in several files and stack the results.

    Parameters
    ----------
    Quotebank_tuple : sequence of str
        Paths of the files to process, in the same order as `years`.
    vocabulary : sequence of str
        Keywords forwarded to `select_quotes_one_year`.
    chunksize : int
        Number of lines read per chunk.
    years : sequence of str
        Labels printed in the progress banner, one per file. Files beyond the
        end of `years` get a positional label instead of raising IndexError.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-file selections, in input order; an empty
        DataFrame when `Quotebank_tuple` is empty.
    """
    selected_df_all_years = []
    for i, path_to_file in enumerate(Quotebank_tuple):
        label = years[i] if i < len(years) else f'file #{i}'
        print('____________________________________________________ YEAR : ', label,'')
        selected_df_all_years.append(
            select_quotes_one_year(path_to_file, vocabulary, chunksize = chunksize)
        )
    if not selected_df_all_years:
        # pd.concat refuses an empty list; no input simply means no selection.
        return pd.DataFrame()
    return pd.concat(selected_df_all_years)

In [None]:
# NOTE(review): this cell was never executed (In [None]) and relies on a `chunk`
# variable already present in the kernel; it is the same filter that
# select_quotes_chunk applies.
chunk[chunk['quotation'].str.contains('|'.join(vocabulary))]

In [17]:
%time select_quotes_one_year(Quotebank_tuple_test[0], vocabulary, chunksize = 10 ** 6)

Processing chunk with 207527 rows
Wall time: 15.2 s


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
38,2019-09-24-069091,"prejudice, the ignorance, the bigotry as well ...",Recep Tayyip Erdogan,[Q39259],2019-09-24 20:33:32,1,"[[Recep Tayyip Erdogan, 0.6337], [None, 0.3663]]",[http://www.nytimes.com/2019/09/24/world/unite...,E
163,2019-05-01-131407,What we do in our churches is to teach that Ch...,Brad Williams,"[Q4954351, Q4954354, Q4954356]",2019-05-01 00:22:40,1,"[[Brad Williams, 0.8584], [None, 0.1416]]",[http://www.nytimes.com/2019/04/30/opinion/ala...,E
307,2019-08-15-004935,"And I'm like, all immigrants want to come here...",,[],2019-08-15 09:00:17,1,"[[None, 0.7696], [Howard Lee, 0.2304]]",[https://www.nytimes.com/2019/08/15/arts/telev...,E
311,2019-03-15-008674,As New Zealand has stood by us so we stand sho...,Prime Minister Theresa May,[Q264766],2019-03-15 14:12:01,10,"[[Prime Minister Theresa May, 0.8443], [None, ...",[http://news.smh.com.au/world/oceania/police-d...,E
372,2019-11-21-132462,You may be aware of some of the attacks on Col...,"Johnson , Brad",[Q22984506],2019-11-21 23:28:32,2,"[[Johnson , Brad, 0.616], [None, 0.384]]",[http://www.nytimes.com/2019/11/21/podcasts/th...,E
...,...,...,...,...,...,...,...,...,...
206950,2019-08-22-098340,Voters have genuine affection for Joe Biden. T...,,[],2019-08-22 12:10:27,2,"[[None, 0.8161], [Jill Biden, 0.184]]",[http://www.nytimes.com/2019/08/22/us/politics...,E
206954,2019-06-23-054073,We preserved these agreements because terroris...,Mahmoud Abbas,"[Q10515624, Q127998, Q45193693]",2019-06-23 22:28:16,1,"[[Mahmoud Abbas, 0.8939], [None, 0.1061]]",[https://www.nytimes.com/2019/06/23/world/midd...,E
206979,2019-10-04-025648,He was pleased with your phone call. Mentioned...,,[],2019-10-04 13:15:28,5,"[[None, 0.8218], [Gordon Sondland, 0.1343], [R...",[http://nytimes.com/2019/10/04/us/politics/ukr...,E
207372,2019-07-10-003775,already apprehend and deport hundreds of thous...,President Barack Obama,[Q76],2019-07-10 21:34:59,1,"[[President Barack Obama, 0.8946], [None, 0.09...",[http://www.nytimes.com/2019/07/10/magazine/dr...,E


In [18]:
# NOTE(review): quotebank_2019 is not defined in any cell visible here; this call
# only works if the kernel already holds that DataFrame.
select_quotes_chunk(quotebank_2019, vocabulary)

Processing chunk with 207527 rows


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
38,2019-09-24-069091,"prejudice, the ignorance, the bigotry as well ...",Recep Tayyip Erdogan,[Q39259],2019-09-24 20:33:32,1,"[[Recep Tayyip Erdogan, 0.6337], [None, 0.3663]]",[http://www.nytimes.com/2019/09/24/world/unite...,E
163,2019-05-01-131407,What we do in our churches is to teach that Ch...,Brad Williams,"[Q4954351, Q4954354, Q4954356]",2019-05-01 00:22:40,1,"[[Brad Williams, 0.8584], [None, 0.1416]]",[http://www.nytimes.com/2019/04/30/opinion/ala...,E
307,2019-08-15-004935,"And I'm like, all immigrants want to come here...",,[],2019-08-15 09:00:17,1,"[[None, 0.7696], [Howard Lee, 0.2304]]",[https://www.nytimes.com/2019/08/15/arts/telev...,E
311,2019-03-15-008674,As New Zealand has stood by us so we stand sho...,Prime Minister Theresa May,[Q264766],2019-03-15 14:12:01,10,"[[Prime Minister Theresa May, 0.8443], [None, ...",[http://news.smh.com.au/world/oceania/police-d...,E
372,2019-11-21-132462,You may be aware of some of the attacks on Col...,"Johnson , Brad",[Q22984506],2019-11-21 23:28:32,2,"[[Johnson , Brad, 0.616], [None, 0.384]]",[http://www.nytimes.com/2019/11/21/podcasts/th...,E
...,...,...,...,...,...,...,...,...,...
206950,2019-08-22-098340,Voters have genuine affection for Joe Biden. T...,,[],2019-08-22 12:10:27,2,"[[None, 0.8161], [Jill Biden, 0.184]]",[http://www.nytimes.com/2019/08/22/us/politics...,E
206954,2019-06-23-054073,We preserved these agreements because terroris...,Mahmoud Abbas,"[Q10515624, Q127998, Q45193693]",2019-06-23 22:28:16,1,"[[Mahmoud Abbas, 0.8939], [None, 0.1061]]",[https://www.nytimes.com/2019/06/23/world/midd...,E
206979,2019-10-04-025648,He was pleased with your phone call. Mentioned...,,[],2019-10-04 13:15:28,5,"[[None, 0.8218], [Gordon Sondland, 0.1343], [R...",[http://nytimes.com/2019/10/04/us/politics/ukr...,E
207372,2019-07-10-003775,already apprehend and deport hundreds of thous...,President Barack Obama,[Q76],2019-07-10 21:34:59,1,"[[President Barack Obama, 0.8946], [None, 0.09...",[http://www.nytimes.com/2019/07/10/magazine/dr...,E


In [20]:
%time big_dataframe = select_quotes_all_years(Quotebank_tuple_test, vocabulary, chunksize = 10 ** 6)

____________________________________________________ YEAR :  2020 
Processing chunk with 207527 rows
____________________________________________________ YEAR :  2019 
Processing chunk with 207527 rows
____________________________________________________ YEAR :  2018 
Processing chunk with 207527 rows
____________________________________________________ YEAR :  2017 
Processing chunk with 207527 rows
____________________________________________________ YEAR :  2016 
Processing chunk with 207527 rows
____________________________________________________ YEAR :  2015 
Processing chunk with 207527 rows
Wall time: 1min 26s


In [22]:
big_dataframe.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
38,2019-09-24-069091,"prejudice, the ignorance, the bigotry as well ...",Recep Tayyip Erdogan,[Q39259],2019-09-24 20:33:32,1,"[[Recep Tayyip Erdogan, 0.6337], [None, 0.3663]]",[http://www.nytimes.com/2019/09/24/world/unite...,E
163,2019-05-01-131407,What we do in our churches is to teach that Ch...,Brad Williams,"[Q4954351, Q4954354, Q4954356]",2019-05-01 00:22:40,1,"[[Brad Williams, 0.8584], [None, 0.1416]]",[http://www.nytimes.com/2019/04/30/opinion/ala...,E
307,2019-08-15-004935,"And I'm like, all immigrants want to come here...",,[],2019-08-15 09:00:17,1,"[[None, 0.7696], [Howard Lee, 0.2304]]",[https://www.nytimes.com/2019/08/15/arts/telev...,E
311,2019-03-15-008674,As New Zealand has stood by us so we stand sho...,Prime Minister Theresa May,[Q264766],2019-03-15 14:12:01,10,"[[Prime Minister Theresa May, 0.8443], [None, ...",[http://news.smh.com.au/world/oceania/police-d...,E
372,2019-11-21-132462,You may be aware of some of the attacks on Col...,"Johnson , Brad",[Q22984506],2019-11-21 23:28:32,2,"[[Johnson , Brad, 0.616], [None, 0.384]]",[http://www.nytimes.com/2019/11/21/podcasts/th...,E


In [21]:
big_dataframe['quotation']

38        prejudice, the ignorance, the bigotry as well ...
163       What we do in our churches is to teach that Ch...
307       And I'm like, all immigrants want to come here...
311       As New Zealand has stood by us so we stand sho...
372       You may be aware of some of the attacks on Col...
                                ...                        
206950    Voters have genuine affection for Joe Biden. T...
206954    We preserved these agreements because terroris...
206979    He was pleased with your phone call. Mentioned...
207372    already apprehend and deport hundreds of thous...
207469    They have built capabilities to mount large-sc...
Name: quotation, Length: 10518, dtype: object

In [23]:
big_dataframe['quotation'].iloc[0]

'prejudice, the ignorance, the bigotry as well as the attempts of marginalizing toward migrants, particularly Muslims.'

In [26]:
big_dataframe['quotation'].str.contains('|'.join(vocabulary))

38        True
163       True
307       True
311       True
372       True
          ... 
206950    True
206954    True
206979    True
207372    True
207469    True
Name: quotation, Length: 10518, dtype: bool