Important links : https://docs.dask.org/en/stable/dataframe.html

In [1]:
import pandas as pd
import numpy as np

In [2]:
generated_data = 'generated_data/'
parquet_file_path = generated_data +  "speaker_attributes.parquet"

### Loading the parquet file

In [3]:
%time parquet_df = pd.read_parquet(parquet_file_path)

Wall time: 17.8 s


In [4]:
%time parquet_df.head()

Wall time: 0 ns


Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,


In [5]:
%time parquet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9055981 entries, 0 to 9055980
Data columns (total 15 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   aliases             object
 1   date_of_birth       object
 2   nationality         object
 3   gender              object
 4   lastrevid           int64 
 5   ethnic_group        object
 6   US_congress_bio_ID  object
 7   occupation          object
 8   party               object
 9   academic_degree     object
 10  id                  object
 11  label               object
 12  candidacy           object
 13  type                object
 14  religion            object
dtypes: int64(1), object(14)
memory usage: 1.0+ GB
Wall time: 32 ms


In [6]:
%time mem_usage = parquet_df.memory_usage(deep=True).sum()

Wall time: 13 s


In [7]:
print('the parquet dataframe takes about {:.2f} gigabytes in memory'.format(mem_usage*1e-9 ))

the parquet dataframe takes about 6.41 gigabytes in memory


### Loading the quote sample file

In [8]:
%time quotebank_2019_sample = pd.read_json('generated_data/quotes-2019-nytimes.json.bz2', lines=True, compression='bz2')
%time quotebank_2019_sample.head()

Wall time: 11.7 s
Wall time: 0 ns


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,"[Q16213953, Q20707104, Q43143598, Q58886302, Q...",2019-04-17 13:31:18,1,"[[James Fisher, 0.7475], [None, 0.2525]]",[https://www.nytimes.com/2019/04/17/realestate...,E
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,[Q105756],2019-04-02 14:58:33,2,"[[John Updike, 0.5856], [None, 0.4144]]",[https://www.nytimes.com/2019/04/02/opinion/vl...,E
2,2019-05-09-055187,It makes it much more difficult for him to mak...,,[],2019-05-09 18:11:29,1,"[[None, 0.6493], [President Bill Clinton, 0.27...",[http://mobile.nytimes.com/2019/05/09/world/as...,E
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,[Q1855840],2019-10-31 16:45:15,3,"[[Xavier Becerra, 0.9065], [None, 0.0909], [St...",[http://www.nytimes.com/2019/10/31/technology/...,E
4,2019-01-04-001792,A Pile of Leaves.,,[],2019-01-04 10:00:07,1,"[[None, 0.8737], [Jason Fulford, 0.1263]]",[https://www.nytimes.com/2019/01/04/books/revi...,E


In [9]:
%time mem_usage2 = quotebank_2019_sample.memory_usage(deep=True).sum()
print('the quotebank sample dataframe takes about {:.2f} megabytes in memory'.format(mem_usage2*1e-6 ))

Wall time: 286 ms
the quotebank sample dataframe takes about 135.38 megabytes in memory


Selecting only the first qid in qids column of sample data and dropping rows with no qids

In [10]:
def preprocess(df):
    # remove lines with empty qids
    df = df[~df['qids'].str.len().eq(0)]
    # take only the first qid
    df['qids'] = df['qids'].apply(lambda x: x[0]) # this gives a warning
    return df

In [11]:
quotebank_2019_sample_clean = preprocess(quotebank_2019_sample)
quotebank_2019_sample_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,Q16213953,2019-04-17 13:31:18,1,"[[James Fisher, 0.7475], [None, 0.2525]]",[https://www.nytimes.com/2019/04/17/realestate...,E
1,2019-04-02-001128,a champion figure skater switching to roller s...,John Updike,Q105756,2019-04-02 14:58:33,2,"[[John Updike, 0.5856], [None, 0.4144]]",[https://www.nytimes.com/2019/04/02/opinion/vl...,E
3,2019-10-31-056366,"It puts me in a predicament,",Xavier Becerra,Q1855840,2019-10-31 16:45:15,3,"[[Xavier Becerra, 0.9065], [None, 0.0909], [St...",[http://www.nytimes.com/2019/10/31/technology/...,E
5,2019-08-15-002017,A Senator we can call our own.,Tom Rath,Q7817334,2019-08-15 22:36:33,1,"[[Tom Rath, 0.7598], [None, 0.1993], [Warren R...",[http://www.nytimes.com/2019/08/15/us/politics...,E
8,2019-07-22-032883,"It's a success, a relief and a technical feat,",Florence Parly,Q3074013,2019-07-22 02:37:50,21,"[[Florence Parly, 0.9262], [None, 0.0738]]",[http://www.breitbart.com/news/french-submarin...,E


### Merging the parquet file and the qid samples

In [12]:
%time merged_df = quotebank_2019_sample_clean.merge(parquet_df, left_on = 'qids', right_on = 'id')

Wall time: 8.44 s


In [13]:
merged_df.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,aliases,...,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,2019-04-17-024782,"It is not a low-income immigration,",James Fisher,Q16213953,2019-04-17 13:31:18,1,"[[James Fisher, 0.7475], [None, 0.2525]]",[https://www.nytimes.com/2019/04/17/realestate...,E,[James Scott Fisher],...,,,"[Q33999, Q10798782, Q3282637]",,,Q16213953,James Fisher,,item,
1,2019-11-01-016433,Dispossession is coming for you,James Fisher,Q16213953,2019-11-01 15:59:32,1,"[[James Fisher, 0.84], [None, 0.1063], [Dua Li...",[http://www.nytimes.com/2019/11/01/arts/music/...,E,[James Scott Fisher],...,,,"[Q33999, Q10798782, Q3282637]",,,Q16213953,James Fisher,,item,
2,2019-10-15-061398,Medicare for All can benefit the trans communi...,James Fisher,Q16213953,2019-10-15 14:03:36,2,"[[James Fisher, 0.9272], [None, 0.0729]]",[http://nytimes.com/2019/10/15/us/politics/you...,E,[James Scott Fisher],...,,,"[Q33999, Q10798782, Q3282637]",,,Q16213953,James Fisher,,item,
3,2019-04-17-059333,"which improves the affordability,",James Fisher,Q16213953,2019-04-17 13:31:18,1,"[[James Fisher, 0.9139], [None, 0.0861]]",[https://www.nytimes.com/2019/04/17/realestate...,E,[James Scott Fisher],...,,,"[Q33999, Q10798782, Q3282637]",,,Q16213953,James Fisher,,item,
4,2019-10-15-054424,It's embedded in the language of Medicare for ...,James Fisher,Q16213953,2019-10-15 14:03:36,2,"[[James Fisher, 0.8908], [None, 0.1093]]",[http://nytimes.com/2019/10/15/us/politics/you...,E,[James Scott Fisher],...,,,"[Q33999, Q10798782, Q3282637]",,,Q16213953,James Fisher,,item,


In [14]:
%time mem_usage3 = merged_df.memory_usage(deep=True).sum()*1e-6
print('the quotebank merged dataframe takes about {:.2f} megabytes in memory'.format(mem_usage3))

Wall time: 446 ms
the quotebank merged dataframe takes about 206.87 megabytes in memory


### Other stuff

TODO : make the labels from the parquet file readable using the QID code from the QID to label notebook

In [15]:
type(merged_df['occupation'][1])

numpy.ndarray

In [16]:
merged_df.dtypes

quoteID                       object
quotation                     object
speaker                       object
qids                          object
date                  datetime64[ns]
numOccurrences                 int64
probas                        object
urls                          object
phase                         object
aliases                       object
date_of_birth                 object
nationality                   object
gender                        object
lastrevid                      int64
ethnic_group                  object
US_congress_bio_ID            object
occupation                    object
party                         object
academic_degree               object
id                            object
label                         object
candidacy                     object
type                          object
religion                      object
dtype: object

Just looking at the types of every column of the new dataframe

In [17]:
obj_types = {col: set(map(type, merged_df[col])) for col in merged_df.select_dtypes(include=[object])}

print(obj_types)

{'quoteID': {<class 'str'>}, 'quotation': {<class 'str'>}, 'speaker': {<class 'str'>}, 'qids': {<class 'str'>}, 'probas': {<class 'list'>}, 'urls': {<class 'list'>}, 'phase': {<class 'str'>}, 'aliases': {<class 'numpy.ndarray'>, <class 'NoneType'>}, 'date_of_birth': {<class 'numpy.ndarray'>, <class 'NoneType'>}, 'nationality': {<class 'numpy.ndarray'>, <class 'NoneType'>}, 'gender': {<class 'numpy.ndarray'>, <class 'NoneType'>}, 'ethnic_group': {<class 'numpy.ndarray'>, <class 'NoneType'>}, 'US_congress_bio_ID': {<class 'str'>, <class 'NoneType'>}, 'occupation': {<class 'numpy.ndarray'>, <class 'NoneType'>}, 'party': {<class 'numpy.ndarray'>, <class 'NoneType'>}, 'academic_degree': {<class 'numpy.ndarray'>, <class 'NoneType'>}, 'id': {<class 'str'>}, 'label': {<class 'str'>}, 'candidacy': {<class 'numpy.ndarray'>, <class 'NoneType'>}, 'type': {<class 'str'>}, 'religion': {<class 'numpy.ndarray'>, <class 'NoneType'>}}


In [18]:
merged_df.iloc[1]

quoteID                                               2019-11-01-016433
quotation                               Dispossession is coming for you
speaker                                                    James Fisher
qids                                                          Q16213953
date                                                2019-11-01 15:59:32
numOccurrences                                                        1
probas                [[James Fisher, 0.84], [None, 0.1063], [Dua Li...
urls                  [http://www.nytimes.com/2019/11/01/arts/music/...
phase                                                                 E
aliases                                            [James Scott Fisher]
date_of_birth                                   [+1972-04-20T00:00:00Z]
nationality                                                      [Q145]
gender                                                       [Q6581097]
lastrevid                                                    139

A lot of the columns contain multiple QID entries : 

In [19]:
print('maximum number of QID entry for each of the new columns')
print('nationality :', merged_df['nationality'].apply(lambda x : len(x) if x is not None else 0).max())
print('gender :', merged_df['gender'].apply(lambda x : len(x) if x is not None else 0).max())
print('ethnic_group :', merged_df['ethnic_group'].apply(lambda x : len(x) if x is not None else 0).max())
print('US_congress_bio_ID :', merged_df['US_congress_bio_ID'].apply(lambda x : len(x) if x is not None else 0).max())
print('occupation :', merged_df['occupation'].apply(lambda x : len(x) if x is not None else 0).max())
print('party :', merged_df['party'].apply(lambda x : len(x) if x is not None else 0).max())
print('academic_degree :', merged_df['academic_degree'].apply(lambda x : len(x) if x is not None else 0).max())
print('id :', merged_df['id'].apply(lambda x : len(x) if x is not None else 0).max())
print('label :', merged_df['label'].apply(lambda x : len(x) if x is not None else 0).max())
print('candidacy :', merged_df['candidacy'].apply(lambda x : len(x) if x is not None else 0).max())
print('type :', merged_df['type'].apply(lambda x : len(x) if x is not None else 0).max())
print('religion :', merged_df['religion'].apply(lambda x : len(x) if x is not None else 0).max())

maximum number of QID entry for each of the new columns
nationality : 6
gender : 2
ethnic_group : 6
US_congress_bio_ID : 7
occupation : 24
party : 9
academic_degree : 5
id : 9
label : 50
candidacy : 13
type : 4
religion : 7


Since this time it is a reasonable number ( vers 451 in the 2019 sample dataset) we could potentially match all entries to terms inside of 

Instead of manually checking we use library re to scan every column if there are QIDs. 
Most columns are composed of a numpy nd.array containing a tuple of strings, every string being a qid. So we take the first access the tuple, then take the first element, then check if it's a QID.

In [20]:
import re

In [21]:
def check_if_qid(df):
    '''
    take first term of list out of ndarray, and checks if it's a QID
    '''
    return df.apply(lambda x: x[0] if x is not None else None).str.contains(r'[Q][0-9]+').any()

In [22]:
print('column : nationality ', check_if_qid(merged_df['nationality']))
print('column : gender ', check_if_qid(merged_df['gender']))
print('column : ethnic_group ', check_if_qid(merged_df['ethnic_group']))
print('column : US_congress_bio_ID ', check_if_qid(merged_df['US_congress_bio_ID']))
print('column : occupation ', check_if_qid(merged_df['occupation']))
print('column : party ', check_if_qid(merged_df['party']))
print('column : academic_degree ', check_if_qid(merged_df['academic_degree']))
print('column : id ', check_if_qid(merged_df['id']))
print('column : label ', check_if_qid(merged_df['label']))
print('column : candidacy ', check_if_qid(merged_df['candidacy']))
print('column : type ', check_if_qid(merged_df['type']))
print('column : religion ', check_if_qid(merged_df['religion']))

column : nationality  True
column : gender  True
column : ethnic_group  True
column : US_congress_bio_ID  False
column : occupation  True
column : party  True
column : academic_degree  True
column : id  False
column : label  False
column : candidacy  True
column : type  False
column : religion  True


Selected columns that need QID to label translations : 
 - nationality
 - gender
 - ethnic_group
 - occupation
 - party
 - academic_degree
 - candidacy
 - religion

In [23]:
def process_chunk(chunk, qids_clean_merged):
    """
    this function processes one chunk of data
    """
    qids_clean_merged = qids_clean_merged.reindex(columns=qids_clean_merged.columns.union(chunk.columns))
    
    qids_clean_merged.update(chunk)

    return qids_clean_merged

def process_qid_one(path_to_file, qids_clean, chunksize = 10 ** 4):
    """
    this function processes a csv file in chunks
    
    here it attributes qid to their respective labels and descriptions 
    ( by joining the qid label df to the df not containing them)
    """
    qids_clean_merged = qids_clean.copy()
    qids_clean_merged['Label']=np.NaN
    qids_clean_merged['Description']=np.NaN
    qids_clean_merged.set_index('qids',inplace=True)
    qids_clean_merged.index.rename('QID', inplace=True)
    
    with pd.read_csv(path_to_file, compression='bz2', index_col='QID', chunksize=chunksize) as df_reader:
        for chunk in df_reader:
            qids_clean_merged = process_chunk(chunk, qids_clean_merged)
    return qids_clean_merged

In [24]:
qids_onlyquotebank_path = generated_data + "wikidata_labels_descriptions_quotebank.csv.bz2"

In [25]:
def first_qid(df):
    '''
    take first term of list out of ndarray
    '''
    return df.apply(lambda x: x[0] if x is not None else None)

In [26]:
first_qid(merged_df['religion']).dropna()

8         Q6423963
9         Q6423963
10        Q6423963
11        Q6423963
12        Q6423963
            ...   
130872      Q75809
130886     Q288928
130922       Q9592
130923       Q9268
130955       Q7066
Name: religion, Length: 28849, dtype: object

In [27]:
def prep_df(df):
    df = first_qid(df).to_frame().copy()
    return df.rename(columns = {df.columns[0]:'qids'})

In [28]:
nationality_df = prep_df(merged_df['nationality'])
gender_df = prep_df(merged_df['gender'])
ethnic_group_df = prep_df(merged_df['ethnic_group'])
occupation_df = prep_df(merged_df['occupation'])
party_df = prep_df(merged_df['party'])
academic_degree_df = prep_df(merged_df['academic_degree'])
candidacy_df = prep_df(merged_df['candidacy'])
religion_df = prep_df(merged_df['religion'])
religion_df

Unnamed: 0,qids
0,
1,
2,
3,
4,
...,...
130956,
130957,
130958,
130959,


In [29]:
%time nationality_df = process_qid_one(qids_onlyquotebank_path, nationality_df , chunksize = 10 ** 6)
%time gender_df = process_qid_one(qids_onlyquotebank_path, gender_df , chunksize = 10 ** 6)
%time ethnic_group_df = process_qid_one(qids_onlyquotebank_path, ethnic_group_df , chunksize = 10 ** 6)
%time occupation_df = process_qid_one(qids_onlyquotebank_path, occupation_df , chunksize = 10 ** 6)
%time party_df = process_qid_one(qids_onlyquotebank_path, party_df , chunksize = 10 ** 6)
%time academic_degree_df = process_qid_one(qids_onlyquotebank_path, academic_degree_df , chunksize = 10 ** 6)
%time candidacy_df = process_qid_one(qids_onlyquotebank_path, candidacy_df , chunksize = 10 ** 6)
%time religion_df = process_qid_one(qids_onlyquotebank_path, religion_df , chunksize = 10 ** 6)

Wall time: 199 ms
Wall time: 184 ms
Wall time: 191 ms
Wall time: 185 ms
Wall time: 243 ms
Wall time: 232 ms
Wall time: 213 ms
Wall time: 227 ms


In [32]:
religion_df

Unnamed: 0_level_0,Label,Description
QID,Unnamed: 1_level_1,Unnamed: 2_level_1
,,
,,
,,
,,
,,
...,...,...
,,
,,
,,
,,


In [None]:
df.rename(columns = {df.columns[0]:'qids'})