# Parsing Dates

In [1]:
%load_ext autoreload
%autoreload 2
from cord.cord19 import *
from cord.core import parallel, ifnone, describe_dataframe
import pandas as pd
import numpy as np
import pendulum
# Display configuration: show up to 120 characters of each text column and
# let pandas render up to MAX_ROWS rows before it truncates a frame.
MAX_ROWS = 2000
pd.set_option('display.max_colwidth', 120)
pd.set_option('display.max_rows', MAX_ROWS)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dwight\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Build the ResearchPapers collection from the raw data directory. Per the log
# printed below, this loads and cleans the metadata, fixes the date formats
# and then indexes the papers.
research_papers = ResearchPapers.from_data_dir()
# Store the freshly built object (presumably what ResearchPapers.from_pickle()
# reads back later -- TODO confirm against the cord package).
research_papers.save()

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Fixing dates that are a list e.g. ['2020-02-05', '2020-02']
Fixing dates with the seasons e.g. 2014 Autumn
Fix dates like 2016 Nov 9 Jan-Feb
Fix dates like 2012 Jan-Mar
Convert Dates like 2020 Apr 13
Converting Dates like 2020 Apr
Converting Dates like 2020
Converting Dates like 2020-01-21
Indexing research papers


In [None]:
# Obtain a ResearchPapers object via from_pickle() instead of rebuilding it
# from the data directory (presumably the object written by save() -- TODO confirm).
research_papers = ResearchPapers.from_pickle()

## Published Dates

In [29]:
# Load only the metadata table. The log below shows the same cleaning and
# date-fixing steps as from_data_dir(), but no indexing step.
metadata = ResearchPapers.load_metadata()

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Fixing dates that are a list e.g. ['2020-02-05', '2020-02']
Fixing dates with the seasons e.g. 2014 Autumn
Fix dates like 2016 Nov 9 Jan-Feb
Fix dates like 2012 Jan-Mar
Convert Dates like 2020 Apr 13
Converting Dates like 2020 Apr
Converting Dates like 2020
Converting Dates like 2020-01-21


In [None]:
# Inspect the metadata frame with the describe_dataframe helper from cord.core
# (what it reports is not shown in this notebook).
# NOTE(review): this reads research_papers.metadata from an earlier cell, not
# the `metadata` variable loaded in the cell just above -- confirm that is intended.
describe_dataframe(research_papers.metadata)

In [5]:
from functools import partial
def format_date(date, format):
    """Parse ``date`` with a strptime-style ``format``, tolerating bad days.

    The value is first parsed as-is. If that fails and the value looks like
    ``YYYY Mon D`` / ``YYYY Mon DD`` (for example ``2019 Feb 30``), the day is
    replaced with ``28`` and the parse is retried once.

    Parameters
    ----------
    date : str
        Raw date text, e.g. ``'2020 Apr 13'``.
    format : str
        Format string handed to ``pd.to_datetime`` (e.g. ``'%Y %b %d'``).

    Returns
    -------
    pandas.Timestamp or str
        The parsed timestamp, or the literal string ``'ValueError'`` when the
        value cannot be parsed even after the day has been clamped.
    """
    # Local import: no visible notebook cell imports ``re`` explicitly.
    import re

    try:
        return pd.to_datetime(date, format=format)
    except ValueError:
        # Same pattern as the notebook-level YYYY_MON_DD constant, spelled out
        # here so this helper does not rely on a global defined further down.
        if isinstance(date, str) and re.match(r'\d{4} \w{3} \d{1,2}$', date):
            # Keep "YYYY Mon" (first 8 characters) and force the day to 28.
            clamped = f'{date[:8]} 28'
            try:
                return pd.to_datetime(clamped, format=format)
            except ValueError:
                # e.g. an unknown month token: fall through to the sentinel
                # instead of letting the error abort the surrounding .apply().
                pass
        return 'ValueError'

# Work on a copy of just the two date columns so `metadata` itself is not
# modified; missing values are replaced with '' before any string matching.
mdates = metadata[['publish_time','published']].copy().fillna('')

# Fix dates that are a list e.g. ['2020-02-05', '2020-02']
print("Fixing dates that are a list e.g. ['2020-02-05', '2020-02']")
# Raw string: "\[" is an invalid escape sequence in an ordinary literal.
idx_list = mdates.publish_time.str.match(r"\[.*")
# Characters 2..11 skip the leading "['" and keep the ten characters after it,
# i.e. the first date of the list when it is written as YYYY-MM-DD.
mdates.loc[idx_list, 'publish_time'] = mdates.loc[idx_list, 'publish_time'].str[2:12]

## Fix dates with the seasons e.g. 2014 Autumn
print("Fixing dates with the seasons e.g. 2014 Autumn")
idx_seasons = mdates.publish_time.str.match('.*(Spring|Summer|Fall|Autumn|Winter)')
# Each season name is swapped for a fixed "Mon DD" stamp, applied one after
# the other in the order listed here.
season_stamps = {
    'Spring': 'Apr 01',
    'Summer': 'Jul 01',
    'Autumn': 'Oct 01',
    'Fall': 'Oct 01',
    'Winter': 'Dec 21',
}
seasonal = mdates.loc[idx_seasons, 'publish_time']
for season_name, stamp in season_stamps.items():
    seasonal = seasonal.str.replace(season_name, stamp)
mdates.loc[idx_seasons, 'publish_time'] = seasonal

## Fix dates like 2016 Nov 9 Jan-Feb
print("Fix dates like 2016 Nov 9 Jan-Feb")
# Raw strings: "\d" and "\w" are invalid escape sequences in ordinary literals.
idx_YYYY_MON_DD_extra = mdates.publish_time.str.match(r'\d{4} \w{3} \d{1,2}.+$')
# The first 11 characters cover "YYYY Mon DD"; strip() drops the trailing
# space that is left over when the day has a single digit.
mdates.loc[idx_YYYY_MON_DD_extra, 'publish_time'] = \
    mdates.loc[idx_YYYY_MON_DD_extra, 'publish_time'].str[:11].str.strip()

## Fix dates like 2012 Jan-Mar
print("Fix dates like 2012 Jan-Mar")
idx_YYYY_MON_MON = mdates.publish_time.str.match(r'\d{4} \w{3}-\w{3}$')
# Keep only "YYYY Mon" (the first 8 characters), i.e. the first month of the range.
mdates.loc[idx_YYYY_MON_MON, 'publish_time'] = \
    mdates.loc[idx_YYYY_MON_MON, 'publish_time'].str[:8].str.strip()

# Regex for each date layout parsed below. Raw strings are used because "\d",
# "\w" and "\-" are invalid escape sequences in ordinary string literals.
YYYY_MON_DD = r'\d{4} \w{3} \d{1,2}$'
YYYY_MON = r'\d{4} \w{3}$'
YYYY = r'\d{4}$'
YYYY_MM_DD = r'\d{4}\-\d{2}\-\d{2}$'


def convert_matching_dates(frame, pattern, date_format):
    """Parse the ``publish_time`` values that match ``pattern`` into ``publish_date``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a string ``publish_time`` column; it is modified in place
        (``publish_date`` is created on first use).
    pattern : str
        Regex passed to ``Series.str.match`` (case-insensitive) to select rows.
    date_format : str
        strptime-style format forwarded to ``format_date``.

    Returns
    -------
    pandas.Series
        Boolean mask of the rows that matched ``pattern``.
    """
    matches = frame.publish_time.str.match(pattern, case=False)
    frame.loc[matches, 'publish_date'] = \
        frame.loc[matches, 'publish_time'].apply(partial(format_date, format=date_format))
    return matches


# Convert Dates like 2020 Apr 13
print("Convert Dates like 2020 Apr 13")
idx_YYYY_MON_DD = convert_matching_dates(mdates, YYYY_MON_DD, '%Y %b %d')

# Convert Dates like 2020 Apr
print("Convert Dates like 2020 Apr")
idx_YYYY_MON = convert_matching_dates(mdates, YYYY_MON, '%Y %b')

# Convert Dates like 2020
print("Convert Dates like 2020")
idx_YYYY = convert_matching_dates(mdates, YYYY, '%Y')

# Convert Dates like 2020-01-21
print("Convert Dates like 2020-01-21")
idx_YYYY_MM_DD = convert_matching_dates(mdates, YYYY_MM_DD, '%Y-%m-%d')

# Mask of rows where format_date stored its 'ValueError' marker instead of a timestamp.
value_error = mdates.publish_date =='ValueError'
mdates

Fixing dates that are a list e.g. ['2020-02-05', '2020-02']
Fixing dates with the seasons e.g. 2014 Autumn
Fix dates like 2016 Nov 9 Jan-Feb
Fix dates like 2012 Jan-Mar
Convert Dates like 2020 Apr 13
Convert Dates like 2020 Apr
Convert Dates like 2020
Convert Dates like 2020-01-21


Unnamed: 0,publish_time,published,publish_date
0,1972-12-31,1972-01,1972-12-31
1,1980-03-31,1980-01,1980-03-31
2,1980-03-31,1980-01,1980-03-31
3,1973-08-31,1973-01,1973-08-31
4,1985-06-28,1985-01,1985-06-28
...,...,...,...
44215,2017 Nov 17,2017-11,2017-11-17
44216,2009-02-13,2009-01,2009-02-13
44217,2018 Jul 3,2018-07,2018-07-03
44218,2011-03-15,2011-01,2011-03-15


In [197]:
# Rows that have no publish_time text get an explicit missing publish_date.
blank_publish_time = mdates.publish_time.eq('')
mdates.loc[blank_publish_time, 'publish_date'] = np.nan

In [7]:
# Show only the rows whose publish_date has been filled in.
mdates.loc[mdates.publish_date.notnull()]

Unnamed: 0,publish_time,published,publish_date
0,1972-12-31,1972-01,1972-12-31
1,1980-03-31,1980-01,1980-03-31
2,1980-03-31,1980-01,1980-03-31
3,1973-08-31,1973-01,1973-08-31
4,1985-06-28,1985-01,1985-06-28
...,...,...,...
44215,2017 Nov 17,2017-11,2017-11-17
44216,2009-02-13,2009-01,2009-02-13
44217,2018 Jul 3,2018-07,2018-07-03
44218,2011-03-15,2011-01,2011-03-15


## Fix dates on ResearchPapers

In [25]:
# Rebuild ResearchPapers from the data directory. The log below shows the
# metadata cleaning, the date fixes and the indexing step (about 67 seconds
# in the recorded run).
research_papers = ResearchPapers.from_data_dir()

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Fixing dates that are a list e.g. ['2020-02-05', '2020-02']
Fixing dates with the seasons e.g. 2014 Autumn
Fix dates like 2016 Nov 9 Jan-Feb
Fix dates like 2012 Jan-Mar
Convert Dates like 2020 Apr 13
Converting Dates like 2020 Apr
Converting Dates like 2020
Converting Dates like 2020-01-21
Indexing research papers
Finished Indexing in 67.0 seconds


In [26]:
# Spot-check the parsed dates on a random sample of 2000 papers. A fixed
# random_state keeps the displayed rows the same on every re-run.
research_papers.metadata[['publish_date', 'date_diff']].sample(2000, random_state=42)

Unnamed: 0,publish_date,date_diff
8472,2016-04-30,4 years ago
2289,2011-12-31,8 years ago
27668,2017-09-27,2 years ago
19154,2020-01-01,2 months ago
13943,2003-07-31,17 years ago
2982,2018-12-31,1 year ago
14372,2012-09-30,7 years ago
32039,NaT,
16896,2015-03-31,5 years ago
20163,2010-10-12,9 years ago
