# Home task: text mining

In [576]:
import os
import numpy as np
import pandas as pd
from datetime import datetime

# Resolve the input file against the current working directory.  The original
# computed `cwd` but never used it and called os.path.join() with a single
# argument, which is a no-op; joining the two makes the lookup location explicit
# while still opening the same file.
cwd = os.getcwd()
fn = os.path.join(cwd, 'dates.txt')

# Read the whole file into one string; it is split into notes in a later cell.
with open(fn, 'r') as f:
    content = f.read()

# Preview the first 500 characters as a sanity check on what was loaded.
print(content[:500])

03/25/93 Total time of visit (in minutes):
6/18/85 Primary Care Doctor:
sshe plans to move as of 7/8/71 In-Home Services: None
7 on 9/27/75 Audit C Score Current:
2/6/96 sleep studyPain Treatment Pain Level (Numeric Scale): 7
.Per 7/06/79 Movement D/O note:
4, 5/18/78 Patient's thoughts about current substance abuse:
10/24/89 CPT Code: 90801 - Psychiatric Diagnosis Interview
3/7/86 SOS-10 Total Score:
(4/10/71)Score-1Audit C Score Current:
(5/11/85) Crt-1.96, BUN-26; AST/ALT-16/22; WBC_12.6Activ


In [577]:
# Split the raw text into one note per line.  The isinstance guard keeps this
# cell re-runnable: once `content` has already been turned into a list it is
# left alone instead of raising AttributeError on `.split`.
if isinstance(content, str):
    content = content.split("\n")

# Keep each note's original line number in an explicit 'index' column next to
# its text.
df = pd.DataFrame({'notes': content, 'index': list(range(len(content)))})

In [578]:
# Shared month-name prefix; each pattern that allows longer spellings appends
# `[a-z]*` so "Mar" and "March" are both accepted.
MONTH = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'

# Alternatives are tried in this order at each position and the leftmost match
# in a note wins, so the most specific layouts come first and the bare year is
# the last resort.
patterns = [
    r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # 04/20/2009; 04/20/09; 4/20/09; 4/3/09
    MONTH + r'[a-z]*[-.]* \d{1,2}[-,]* \d{4}',  # Mar 20, 2009; March 20, 2009; Mar. 20, 2009; Mar 20 2009
    # Mar-20-2009.  The pattern above requires a space after the month, so the
    # dash-separated form never matched it and only the bare year was picked
    # up.  Abbreviated month only: that is the layout normalize_date parses
    # with '%b-%d-%Y'.
    MONTH + r'-\d{1,2}-\d{4}',
    r'\d{1,2} ' + MONTH + r'[a-z]*[-.,]* \d{4}',  # 20 Mar 2009; 20 March 2009; 20 Mar. 2009; 20 March, 2009
    MONTH + r'[a-z]* \d{1,2}[a-z]{2}, \d{4}',  # Mar 20th, 2009; Mar 21st, 2009; Mar 22nd, 2009
    MONTH + r'[a-z]* \d{4}',  # Feb 2009; Sep 2009; Oct 2010
    # The digit lookarounds on the two purely numeric fallbacks stop them from
    # firing inside a longer run of digits (e.g. an ID such as "20041").
    r'(?<!\d)\d{1,2}/\d{4}(?!\d)',  # 6/2008; 12/2009
    r'(?<!\d)[12][0-9]{3}(?!\d)'  # 2009; 2010
]

# One capture group around the whole alternation -> str.extract returns the
# first date-like substring of each note (NaN when nothing matches).
df['dates'] = df['notes'].str.extract('(' + '|'.join(patterns) + ')', expand=False)
# Clear out the 'nan' dates
df = df[df['dates'].notna()]

In [579]:
# Unifying the date format
def normalize_date(date):
    """Parse one extracted date string into a ``datetime``.

    The text is tried against every known layout in turn and the first layout
    that parses wins.  Components missing from the text take ``strptime``'s
    defaults, so a month/year string resolves to the first day of that month
    and a bare year to January 1st.

    Parameters
    ----------
    date : str
        Date text such as ``'03/25/93'``, ``'Mar 20th, 2009'`` or ``'6/2008'``.
        A value that is already a ``datetime`` is returned unchanged, so the
        function can be applied to the same column twice; any other
        non-string (e.g. a missing value) yields ``np.nan`` instead of raising
        ``TypeError``.

    Returns
    -------
    datetime.datetime or float
        The parsed date, or ``np.nan`` when no layout matches.

    Notes
    -----
    Two-digit years are parsed with ``%y``, which maps 69-99 to 1969-1999 and
    00-68 to 2000-2068.
    """
    if isinstance(date, datetime):
        return date
    if not isinstance(date, str):
        return np.nan

    formats = [
        # Layouts handled by the original implementation (order preserved).
        '%m/%d/%Y', '%m/%d/%y', '%b-%d-%Y', '%m-%d-%y', '%b %d, %Y', '%B %d, %Y',
        '%b. %d, %Y', '%b %d %Y', '%d %b %Y', '%d %B %Y', '%d %b. %Y',
        '%d %B, %Y', '%b %dth, %Y', '%b %dst, %Y', '%B. %d, %Y', '%b %dnd, %Y',
        '%b %Y', '%B %d %Y', '%B %Y', '%m/%Y', '%Y',
        # Layouts the extraction regexes can also produce but that previously
        # fell through to NaN and were silently dropped downstream:
        '%m-%d-%Y',                  # 04-20-2009
        '%b. %d %Y',                 # Mar. 20 2009
        '%d %b, %Y',                 # 20 Mar, 2009
        '%b %drd, %Y',               # Mar 23rd, 2009
        '%B %dth, %Y', '%B %dst, %Y', '%B %dnd, %Y', '%B %drd, %Y',  # March 20th, 2009 ...
    ]

    for fmt in formats:
        try:
            return datetime.strptime(date, fmt)
        except ValueError:
            # Layout does not fit this string; try the next one.
            continue

    return np.nan

# Parse every extracted string and order the notes chronologically.  A new
# frame is built with assign() / a non-inplace sort instead of writing into and
# sorting the boolean-filtered slice in place, which is the pattern that
# triggers pandas' SettingWithCopyWarning.
df = (
    df.assign(dates=df['dates'].apply(normalize_date))
      .sort_values(by='dates')
)
# Drop the rows whose date text matched no known layout.
df = df[df['dates'].notna()]
df

Unnamed: 0,notes,index,dates
9,(4/10/71)Score-1Audit C Score Current:,9,1971-04-10
84,5/18/71 Total time of visit (in minutes):,84,1971-05-18
2,sshe plans to move as of 7/8/71 In-Home Servic...,2,1971-07-08
53,7/11/71 SOS-10 Total Score:,53,1971-07-11
28,9/12/71 [report_end],28,1971-09-12
...,...,...,...
427,6e. monitor pt's depressive experience in cont...,427,2016-05-01
141,30 May 2016 SOS-10 Total Score:,141,2016-05-30
186,13 Oct 2016 Primary Care Doctor:,186,2016-10-13
161,19 Oct 2016 Communication with referring physi...,161,2016-10-19
