# Date Parser Using Basic Text Processing

This notebook implements a comprehensive date parser that can extract dates from text and convert them to DD/MM/YYYY format using only Python's built-in libraries and regex.

## Features
- Handles multiple date formats (Month DD, YYYY | DD Month YYYY | YYYY-MM-DD | YYYY/MM/DD | YYYY.MM.DD | DD/MM/YYYY | DD.MM.YYYY | DD-MM-YYYY)
- Supports ordinal numbers (1st, 2nd, 3rd, etc.)
- Handles month abbreviations (Jan, Feb, Mar, etc.)
- Normalizes 2-digit years to 4-digit years (00–49 become 20xx, 50–99 become 19xx)


## Requirements
- Uses only Python standard-library modules (`re` and `csv`)
- No external dependencies or ML models

In [13]:
import re
import csv

# Month names and abbreviations mapped to their calendar month number.
_MONTH_NUMBERS = {
    'january': 1, 'jan': 1,
    'february': 2, 'feb': 2,
    'march': 3, 'mar': 3,
    'april': 4, 'apr': 4,
    'may': 5,
    'june': 6, 'jun': 6,
    'july': 7, 'jul': 7,
    'august': 8, 'aug': 8,
    'september': 9, 'sep': 9, 'sept': 9,
    'october': 10, 'oct': 10,
    'november': 11, 'nov': 11,
    'december': 12, 'dec': 12,
}

# Regex fragments shared by the patterns below. Each fragment contributes
# exactly one capturing group.
_MONTH = (
    r'(january|february|march|april|may|june|july|august|september|october|'
    r'november|december|jan|feb|mar|apr|jun|jul|aug|sept|sep|oct|nov|dec)'
)
_DAY = r'(\d{1,2})(?:st|nd|rd|th)?'
# Exactly four or exactly two digits; a three-digit run is not a year.
_YEAR = r'(\d{4}|\d{2})'
# Year mentioned later in the sentence, e.g. "..., including 2024".
_TRAILING_YEAR = r'.*?(?:including|in)\s+(\d{4})\b'

# Patterns are tried in this order. Each entry is
# (compiled regex, (day group, month group, year group), may_swap) where
# may_swap marks the one format in which day and month can be transposed.
_DATE_PATTERNS = (
    # DD Month YYYY: "21st June, 2024", "1st of January 2000"
    (re.compile(r'\b' + _DAY + r'\s+(?:of\s+)?' + _MONTH + r',?\s*' + _YEAR + r'\b'),
     (1, 2, 3), False),
    # DD Month ... including YYYY: "25th Dec, including 2024"
    (re.compile(r'\b' + _DAY + r'\s+(?:of\s+)?' + _MONTH + _TRAILING_YEAR),
     (1, 2, 3), False),
    # Month DD, YYYY: "March 5, 2023". A comma or whitespace is required
    # between day and year so that "March 2023" is not split into 20 / 23.
    (re.compile(r'\b' + _MONTH + r'\s+' + _DAY + r'(?:,\s*|\s+)' + _YEAR + r'\b'),
     (2, 1, 3), False),
    # Month DD ... including YYYY
    (re.compile(r'\b' + _MONTH + r'\s+' + _DAY + _TRAILING_YEAR),
     (2, 1, 3), False),
    # YYYY-MM-DD, YYYY/MM/DD, YYYY.MM.DD
    (re.compile(r'\b(\d{4})-(\d{1,2})-(\d{1,2})\b'), (3, 2, 1), False),
    (re.compile(r'\b(\d{4})/(\d{1,2})/(\d{1,2})\b'), (3, 2, 1), False),
    (re.compile(r'\b(\d{4})\.(\d{1,2})\.(\d{1,2})\b'), (3, 2, 1), False),
    # DD/MM/YYYY, read as MM/DD/YYYY only when the second number exceeds 12
    (re.compile(r'\b(\d{1,2})/(\d{1,2})/' + _YEAR + r'\b'), (1, 2, 3), True),
    # DD.MM.YYYY and DD-MM-YYYY
    (re.compile(r'\b(\d{1,2})\.(\d{1,2})\.' + _YEAR + r'\b'), (1, 2, 3), False),
    (re.compile(r'\b(\d{1,2})-(\d{1,2})-' + _YEAR + r'\b'), (1, 2, 3), False),
)


def parse_date(text):
    """
    Extract the first recognisable date from text as DD/MM/YYYY.

    Supported formats (matched case-insensitively, tried in this order):
    - DD Month YYYY (e.g., "1st January 2000", "15th of September 2021")
    - DD Month ... including/in YYYY (e.g., "25th Dec, including 2024")
    - Month DD, YYYY (e.g., "March 5, 2023", "Dec 20th, 2021")
    - Month DD ... including/in YYYY
    - YYYY-MM-DD, YYYY/MM/DD, YYYY.MM.DD
    - DD/MM/YYYY, DD.MM.YYYY, DD-MM-YYYY

    Ordinal suffixes (st, nd, rd, th) on the day are ignored. Two-digit
    years are expanded with a pivot of 50: 50-99 become 19xx and 00-49
    become 20xx. For slash-separated dates the first number is taken as
    the day unless only the second number is greater than 12, in which
    case the input is read as MM/DD/YYYY.

    A candidate whose day is outside 1-31 or whose month is outside 1-12
    is rejected and the search continues with the next candidate. This is
    a range check only; day-of-month limits (e.g. 31 April) are not
    verified.

    Args:
        text (str): Input text containing a date

    Returns:
        str: Date in DD/MM/YYYY format, or None if text is empty/None or
        no valid date is found
    """
    if not text:
        return None

    lowered = text.lower()

    for regex, (day_idx, month_idx, year_idx), may_swap in _DATE_PATTERNS:
        for match in regex.finditer(lowered):
            day = int(match.group(day_idx))

            # The month group holds either a name/abbreviation or digits.
            month_token = match.group(month_idx)
            month = _MONTH_NUMBERS.get(month_token)
            if month is None:
                month = int(month_token)

            # Only an impossible month (> 12) with a plausible month in
            # the day slot justifies reading the input as MM/DD.
            if may_swap and day <= 12 < month:
                day, month = month, day

            if not (1 <= day <= 31 and 1 <= month <= 12):
                continue

            year_text = match.group(year_idx)
            year = int(year_text)
            if len(year_text) == 2:
                year += 1900 if year >= 50 else 2000

            return f"{day:02d}/{month:02d}/{year:04d}"

    return None

# Smoke-test the parser on one sentence per major date style
test_examples = [
    "I went to London on 21st June, 2024",
    "The event will take place on March 5, 2023.",
    "Her birthday is on 07/08/1990.",
    "The deadline is 2022-12-31.",
    "We met on 1st of January 2000.",
    "We celebrate Christmas every year on 25th Dec, including 2024.",
    "The exam date is 2021.11.10.",
]

print("Testing the date parser:")
for test in test_examples:
    result = parse_date(test)
    # One call emits the input line, the output line and a blank separator.
    print(f"Input: {test}", f"Output: {result}", "", sep="\n")

Testing the date parser:
Input: I went to London on 21st June, 2024
Output: 21/06/2024

Input: The event will take place on March 5, 2023.
Output: 05/03/2023

Input: Her birthday is on 07/08/1990.
Output: 07/08/1990

Input: The deadline is 2022-12-31.
Output: 31/12/2022

Input: We met on 1st of January 2000.
Output: 01/01/2000

Input: We celebrate Christmas every year on 25th Dec, including 2024.
Output: 25/12/2024

Input: The exam date is 2021.11.10.
Output: 10/11/2021



In [14]:
# Test the parser against the complete dataset
def test_date_parser():
    """Measure parse_date accuracy against the CSV test dataset.

    Reads 'Data/date_parser_testcases.csv', which must have 'Input' and
    'Expected Output' columns, runs parse_date on every usable row and
    prints a one-line accuracy summary.

    Rows are skipped when either column is missing (csv.DictReader fills
    absent fields with None) or when the row is a repeated header line.

    Returns:
        tuple: (correct, total) counts of matching and evaluated cases

    Raises:
        OSError: if the dataset file cannot be opened
    """
    test_cases = []
    # newline='' lets the csv module do its own newline handling.
    with open('Data/date_parser_testcases.csv', 'r', encoding='utf-8', newline='') as file:
        for row in csv.DictReader(file):
            raw_input = row.get('Input')
            raw_expected = row.get('Expected Output')

            # Short rows yield None values rather than missing keys, so
            # test the values; calling .strip on None would raise.
            if raw_input is None or raw_expected is None:
                continue
            # Skip header rows that got duplicated
            if raw_input == 'Input' or raw_expected == 'Expected Output':
                continue

            test_cases.append({
                'input': raw_input.strip('"'),
                'expected': raw_expected.strip('"')
            })

    total = len(test_cases)
    correct = sum(
        1 for case in test_cases
        if parse_date(case['input']) == case['expected']
    )

    # An empty or fully malformed dataset must not divide by zero.
    accuracy = correct / total * 100 if total else 0.0
    print(f"Date Parser Accuracy: {correct}/{total} correct ({accuracy:.1f}%)")

    return correct, total

# Run the dataset evaluation and keep the (correct, total) counts it returns
correct, total = test_date_parser()

Date Parser Accuracy: 99/99 correct (100.0%)
