In [15]:
import re
import requests
import json
import string
from datetime import datetime, timezone, timedelta
from constants import ROOM_IDS, tz, PROPER_NAMES
from typing import Dict
from dateutil.parser import parse, ParserError

def damerau_levenshtein_distance(s1, s2):
    '''
    Edit distance between s1 and s2 counting deletions, insertions,
    substitutions and swaps of two adjacent characters.

    Returns the distance as an int (the length of the other sequence when
    one of them is empty).
    '''
    rows, cols = len(s1), len(s2)

    # table[r][c] is the distance between s1[:r] and s2[:c];
    # row 0 / column 0 describe the empty prefix.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    for r in range(rows + 1):
        table[r][0] = r
    for c in range(cols + 1):
        table[0][c] = c

    for r in range(1, rows + 1):
        left_char = s1[r - 1]
        for c in range(1, cols + 1):
            right_char = s2[c - 1]
            step = 0 if left_char == right_char else 1
            best = min(
                table[r - 1][c] + 1,         # deletion
                table[r][c - 1] + 1,         # insertion
                table[r - 1][c - 1] + step,  # substitution
            )
            swapped = (
                r > 1 and c > 1
                and left_char == s2[c - 2]
                and s1[r - 2] == right_char
            )
            if swapped:
                best = min(best, table[r - 2][c - 2] + step)  # transposition
            table[r][c] = best

    return table[rows][cols]

# Vietnamese weekday phrases (several spellings each) mapped to the number of
# days after Monday. normalize_date adds this offset to the Monday of the
# current week. Only offsets 0-4 are listed here.
day_abs = {
    'thứ 2': 0,
    'thứ2': 0,
    'thứ hai': 0,
    'thứ 3': 1,
    'thứ3': 1,
    'thứ ba': 1,
    'thứ 4': 2,
    'thứ4': 2,
    'thứ tư': 2,
    'thứ 5': 3,
    'thứ5': 3,
    'thứ năm': 3,
    'thứ 6': 4,
    'thứ6': 4,
    'thứ sáu': 4,
}

# Relative-day phrases mapped to an offset in days from today
# (normalize_date adds the value to the current date).
day_rel = {
    'mai': 1,
    'nay': 0,
    'hôm qua': -1,
    'ngày kia': 2
}

# Phrases meaning "right now". normalize_time only checks whether one of the
# keys occurs in the text; it does not read the values.
at_the_moment = {
    'bây giờ': 0,
    'hiện nay': 0,
    'từ giờ': 0,
    'hiện giờ': 0,
}

# Default HH:MM:SS strings used by the extraction cells when a date was found
# but no explicit time was given.
START_TIME = '08:00:00'
END_TIME = '17:00:00'
START_NOON = '12:00:00'
END_NOON = '13:00:00'

# to be reused
# Alternative regex fragments that recognise a date expression; callers join
# them with '|' into a single pattern. In order:
#   1. "(hôm) nay" / "(hôm) qua"
#   2. "(ngày) mai" / "(ngày) kia"
#   3. a weekday ("thứ" + 2-7 or its name), optionally followed by a week
#      qualifier ("tuần này/sau/tới", with any number of trailing "nữa")
#   4. a numeric day/month(/year) date separated by '/', '.' or '-'
# Fragments 1-3 accept an optional "sáng"/"chiều" prefix. The fragments carry
# no flags of their own, so case sensitivity is decided by the caller.
date_regexes = [
    r'((sáng|chiều)?\s*(ngày)*\s*(hôm)*(\s)+(nay|qua)+)',
    r'((sáng|chiều)?\s*(ngày)*(\s)+(mai|kia)+)',
    r'(((sáng|chiều)?\s*(ngày)*\s*(thứ)(\s)*([2-7]|hai|ba|tư|năm|sáu|bảy))(( tuần)* (này|sau|tới)( nữa)*)*)',
    # for experimental support with t2-6 // EDIT: re.sub in correct_sentence handled it
    # r'(((sáng|chiều)?\s*(ngày)?\s*t[2-7])(( tuần)* (này|sau|tới)( nữa)*)*)',
    r'((0*[1-9]|[12][0-9]|3[01])[\/.\-](1[0-2]{1}|0*[1-9]{1})([\/.\-](20)?[0-9]{2})*\b)',
    # combined with the above
    # r'((0*[1-9]|[12][0-9]|3[01])[-](1[0-2]{1}|0*[1-9]{1})[-](20)?[0-9]{2})',
    # only use this if needed (like with TTS)
    # r'((sáng|chiều)?(ngày)*(\s)*[0-9]+(\s)*(tháng)(\s)*[0-9]+)'
]

def date_regex(message):
    '''
    Find every date expression in message.

    All fragments of date_regexes are OR-ed inside one outer group, so each
    result is a tuple of capture groups whose element 0 is the matched text.
    '''
    combined = '({})'.format('|'.join(date_regexes))
    return re.findall(combined, message)

def time1_regex(message):
    '''
    Find standalone clock times in message.

    Two patterns are tried: the "proper" H:MM[:SS] form, and an hour followed
    by a unit or separator (giờ, h, g, rưỡi, r, am, pm, ':') with optional
    minutes. Both patterns accept "8:30", so their matches are merged by
    position and any match overlapping an earlier (or longer) one is dropped.
    Each time therefore appears once, in the order it occurs in the message.

    Returns a list of matched strings, or None when nothing was found.
    '''
    patterns = (
        # proper format
        r'((1[0-9]|2[0-3]|0?[0-9]):([1-5][0-9]|0?[0-9])(:([1-5][0-9]|0?[0-9]))*)',
        r'((1[0-9]|2[0-3]|0?[0-9])(\s)*(giờ|h|g|rưỡi|r|am|pm|:)(\s)*([1-5][0-9]|0?[0-9])*\b(?!(/|-|\s*t)))',
    )
    candidates = [
        match
        for pattern in patterns
        for match in re.finditer(pattern, message)
    ]
    # Earliest match first; for matches starting at the same spot keep the
    # longest one (e.g. "8:30:15" over "8:30").
    candidates.sort(key=lambda match: (match.start(), -match.end()))

    found = []
    covered_until = 0
    for match in candidates:
        if match.start() < covered_until:
            # Same text already reported by the other pattern.
            continue
        found.append(match.group(1))
        covered_until = match.end()

    return found or None

def time_regex(message):
    '''
    Find a time range ("<start> <separator> <end>") in message.

    Returns an empty list when no range is present. Otherwise returns
    [[start_text], [end_text]] taken from the first range found, followed by
    the findall tuples of any "từ giờ / hiện nay / bây giờ"-style phrase.
    '''
    range_pattern = r'(\b((1[0-9]|2[0-3]|0?[0-9])(\s)*(giờ|h|g|rưỡi|r|am|pm|:)*(\s)*([1-5][0-9]|0?[0-9])*)(\s)*(-|~|đến|->|>|tới|to)(\s)*((1[0-9]|2[0-3]|0?[0-9])(\s)*(giờ|h|g|rưỡi|r|am|pm|:)(\s)*([1-5][0-9]|0?[0-9])*)(?!\/|\-))'
    ranges = re.findall(range_pattern, message)
    if not ranges:
        return ranges

    # Cut the first matched range at its separator(s); the outermost pieces
    # are the start and end expressions.
    pieces = re.split(r'(-|~|đến|->|>|tới|to)', ranges[0][0])
    result = [[pieces[0]], [pieces[-1]]]
    result.extend(re.findall(r'(((từ|hiện|bây)(\s))+(giờ|nay))', message))
    return result

def room_regex(message):
    '''
    Return the first room mentioned in message, or None.

    Words are checked left to right: a word equal to an official name in
    PROPER_NAMES yields that entry's key; a word found in ROOM_IDS yields the
    word itself, except that '13F' becomes 'fizz' or 'buzz' when the message
    mentions one of them.
    '''
    # patch first: NOTE: these two substitutions may need to be commented out
    # before the zone feature is implemented.
    text = re.sub(r"\b(tầng 13|[Ff]13|13f)\b", "13F", message)
    text = re.sub(r"\b(tầng 18|[Ff]18|18f)\b", "18F", text)

    for token in text.split():
        # official names take priority
        for room, proper_name in PROPER_NAMES.items():
            if proper_name == token:
                return room

        if token not in ROOM_IDS:
            continue

        if token == '13F':
            # in case someone writes "13F Fizz" (it happens)
            for nickname in ('fizz', 'buzz'):
                if nickname in text:
                    return nickname
        return token

    return None

def capacity_regex(message):
    '''
    Extract the number of attendees from message.

    Recognises a labelled form ("sl: 5", "size: 12", ...) and a counted form
    ("10 người", "10 ng", ...). Labelled matches take priority. Returns the
    first number as a string of digits, or None when nothing matched.
    '''
    labelled = re.findall(r'((số lượng|sl|size|số|so luong|số lương|sô lượng|sô lương)(\s)*(người)*(:)+(\s)*[0-9]+)', message)
    counted = re.findall(r'([0-9]+(\s)*(người|ng|mạng|nhân)\b)', message)
    hits = labelled + counted
    if not hits:
        return None

    digits = re.search(r'\d+', hits[0][0])
    return digits.group(0) if digits else None

def repeat_regex(message):
    '''
    Detect a recurrence request in message.

    returns day start, day end, and mode (weekly/monthly/daily)
    may return None for the fields it doesn't get (fallback on normal)

    The mode is 'W', 'M' or 'D', with '-2' appended for "every other"
    (2 / hai / cách / two / bi-). It is '_' when only a generic keyword
    ("định kỳ", "lặp lại", ...) was found without a period. When no
    recurrence wording is present at all the result is (None, None, None).
    '''
    recurring = ["định kỳ", "định kì", "lặp lại"]
    patterns = [
        r'(?i)((hằng|hàng|mỗi|cách)( (hai|2))? (tuần|tháng|ngày))',
        r'(?i)((month|(bi)?week|dai)ly)',
        r'(?i)(every\s?(two|2)?\s?(month|week|day))',
    ]
    patterns += [r'(?i)(' + keyword + ')' for keyword in recurring]

    matches = []
    for pattern in patterns:
        for found in re.findall(pattern, message):
            # findall yields tuples for multi-group patterns but plain strings
            # for the single-group keyword patterns; keep the full text only.
            matches.append(found if isinstance(found, str) else found[0])

    if not matches:
        return None, None, None

    any_date = '(' + '|'.join(date_regexes) + ')'

    # Group 7 of the start pattern / group 9 of the end pattern is the date
    # expression that follows the "from"/"until" wording.
    start_match = re.search(r'(?i)((từ |bắt đầu (vào )?)(ngày )?|from |start(ing)? (on )?)' + any_date, message)
    date_start = None
    if start_match is not None:
        date_start = normalize_date(start_match.group(7))[0]

    end_match = re.search(r'(?i)((đến |kết thúc (vào )?)(ngày )?|to |(un)?til(l)? |end(ing)? (on )?)' + any_date, message)
    date_end = None
    if end_match is not None:
        date_end = normalize_date(end_match.group(9))[0]

    # The patterns are case-insensitive, so compare in lower case.
    repeat_str = matches[0].lower()

    # '_' = recurrence requested but no period recognised.
    repeat = '_'
    if 'tuần' in repeat_str or 'week' in repeat_str:
        repeat = 'W'
    if 'tháng' in repeat_str or 'month' in repeat_str:
        repeat = 'M'
    if 'ngày' in repeat_str or 'dai' in repeat_str or 'day' in repeat_str:
        repeat = 'D'
    every_other = ('2', 'hai', 'cách', 'two', 'bi')
    if repeat != '_' and any(marker in repeat_str for marker in every_other):
        repeat += '-2'
    # experimental
    if 'cách nhật' in message:
        repeat = 'D-2'
    return date_start, date_end, repeat

def subject_regex(message):
    '''
    Build a meeting title from labelled lines of message.

    Each line is scanned for "<label> [: | là] <text up to the next comma>".
    Project labels come first wrapped in square brackets, then title labels,
    then content labels. Returns the assembled title, or None when it is empty.
    '''
    pattern = r'(?i)(title|tiêu đề|nội dung|dự án|project)\s*(\:|là)*\s*([^,]+)'
    hits = [
        hit
        for line in message.split('\n')
        for hit in re.findall(pattern, line)
    ]

    # (labels, template) pairs in output order.
    layout = (
        (('dự án', 'project'), '[{}] '),
        (('tiêu đề', 'title'), '{} '),
        (('nội dung',), '{} '),
    )
    parts = [
        template.format(text)
        for labels, template in layout
        for label, _, text in hits
        if label.lower() in labels
    ]

    title = ''.join(parts).strip()
    return title or None

def correct_sentence(sentence):
    '''
    Clean up a message before entity extraction.

    Joins "booth" with whatever follows it, turns newlines into spaces, drops
    the characters ; , ! % . and expands t2..t6 into the spelled-out weekday.
    Words longer than three characters are then replaced by the closest
    ROOM_IDS keyword when it is within the typo budget (1 edit for 4-5
    characters, 2 edits for longer words).
    '''
    sentence = re.sub(r'booth(\s)*', "booth", sentence)
    sentence = sentence.translate(str.maketrans('\n', ' ', ";,!%."))
    # HACK: correct t2-t6
    spelled_out = {2: 'hai', 3: 'ba', 4: 'tư', 5: 'năm', 6: 'sáu'}
    for digit, name in spelled_out.items():
        sentence = re.sub(r'\bt'+ str(digit) + r'\b', 'thứ ' + str(digit), sentence)
        sentence = re.sub(r'\bthứ '+ str(digit) + r'\b', f'thứ {name}', sentence)

    def closest_room(token):
        # Short words get no typo budget and are kept verbatim.
        size = len(token)
        if size <= 3:
            return token
        allowance = 1 if size < 6 else 2

        lowered = token.lower()
        best_room, best_cost = None, None
        for keyword in ROOM_IDS:
            cost = damerau_levenshtein_distance(lowered, keyword)
            # Strict '<' keeps the first keyword among equally close ones.
            if cost <= allowance and (best_cost is None or cost < best_cost):
                best_room, best_cost = keyword, cost
        return token if best_cost is None else best_room

    # split() already collapses runs of whitespace
    return " ".join(closest_room(token) for token in sentence.split())

def normalize_date(date):
    '''
    Turn a captured date expression into ("YYYY-MM-DD", apm).

    apm is 'pm' when the text contains "chiều", 'am' when it contains "sáng",
    otherwise None. The text is first handed to dateutil (day-first). If that
    fails, relative-day phrases (day_rel) and then weekday phrases (day_abs)
    plus week qualifiers are resolved against the current date in tz.
    '''
    now = datetime.now(tz)
    week_start = now - timedelta(days=now.weekday())

    apm = 'pm' if 'chiều' in date else ('am' if 'sáng' in date else None)

    try:
        parsed = parse(date, dayfirst=True)
    except ParserError:
        pass
    else:
        return parsed.strftime("%Y-%m-%d"), apm

    # Relative day ("mai", "nay", ...): offset from today.
    for phrase, offset in day_rel.items():
        if phrase in date:
            return (now + timedelta(days=offset)).strftime("%Y-%m-%d"), apm

    # Weekday: offset from this week's Monday (0 when none is recognised).
    day_delta = next(
        (offset for phrase, offset in day_abs.items() if phrase in date), 0)

    week_delta = 0
    if 'tuần' in date:
        if 'sau' in date or 'tới' in date:
            week_delta = 1
        # every "nữa" pushes one more week ahead ("tuần sau nữa nữa")
        week_delta += date.count('nữa')

    target = week_start + timedelta(days=day_delta, weeks=week_delta)
    if target < now:
        # already in the past: take the same weekday one week later
        target += timedelta(weeks=1)

    return target.strftime("%Y-%m-%d"), apm

def afternoon_normalize(time):
    '''
    Format a datetime as "HH:MM:00", moving hours before 07:00 forward by
    twelve hours first.
    '''
    shifted = time + timedelta(hours=12) if time.hour < 7 else time
    return shifted.strftime("%H:%M:00")

def normalize_time(time):
    '''
    Turn a captured time expression into an "HH:MM:00" string.

    Anything dateutil can parse (except a bare number) goes through
    afternoon_normalize directly. Otherwise a "right now" phrase yields the
    current time in tz, and failing that the first two numbers in the text are
    read as hours and minutes, with "rưỡi"/"r" forcing 30 minutes.
    '''
    time = time.strip()

    parsed = None
    # if it's just a number, skip dateutil and use the manual path below
    if not time.isnumeric():
        try:
            parsed = parse(time)
        except ParserError:
            parsed = None
    if parsed is not None:
        return afternoon_normalize(parsed)

    now = datetime.now(tz)
    if any(phrase in time for phrase in at_the_moment):
        return now.strftime("%H:%M:00")

    numbers = [int(chunk) for chunk in re.findall(r'\d+', time)]
    delta_hours = numbers[0] if numbers else 0
    delta_minutes = numbers[1] if len(numbers) > 1 else 0
    if 'rưỡi' in time or 'r' in time:
        delta_minutes = 30

    # Today with hours and minutes wound back to 00:00.
    midnight = now - timedelta(hours=now.hour, minutes=now.minute)
    return afternoon_normalize(
        midnight + timedelta(hours=delta_hours, minutes=delta_minutes))

def email_regex(message):
    '''
    Return every e-mail address found in message, in order of appearance.

    The pattern is written with lowercase character classes only, so it is
    applied case-insensitively. Otherwise addresses containing capital
    letters (e.g. "John.Doe@Example.COM") are missed when the message has not
    been lowercased yet.
    '''
    # pattern = r'([^@\s,]+@([^@\s\.,]+\.)+[^@\s\.,]+)'
    # from here: https://stackoverflow.com/questions/201323/how-to-validate-an-email-address-using-a-regular-expression
    pattern = r'''((?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\]))'''
    # The pattern has inner capture groups, so findall yields tuples;
    # element 0 is the whole address.
    return [x[0] for x in re.findall(pattern, message, flags=re.IGNORECASE)]

In [16]:
message = "Cho anh đặt phòng hàng tuần, phòng Bangkok từ 8-9h. Từ 23/4 đến 30/5."

In [17]:
# E-mail addresses and the subject are extracted from the message as typed,
# before it is lowercased.
attendees = email_regex(message)
subject = subject_regex(message)

# Everything below works on the lowercased message.
message = message.lower()
# message = correct_sentence(message)
room_id = room_regex(message)

date = date_regex(message)      # list of findall tuples; element 0 is the text
time = time_regex(message)      # [] or [[start], [end], ...]
repeat_start, repeat_end, repeat = repeat_regex(message)
capacity = capacity_regex(message)
time1 = time1_regex(message)    # list of single time strings, or None

# Output slots filled in by the following cells.
date_start = date_end = time_start = time_end = \
datetime_ = datetime_1 = None

In [18]:
print(attendees)
print(subject)
print(message)
print(room_id)
print(date)
print(time)
print(time1)
print(capacity)
print(repeat_start, repeat_end, repeat)

[]
None
cho anh đặt phòng hàng tuần, phòng bangkok từ 8-9h. từ 23/4 đến 30/5.
bangkok
[('23/4', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '23/4', '23', '4', '', ''), ('30/5', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '30/5', '30', '5', '', '')]
[['8'], ['9h']]
['9h']
None
2020-04-23 2020-05-30 W


In [19]:
# Look for a fully specified range "<date> <time> <separator> <date> <time>".
# The first pattern expects day-month-year dates, the second year-month-day;
# the second is only tried when the first finds nothing. Accepted separators
# between the two halves: ->, ~, -, >, đến, tới.
res = re.search(r'(\d{1,2})[\/.\-](\d{1,2})[\/.\-](\d{2}|\d{4})\s+(\d{1,2}):(\d{1,2})(:\d{1,2})?\s*(?:->|~|-|>|đến|tới)\s*(\d{1,2})[\/.\-](\d{1,2})[\/.\-](\d{2}|\d{4})\s+(\d{1,2}):(\d{1,2})(:\d{1,2})?',
                    message) or \
          re.search(r'(\d{2}|\d{4})[\/.\-](\d{1,2})[\/.\-](\d{1,2})\s+(\d{1,2}):(\d{1,2})(:\d{1,2})?\s*(?:->|~|-|>|đến|tới)\s*(\d{2}|\d{4})[\/.\-](\d{1,2})[\/.\-](\d{1,2})\s+(\d{1,2}):(\d{1,2})(:\d{1,2})?',
                    message)
res

In [20]:
    # Resolve date_start/date_end/time_start/time_end/datetime_1 from the
    # values captured earlier. `fullsize` becomes True when the message held a
    # complete "<date> <time> -> <date> <time>" range that dateutil could parse.
    fullsize = False
    if res is not None:
        # NOTE(review): the search pattern also accepts ~, -, >, đến and tới
        # as separators, but only '->' is split on here; any other separator
        # makes this unpacking raise ValueError — confirm and fix.
        d_f, d_t = res.group(0).split('->')
        try:
            # NOTE(review): parse() is called without dayfirst=True here,
            # unlike normalize_date — confirm this is intended.
            d_f = parse(d_f)
            d_t = parse(d_t)

            date_start = d_f.strftime("%Y-%m-%d")
            date_end = d_t.strftime("%Y-%m-%d")
            time_start = d_f.strftime("%H:%M:%S")
            time_end = d_t.strftime("%H:%M:%S")

            datetime_1 = f'{date_start} {time_start}'
            fullsize = True
        except ParserError:
            # Unparseable range: fall through to the piecewise logic below.
            pass

    if not fullsize:
        # deal with captured dates
        if len(date) > 0:
            # First captured date is the start; the second (if any) is the end.
            date_start, apm_start = normalize_date(date[0][0])
            date_end, apm_end = date_start, apm_start
            if len(date) > 1:
                date_end, apm_end = normalize_date(date[1][0])

            # With several dates and a known repeat end, the latest date
            # (ISO strings compare chronologically) becomes the repeat end.
            list_date = []
            for d in date:
                list_date.append(normalize_date(d[0])[0])
            if len(list_date) > 1 and repeat_end is not None:
                repeat_end = max(list_date)

        # deal with captured times
        if len(time) > 0:
            # A time range was found: entries 0 and 1 are start and end.
            time_start = time_end = normalize_time(time[0][0])
            if len(time) > 1:
                time_end = normalize_time(time[1][0])
        elif time1 is not None:
            # Only standalone times were found.
            time_start = time_end = normalize_time(time1[0])

            if len(time1) > 1:
                time_end = normalize_time(time1[1])
            else:
                # Single time: default to a one-hour slot, unless that would
                # run past hour 23.
                h_to = int(time_start[:2])
                if h_to < 23:
                    h_to += 1
                    time_end = f'{h_to:02}{time_end[2:]}'

        if date_start is not None and time_start is None:
            # Date without a time: use the working-day defaults, narrowed to
            # the afternoon or the morning when "chiều"/"sáng" was given.
            if apm_start == 'pm':
                time_start = END_NOON
            else:
                time_start = START_TIME
            if apm_end == 'am':
                time_end = START_NOON
            else:
                time_end = END_TIME

            # NOTE(review): time_start can only be None here when time1 is
            # None, so this branch looks unreachable — confirm.
            if time1 is not None:
                datetime_1 = f'{date_start} {normalize_time(time1[0])}'
        elif date_start is None and time_start is not None:
            # Time without a date: assume today.
            date_start = date_end = datetime.now(tz).strftime("%Y-%m-%d")
        elif date_start is not None and time_start is not None:
            datetime_1 = f'{date_start} {time_start}'
        
        # deal with repeat_start: replace time if needed
        if repeat is not None:
            if repeat_start is not None:
                # NOTE(review): date_start may still be None here (no date and
                # no time captured), which would make this comparison raise.
                if repeat_start > date_start:
                    date_start = repeat_start
                    if repeat_start > date_end:
                        date_end = repeat_start
                    # NOTE: time_start and time_end are not re-validated
                    # against each other after this adjustment.
            else:
                repeat_start = date_start

In [21]:
date_start, date_end

('2020-04-23', '2020-05-30')

In [22]:
time_start, time_end

('08:00:00', '09:00:00')

In [23]:
datetime_1

'2020-04-23 08:00:00'

In [24]:
repeat_start, repeat_end

('2020-04-23', '2020-05-30')

In [25]:
    # Combined "start -> end" string, only when a start date is known.
    if date_start is not None:
        datetime_ = f'{date_start} {time_start} -> {date_end} {time_end}'

    # Attendees become one comma-separated string, or None when empty.
    if len(attendees) == 0:
        attendees = None
    else:
        attendees = ','.join(attendees)

    # Package the variable named `title` as an entity dict, or return None
    # when that variable is None. start/end/confidence are fixed placeholders.
    # NOTE(review): `Dict or None` evaluates to plain `Dict` as an annotation.
    def wrap(title:str) -> Dict or None:
        # to trigger locals
        subject, room_id, capacity, repeat, attendees
        datetime_, datetime_1, repeat_start, repeat_end
        # and once triggered this works (LOL)
        # NOTE(review): eval() on a name string; only the fixed literals
        # listed below are passed in. A dict lookup would avoid eval entirely.
        obj = eval(title)
        if obj is None:
            return None
        return {
            "start": 0,
            "end": 1,
            "value": obj,
            "confidence": 1.0,
            "entity": title,
        }

    # One entity per non-None variable, in this fixed order.
    extracted = [x for x in map(wrap,
                                ["subject", "room_id", "capacity",
                                 "datetime_", "datetime_1",
                                 "repeat", "repeat_start", "repeat_end",
                                 'attendees']) \
                            if x is not None]

In [27]:
extracted

[{'start': 0,
  'end': 1,
  'value': 'bangkok',
  'confidence': 1.0,
  'entity': 'room_id'},
 {'start': 0,
  'end': 1,
  'value': '2020-04-23 08:00:00 -> 2020-05-30 09:00:00',
  'confidence': 1.0,
  'entity': 'datetime_'},
 {'start': 0,
  'end': 1,
  'value': '2020-04-23 08:00:00',
  'confidence': 1.0,
  'entity': 'datetime_1'},
 {'start': 0, 'end': 1, 'value': 'W', 'confidence': 1.0, 'entity': 'repeat'},
 {'start': 0,
  'end': 1,
  'value': '2020-04-23',
  'confidence': 1.0,
  'entity': 'repeat_start'},
 {'start': 0,
  'end': 1,
  'value': '2020-05-30',
  'confidence': 1.0,
  'entity': 'repeat_end'}]

In [6]:
# Search for a complete "<date> <time> <separator> <date> <time>" range:
# day-month-year dates first, then year-month-day as a fallback.
res = re.search(r'(\d{1,2})[\/.\-](\d{1,2})[\/.\-](\d{2}|\d{4})\s+(\d{1,2}):(\d{1,2})(:\d{1,2})?\s*(?:->|~|-|>|đến|tới)\s*(\d{1,2})[\/.\-](\d{1,2})[\/.\-](\d{2}|\d{4})\s+(\d{1,2}):(\d{1,2})(:\d{1,2})?',
                    message) or \
          re.search(r'(\d{2}|\d{4})[\/.\-](\d{1,2})[\/.\-](\d{1,2})\s+(\d{1,2}):(\d{1,2})(:\d{1,2})?\s*(?:->|~|-|>|đến|tới)\s*(\d{2}|\d{4})[\/.\-](\d{1,2})[\/.\-](\d{1,2})\s+(\d{1,2}):(\d{1,2})(:\d{1,2})?',
                    message)

In [7]:
res

In [8]:
    # Fill date_start/date_end/time_start/time_end/datetime_1 from `res`,
    # `date`, `time`, `time1` and the repeat_* values. `fullsize` is True once
    # a complete date-time range has been parsed from `res`.
    fullsize = False
    if res is not None:
        # NOTE(review): only '->' is split on, although the search pattern
        # accepts other separators too; those would raise ValueError here.
        d_f, d_t = res.group(0).split('->')
        try:
            d_f = parse(d_f)
            d_t = parse(d_t)

            date_start = d_f.strftime("%Y-%m-%d")
            date_end = d_t.strftime("%Y-%m-%d")
            time_start = d_f.strftime("%H:%M:%S")
            time_end = d_t.strftime("%H:%M:%S")

            datetime_1 = f'{date_start} {time_start}'
            fullsize = True
        except ParserError:
            # Fall back to the piecewise resolution below.
            pass

    if not fullsize:
        # deal with captured dates
        if len(date) > 0:
            # Start is the first date; end is the second one when present.
            date_start, apm_start = normalize_date(date[0][0])
            date_end, apm_end = date_start, apm_start
            if len(date) > 1:
                date_end, apm_end = normalize_date(date[1][0])

            # The latest captured date replaces an already-known repeat end.
            list_date = []
            for d in date:
                list_date.append(normalize_date(d[0])[0])
            if len(list_date) > 1 and repeat_end is not None:
                repeat_end = max(list_date)

        # deal with captured times
        if len(time) > 0:
            time_start = time_end = normalize_time(time[0][0])
            if len(time) > 1:
                time_end = normalize_time(time[1][0])
        elif time1 is not None:
            time_start = time_end = normalize_time(time1[0])

            if len(time1) > 1:
                time_end = normalize_time(time1[1])
            else:
                # A single time gets a one-hour slot (not past hour 23).
                h_to = int(time_start[:2])
                if h_to < 23:
                    h_to += 1
                    time_end = f'{h_to:02}{time_end[2:]}'

        if date_start is not None and time_start is None:
            # No time given: default slot, narrowed by morning/afternoon hints.
            if apm_start == 'pm':
                time_start = END_NOON
            else:
                time_start = START_TIME
            if apm_end == 'am':
                time_end = START_NOON
            else:
                time_end = END_TIME

            if time1 is not None:
                datetime_1 = f'{date_start} {normalize_time(time1[0])}'
        elif date_start is None and time_start is not None:
            # No date given: assume today.
            date_start = date_end = datetime.now(tz).strftime("%Y-%m-%d")
        elif date_start is not None and time_start is not None:
            datetime_1 = f'{date_start} {time_start}'
        
        # deal with repeat_start: replace time if needed
        if repeat is not None:
            if repeat_start is not None:
                # A later repeat start pushes the booking dates forward.
                if repeat_start > date_start:
                    date_start = repeat_start
                    if repeat_start > date_end:
                        date_end = repeat_start
                    # NOTE: time_start vs time_end is not re-checked here.
            else:
                repeat_start = date_start

In [9]:
print(date_start)
print(date_end)
print(time_start)
print(time_end)

None
None
None
None


In [9]:
import re
from datetime import datetime, timezone, timedelta

In [35]:
message = "Cho anh đặt phòng vào thứ 2 hàng tuần từ 13/4 đến 27/5, phòng Tokyo lúc 9-10h sáng"

In [36]:
repeat = re.findall(
        r'(?i)((hằng|hàng|mỗi|cách)( (hai|2))? (tuần|tháng|ngày))', message)
repeat

[('hàng tuần', 'hàng', '', '', 'tuần')]

In [37]:
repeat += re.findall(r'(?i)((month|(bi)?week|dai)ly)', message)
repeat += re.findall(r'(?i)(every\s?(two|2)?\s?(month|week|day))', message)
recurring = ["định kỳ", "định kì", "lặp lại"]
for regex in recurring:
    repeat += re.findall(r'(?i)(' + regex + ')', message)

In [38]:
# Regex fragments for date expressions, meant to be joined with '|':
# today/yesterday phrases, tomorrow/day-after phrases, a weekday with an
# optional week qualifier, and a numeric day/month(/year) date.
date_regexes = [
    r'((sáng|chiều)?\s*(ngày)*\s*(hôm)*(\s)+(nay|qua)+)',
    r'((sáng|chiều)?\s*(ngày)*(\s)+(mai|kia)+)',
    r'(((sáng|chiều)?\s*(ngày)*\s*(thứ)(\s)*([2-7]|hai|ba|tư|năm|sáu|bảy))(( tuần)* (này|sau|tới)( nữa)*)*)',
    # for experimental support with t2-6 // EDIT: re.sub in correct_sentence handled it
    # r'(((sáng|chiều)?\s*(ngày)?\s*t[2-7])(( tuần)* (này|sau|tới)( nữa)*)*)',
    r'((0*[1-9]|[12][0-9]|3[01])[\/.\-](1[0-2]{1}|0*[1-9]{1})([\/.\-](20)?[0-9]{2})*\b)',
    # combined with the above
    # r'((0*[1-9]|[12][0-9]|3[01])[-](1[0-2]{1}|0*[1-9]{1})[-](20)?[0-9]{2})',
    # only use this if needed (like with TTS)
    # r'((sáng|chiều)?(ngày)*(\s)*[0-9]+(\s)*(tháng)(\s)*[0-9]+)'
]

In [39]:

def date_regex(message):
    '''
    Return the findall result of all date_regexes fragments OR-ed together.

    Each entry is a tuple of capture groups; element 0 is the matched text.
    '''
    combined = '({})'.format('|'.join(date_regexes))
    return re.findall(combined, message)

[(' thứ 2',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  ' thứ 2',
  ' thứ 2',
  '',
  '',
  'thứ',
  ' ',
  '2',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  ''),
 ('13/4',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '13/4',
  '13',
  '4',
  '',
  ''),
 ('27/5',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '27/5',
  '27',
  '5',
  '',
  '')]

In [27]:
# if len(repeat) == 0:
#     return None, None, None

date_start = re.search(r'(?i)((từ |bắt đầu (vào )?)(ngày )?|from |start(ing)? (on )?)(' + '|'.join(date_regexes) + ')', message)

In [30]:
date_start

<_sre.SRE_Match object; span=(28, 45), match='từ thứ 2 tuần sau'>

In [29]:
'(?i)((từ |bắt đầu (vào )?)(ngày )?|from |start(ing)? (on )?)(((sáng|chiều)?\\s*(ngày)*\\s*(hôm)*(\\s)+(nay|qua)+)|((sáng|chiều)?\\s*(ngày)*(\\s)+(mai|kia)+)|(((sáng|chiều)?\\s*(ngày)*\\s*(thứ)(\\s)*([2-7]|hai|ba|tư|năm|sáu|bảy))(( tuần)* (này|sau|tới)( nữa)*)*)|((0*[1-9]|[12][0-9]|3[01])[\\/.\\-](1[0-2]{1}|0*[1-9]{1})([\\/.\\-](20)?[0-9]{2})*\\b))'

'(?i)((từ |bắt đầu (vào )?)(ngày )?|from |start(ing)? (on )?)(((sáng|chiều)?\\s*(ngày)*\\s*(hôm)*(\\s)+(nay|qua)+)|((sáng|chiều)?\\s*(ngày)*(\\s)+(mai|kia)+)|(((sáng|chiều)?\\s*(ngày)*\\s*(thứ)(\\s)*([2-7]|hai|ba|tư|năm|sáu|bảy))(( tuần)* (này|sau|tới)( nữa)*)*)|((0*[1-9]|[12][0-9]|3[01])[\\/.\\-](1[0-2]{1}|0*[1-9]{1})([\\/.\\-](20)?[0-9]{2})*\\b))'

In [19]:
date_end = re.search(r'(?i)((đến |kết thúc (vào )?)(ngày )?|to |(un)?til(l)? |end(ing)? (on )?)(' + '|'.join(date_regexes) + ')', message)
    

In [20]:
date_end

In [38]:
def normalize_date(date):
    '''
    Convert a captured date expression into ("YYYY-MM-DD", apm).

    apm is 'pm' for text containing "chiều", 'am' for "sáng", else None.
    dateutil (day-first) is tried first; on failure the text is resolved via
    day_rel (offset from today) or day_abs plus week qualifiers (offset from
    this week's Monday), using the current date in tz.
    '''
    now = datetime.now(tz)
    week_start = now - timedelta(days=now.weekday())

    apm = 'pm' if 'chiều' in date else ('am' if 'sáng' in date else None)

    try:
        parsed = parse(date, dayfirst=True)
    except ParserError:
        pass
    else:
        return parsed.strftime("%Y-%m-%d"), apm

    # Relative day phrases win over weekday phrases.
    for phrase, offset in day_rel.items():
        if phrase in date:
            return (now + timedelta(days=offset)).strftime("%Y-%m-%d"), apm

    # First weekday phrase found, or Monday (0) when there is none.
    day_delta = next(
        (offset for phrase, offset in day_abs.items() if phrase in date), 0)

    week_delta = 0
    if 'tuần' in date:
        if 'sau' in date or 'tới' in date:
            week_delta = 1
        # each "nữa" adds one more week ("tuần sau nữa nữa")
        week_delta += date.count('nữa')

    target = week_start + timedelta(days=day_delta, weeks=week_delta)
    if target < now:
        # that day has passed already: use the following week
        target += timedelta(weeks=1)

    return target.strftime("%Y-%m-%d"), apm


In [43]:
date_start.group(7)

'23/3'

In [40]:
# Find the date that follows "from"/"starting" wording; group 7 of the pattern
# is the date expression itself. The result stays None when nothing matches.
date_start = re.search(r'(?i)((từ |bắt đầu (vào )?)(ngày )?|from |start(ing)? (on )?)(' + '|'.join(date_regexes) + ')', message)
if date_start is None:
    date_start = None
else:
    date_start = normalize_date(date_start.group(7))[0]

# Same for "to"/"until"/"ending" wording; here the date expression is group 9
# because this prefix contains two more capture groups.
date_end = re.search(r'(?i)((đến |kết thúc (vào )?)(ngày )?|to |(un)?til(l)? |end(ing)? (on )?)(' + '|'.join(date_regexes) + ')', message)
if date_end is None:
    date_end = None
else:
    date_end = normalize_date(date_end.group(9))[0]

NameError: name 'tz' is not defined

In [25]:
def repeat_regex(message):
    '''
    Detect a recurrence request in message.

    returns day start, day end, and mode (weekly/monthly/daily)
    may return None for the fields it doesn't get (fallback on normal)

    The mode is 'W', 'M' or 'D', with '-2' appended for "every other"
    (2 / hai / cách / two / bi-). It is '_' when only a generic keyword
    ("định kỳ", "lặp lại", ...) was found without a period. When no
    recurrence wording is present at all the result is (None, None, None).
    '''
    recurring = ["định kỳ", "định kì", "lặp lại"]
    patterns = [
        r'(?i)((hằng|hàng|mỗi|cách)( (hai|2))? (tuần|tháng|ngày))',
        r'(?i)((month|(bi)?week|dai)ly)',
        r'(?i)(every\s?(two|2)?\s?(month|week|day))',
    ]
    patterns += [r'(?i)(' + keyword + ')' for keyword in recurring]

    matches = []
    for pattern in patterns:
        for found in re.findall(pattern, message):
            # findall yields tuples for multi-group patterns but plain strings
            # for the single-group keyword patterns; keep the full text only.
            matches.append(found if isinstance(found, str) else found[0])

    if not matches:
        return None, None, None

    any_date = '(' + '|'.join(date_regexes) + ')'

    # Group 7 of the start pattern / group 9 of the end pattern is the date
    # expression that follows the "from"/"until" wording.
    start_match = re.search(
        r'(?i)((từ |bắt đầu (vào )?)(ngày )?|from |start(ing)? (on )?)' + any_date, message)
    date_start = None
    if start_match is not None:
        date_start = normalize_date(start_match.group(7))[0]

    end_match = re.search(
        r'(?i)((đến |kết thúc (vào )?)(ngày )?|to |(un)?til(l)? |end(ing)? (on )?)' + any_date, message)
    date_end = None
    if end_match is not None:
        date_end = normalize_date(end_match.group(9))[0]

    # The patterns are case-insensitive, so compare in lower case.
    repeat_str = matches[0].lower()

    # '_' = recurrence requested but no period recognised.
    repeat = '_'
    if 'tuần' in repeat_str or 'week' in repeat_str:
        repeat = 'W'
    if 'tháng' in repeat_str or 'month' in repeat_str:
        repeat = 'M'
    if 'ngày' in repeat_str or 'dai' in repeat_str or 'day' in repeat_str:
        repeat = 'D'
    every_other = ('2', 'hai', 'cách', 'two', 'bi')
    if repeat != '_' and any(marker in repeat_str for marker in every_other):
        repeat += '-2'
    # experimental
    if 'cách nhật' in message:
        repeat = 'D-2'
    return date_start, date_end, repeat


In [26]:
print(repeat_regex(message))

NameError: name 'datetime' is not defined