Skip to content

Commit

Permalink
Merge pull request #51 from edissyum/20.03
Browse files Browse the repository at this point in the history
Improve process date
  • Loading branch information
nathan30 committed Jul 5, 2021
2 parents 8121209 + c025124 commit 2122a9a
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 31 deletions.
2 changes: 1 addition & 1 deletion src/locale/fr_FR.json
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
"Dec."
]
},
"dateRegex" : "(\\d{1,2}|\\d{1}\\w{2})\\s?([JFMASONDjfmasond][a-zA-Z_À-ÿ\\.,-]*|[/,-\\.]\\d{2}[/,-\\.])\\s?\\d{4}",
"dateRegex" : "(\\d{1,2}|\\d{1}\\w{2})\\s?([JFMASONDjfmasond][a-zA-Z_À-ÿ\\.,-]*|[/,-\\.\\s+]\\d{2}[/,-\\.\\s+])\\s?\\d{4}",
"dateTimeFormat": "%d %m %Y",
"dateFormat" : "%d-%m-%Y",
"subjectRegex" : "([o,O]bje[c]?t|[v,V,n,N]os\\s*[r,R][e,é]f(s?|[e,é]rence)+(\\.)?|[s,S]u[b]?je[c]?t|[a,A]vis\\s* d[',e])((\\s*:\\s*)|\\s+)\\s*.*",
Expand Down
66 changes: 36 additions & 30 deletions src/process/FindDate.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,42 +29,48 @@ def __init__(self, text, locale, log, config):
self.Locale = locale
self.Config = config

def format_date(self, _date):
    """
    Normalise one regex match into the locale date format and validate it.

    The matched text is cleaned ('1er' becomes '01'; ',', '/', '-' and '.'
    become spaces), month spellings listed in ``self.Locale.arrayDate`` are
    replaced by the key they are filed under, and the result is parsed with
    ``self.Locale.dateTimeFormat`` then re-rendered with
    ``self.Locale.formatDate``.

    Unless the configured ``OCForMaarch.timedelta`` is -1, a date older than
    that many days, or located in the future, is rejected.

    :param _date: regex match object whose ``group()`` is the candidate date
    :return: True when ``self.date`` holds an accepted, formatted date;
             False when the candidate was unparsable or rejected (in which
             case ``self.date`` is reset to an empty string)
    """
    candidate = _date.group().replace('1er', '01')
    # Turn every separator into a space so a single strptime pattern can
    # parse all the spellings the regex lets through.
    for separator in (',', '/', '-', '.'):
        candidate = candidate.replace(separator, ' ')

    # arrayDate maps a replacement value to the spellings it stands for
    # (presumably month number -> month names; confirm against the locale file).
    months = self.Locale.arrayDate
    for key in months:
        for month in months[key]:
            if month.lower() in candidate.lower():
                candidate = candidate.lower().replace(month.lower(), key)
                break

    self.date = candidate

    try:
        parsed = datetime.strptime(self.date, self.Locale.dateTimeFormat)
        self.date = parsed.strftime(self.Locale.formatDate)

        # Check if the date of the document isn't too old.
        # 62 (default value) is equivalent of 2 months.
        doc_date = datetime.strptime(self.date, self.Locale.formatDate)
        age_days = (datetime.now() - doc_date).days
        max_age = int(self.Config.cfg['OCForMaarch']['timedelta'])
    except ValueError:
        self.Log.info("Date wasn't in a good format : " + self.date)
        self.date = ''
        return False

    if max_age != -1 and (age_days > max_age or age_days < 0):
        self.Log.info("Date is older than " + str(self.Config.cfg['OCForMaarch']['timedelta']) + " days or in the future: " + self.date)
        self.date = ''
        # Report the rejection so the caller can try the next candidate
        # instead of stopping on an empty date.
        return False

    self.Log.info("Date found : " + self.date)
    return True

def run(self):
    """
    Override the default run function of threading package
    This will search for a date into the text of original PDF

    Two passes are made over ``self.text`` with ``self.Locale.regexDate``.
    The first works on a copy where whitespace between two digits has been
    removed; the second, only attempted while ``self.date`` is still empty,
    works on the untouched text. Each match is handed to
    ``self.format_date`` and the search stops at the first one it accepts.

    :return: True as soon as a candidate is accepted, False otherwise
    """
    date_pattern = re.compile(self.Locale.regexDate)

    # The re.sub is useful to fix space between numerics
    compacted_text = re.sub(r'(\d)\s+(\d)', r'\1\2', self.text)
    for _date in date_pattern.finditer(compacted_text):
        if self.format_date(_date):
            return True

    # Fall back to the raw text: gluing digits together can hide dates
    # whose parts are separated by spaces only.
    if not self.date:
        for _date in date_pattern.finditer(self.text):
            if self.format_date(_date):
                return True

    return False

0 comments on commit 2122a9a

Please sign in to comment.