In [11]:
import os

In [2]:
# Relative path of the directory whose files are listed with os.listdir and
# read by read_text.
DIRECTORY_NAME = "text"

In [3]:
# Index every file by the integer prefix that precedes the first underscore
# (e.g. "10_0001.txt" -> 10) so the ordering is numeric rather than lexicographic.
filenames = {
    int(name.split("_")[0]): name
    for name in os.listdir(DIRECTORY_NAME)
}
# Walk the prefixes in ascending order. Iterating over the sorted keys (instead
# of indexing 1..len(filenames)) gives the same result for contiguous numbering
# and no longer raises KeyError when a number is missing or the sequence does
# not start at 1.
sorted_filenames = [filenames[index] for index in sorted(filenames)]

In [4]:
# Display the ordered list of filenames as the cell output.
sorted_filenames

['1_0000.txt',
 '2_0011.txt',
 '3_0022.txt',
 '4_0033.txt',
 '5_0044.txt',
 '6_0054.txt',
 '7_0055.txt',
 '8_0056.txt',
 '9_0057.txt',
 '10_0001.txt',
 '11_0002.txt',
 '12_0003.txt',
 '13_0004.txt',
 '14_0005.txt',
 '15_0006.txt',
 '16_0007.txt',
 '17_0008.txt',
 '18_0009.txt',
 '19_0010.txt',
 '20_0012.txt',
 '21_0013.txt',
 '22_0014.txt',
 '23_0015.txt',
 '24_0016.txt',
 '25_0017.txt',
 '26_0018.txt',
 '27_0019.txt',
 '28_0020.txt',
 '29_0021.txt',
 '30_0023.txt',
 '31_0024.txt',
 '32_0025.txt',
 '33_0026.txt',
 '34_0027.txt',
 '35_0028.txt',
 '36_0029.txt',
 '37_0030.txt',
 '38_0031.txt',
 '39_0032.txt',
 '40_0034.txt',
 '41_0035.txt',
 '42_0036.txt',
 '43_0037.txt',
 '44_0038.txt',
 '45_0039.txt',
 '46_0040.txt',
 '47_0041.txt',
 '48_0042.txt',
 '49_0043.txt',
 '50_0045.txt',
 '51_0046.txt',
 '52_0047.txt',
 '53_0048.txt',
 '54_0049.txt',
 '55_0050.txt',
 '56_0051.txt',
 '57_0052.txt',
 '58_0053.txt']

In [5]:
import codecs


def read_text(filename):
    """Return the UTF-8 decoded content of ``filename`` located in ``DIRECTORY_NAME``.

    The built-in ``open`` is used instead of ``codecs.open``: the latter is
    deprecated and reads the file in binary mode, so Windows line endings
    (``\\r\\n``) would be returned untouched. Text mode normalises them to
    ``\\n``, which is what the newline-based splitting done on the result expects.

    :param filename: name of the file, relative to ``DIRECTORY_NAME``
    :return: the whole file content as a single string
    """
    path = os.path.join(DIRECTORY_NAME, filename)
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def split_page(text):
    """Cut a page into entries and flatten each entry onto a single line.

    Entries are delimited by a period immediately followed by a newline; that
    delimiter is consumed by the split. Newlines remaining inside an entry are
    replaced by a single space.

    :param text: raw text of one page
    :return: list of one-line entry strings
    """
    entries = []
    for raw_entry in text.split(".\n"):
        entries.append(raw_entry.replace("\n", " "))
    return entries


class Abbreviations:
    """Abbreviation strings grouped as class-level constants.

    NOTE(review): only ``B`` and ``M`` are referenced by the code visible in
    this notebook; the expansions of the other abbreviations are not shown
    here and should be documented once confirmed.
    """
    # Stored as the "work" value of citations matched by the bulletin pattern.
    B = "B."
    # Stored as the "work" value of citations matched by the memoire pattern.
    M = "M."
    MC = "M.C."
    MT = "M.T."
    SAT = "S.A.T."
    COMM = "Comm."
    CONF = "Conf."
    CORRESP = "Corresp."
    PUBLIC = "Public."
    FAM = "Fam."
    ILL = "Ill."
    # The citation patterns accept an optional " sq." suffix after the page number.
    SQ = "sq."

import re

class Extractor:
    """Regex-based helpers that pull structured data out of a single index entry.

    An entry is a one-line string such as
    ``"Abilly - Forges, B.1996, 669 ; sidérurgie, voir Guichard - Habitat, voir Rousseau"``.
    """

    # "B.<year>, <page>", with an optional space after "B." and an optional " sq." suffix.
    _BULLETIN_PATTERN = re.compile(r"B\.( )?(?P<year>[0-9]+), (?P<page>[0-9]+)( sq\.)?")
    # "M.<roman numeral>, <page>", same optional parts as the bulletin pattern.
    _MEMOIRE_PATTERN = re.compile(r"M\.( )?(?P<number>[IVXLCDM]+), (?P<page>[0-9]+)( sq\.)?")
    # "voir <name>" or "voir aussi <name>". The name is made of words and spaces;
    # a hyphen is accepted only when it directly joins two word characters, so
    # "Millet-Richard" is kept whole while the " - " separator still ends the name.
    _SEE_PATTERN = re.compile(r"voir( aussi)? (?P<named_entity>[ \w]+(?:(?<=\w)-\w[ \w]*)*)")

    @staticmethod
    def extract_citation(text: str) -> list:
        """Return every bulletin and memoire citation found in ``text``.

        All bulletin citations come first (in order of appearance), followed by
        all memoire citations (in order of appearance).

        >>> Extractor.extract_citation("Abilly - Forges, B.1996, 669 ; M.LXIII, 167")
        [{'work': 'B.', 'year': '1996', 'page': '669'}, {'work': 'M.', 'number': 'LXIII', 'page': '167'}]

        :param text: one index entry
        :return: list of dicts with keys ``work``/``year``/``page`` for bulletins
            and ``work``/``number``/``page`` for memoires; values are strings
        """
        # finditer resumes exactly where the previous match ended, so no
        # character is skipped between two consecutive matches.
        citations = [
            dict(work=Abbreviations.B, year=match.group("year"), page=match.group("page"))
            for match in Extractor._BULLETIN_PATTERN.finditer(text)
        ]
        citations.extend(
            dict(work=Abbreviations.M, number=match.group("number"), page=match.group("page"))
            for match in Extractor._MEMOIRE_PATTERN.finditer(text)
        )
        return citations

    @staticmethod
    def extract_named_entity(text: str) -> str:
        """Return the part of ``text`` that precedes the first ``" - "`` separator.

        When the separator is absent the whole text is returned.

        >>> Extractor.extract_named_entity("Abilly - Forges, B.1996, 669")
        'Abilly'

        :param text: one index entry
        :return: the leading segment of the entry
        """
        return text.split(" - ")[0]

    @staticmethod
    def see_this(text: str) -> list:
        """Return the names referenced by ``voir`` / ``voir aussi`` in ``text``.

        >>> Extractor.see_this("Habitat, voir Rousseau - Fouilles, voir aussi Millet-Richard ; B.2000, 382")
        [{'named_entity': 'Rousseau'}, {'named_entity': 'Millet-Richard'}]

        :param text: one index entry
        :return: list of ``{"named_entity": name}`` dicts, in order of appearance
        """
        references = []
        for match in Extractor._SEE_PATTERN.finditer(text):
            # The character class also matches spaces, so trim the one that
            # usually precedes the next separator.
            name = match.group("named_entity").strip()
            if name:
                references.append(dict(named_entity=name))
        return references




In [6]:
# Collect every page of every file, in file order. A page is a chunk of text
# delimited by a blank line; empty chunks are dropped and each remaining chunk
# is turned into its list of entries by split_page.
pages = []
for filename in sorted_filenames:
    text = read_text(filename)
    pages.extend(map(split_page, filter(None, text.split("\n\n"))))

In [7]:
#pages[0:2]

In [8]:
# Print, for each entry that holds at least one citation, the 1-based page
# number, the entry's leading segment and the extracted citations.
for page_number, page in enumerate(pages, start=1):
    for entry in page:
        citations = Extractor.extract_citation(entry)
        if not citations:
            continue
        print(page_number, Extractor.extract_named_entity(entry), citations)

1 A Abbeville [{'work': 'M.', 'number': 'LXIII', 'page': '167'}]
1 ABD-EL-KADER [{'work': 'B.', 'year': '2000', 'page': '403'}]
1 Abilly [{'work': 'B.', 'year': '2002', 'page': '9'}, {'work': 'B.', 'year': '1996', 'page': '669'}, {'work': 'B.', 'year': '2000', 'page': '382'}, {'work': 'B.', 'year': '1997', 'page': '344'}, {'work': 'B.', 'year': '2000', 'page': '382'}]
1 Ablevois [{'work': 'M.', 'number': 'LXII', 'page': '113'}]
1 Abreuvoir [{'work': 'B.', 'year': '2002', 'page': '51'}]
1 Abysme de Candes (l') [{'work': 'M.', 'number': 'LXII', 'page': '120'}]
1 Académie des Sciences, Arts et Belles-Lettres de Touraine [{'work': 'B.', 'year': '1994', 'page': '69'}]
1 Académie française [{'work': 'B.', 'year': '2003', 'page': '171'}]
1 ACHON (François d') [{'work': 'B.', 'year': '1997', 'page': '75'}]
1 1998, 384, 671 [{'work': 'B.', 'year': '1997', 'page': '350'}]
1 ACIGNÉ (Anne-Marguerite d’} [{'work': 'B.', 'year': '2001', 'page': '209'}]
1 ADALARD [{'work': 'M.', 'number': 'LXIL', 'pa

In [9]:
from sat_biblio_referencement.database.database_manager import DatabaseManager

In [13]:
# Instantiate the project's DatabaseManager and obtain a session from it.
# NOTE(review): what create_database() and prepare() actually do is defined in
# sat_biblio_referencement, not in this notebook — confirm before re-running
# this cell against an existing database.
dbm = DatabaseManager()
dbm.create_database()
dbm.prepare()
session = dbm.get_session()

2022-11-27 18:15:28,908 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-11-27 18:15:28,910 INFO sqlalchemy.engine.Engine COMMIT
