In [153]:
import os
import re

import pandas as pd

from bs4 import BeautifulSoup
from tqdm import tqdm

In [150]:
def convert_to_8digit(photo_id):
    return "0" * (8 - len(photo_id)) + photo_id

In [151]:
df = pd.read_csv("./datasets/tolstoy_photos_no_empty_columns.csv", sep=";", encoding="utf-8")
df["id"] = df["id"].astype("str").apply(convert_to_8digit)
df.head()

Unnamed: 0,id,AUTHOR,COMPNAM,CREAT,CREAT1,CREAT2,DESCRI,GEOGR,IDENTIF,INSCR,...,IZGOT,NCOMP,NFOND,SIZES,SPIEX,SPIRUBR,VLAD,path_xml,path_miniature,path_fullsize
0,115964,Кулаков П.Е.,Фотография для стереоскопа,1908,1908,1908,-1,Ясная Поляна,ГМТ КП-11486/21,-1,...,"Стереоскопическое издательство ""Свет""",-1,Фотофонд,"7,4х7,1 (одна фотография) 8,8х17,9 (бланк)",-1,Портреты (в т.ч. групповые),Баскаков Л.И.,./xmls/00115964.xml,./miniatures/00115964.xml,./fullsize/00115964.xml
1,136165,Бодянский И.А.,Увеличенный переснимок,1903 г.,1903,1903,-1,Ясная Поляна,ГМТ КП-5954/3,-1,...,-1,-1,Фотофонд,"17,8х23,5",-1,1900-е годы (юбилей в 1908г – отдельная рубрика),Долинино-Иванская Александра Леонидовна (Толстая),./xmls/00136165.xml,./miniatures/00136165.xml,./fullsize/00136165.xml
2,136603,Смирнов,-1,9 ноября 1910 г.,1910,1910,-1,Ясная Поляна,ГМТ КП-5680/6,-1,...,-1,-1,Фотофонд,"10,3х16,5",-1,Похороны Л.Н.Толстого в Ясной Поляне,Замятина Н. П.,./xmls/00136603.xml,./miniatures/00136603.xml,./fullsize/00136603.xml
3,106918,Соловьев В.С.,Две могилы в общей чугунной ограде. За решётко...,1932 г.,1932,1932,-1,с. Никольское Московской губ.,ГМТ КП-7448/1,-1,...,-1,-1,Фотофонд,"10,3х16,7",-1,Могилы и похороны разных лиц,Соловьев Владимир Сергеевич,./xmls/00106918.xml,./miniatures/00106918.xml,./fullsize/00106918.xml
4,142182,-1,-1,1930-е гг.?,1930,1939,-1,-1,ГМТ КП-13123/82,-1,...,-1,Альбом фотографий Е.С. Денисенко.,Фотофонд,"6,0х5,4",-1,"Писатели, поэты, литераторы",из архива Денисенко Е.С.,./xmls/00142182.xml,./miniatures/00142182.xml,./fullsize/00142182.xml


# `<titleStmt>`

In [155]:
def assemble_title_stmt(row_dict):
    if row_dict["AUTHOR"] != "-1":
        author = "<author>{}</author>".format(row_dict["AUTHOR"])
    else:
        author = ""
    funder = "<funder>Государственный музей Л.Н. Толстого</funder>"
    titleStmt = "<titleStmt>{}{}</titleStmt>".format(author, funder)
    return titleStmt

# `<publicationStmt>`

In [133]:
def assemble_publication_stmt(row_dict):
    acquisition = "<acquisition>{}</acquisition>".format(row_dict["IWAY"]) \
                if row_dict["IWAY"] != "-1" else ""
    nfond = "<collection type=\"nfond\">{}</collection>".format(row_dict["NFOND"]) \
            if row_dict["NFOND"] != "-1" else ""
    spiex = "<collection type=\"spiex\">{}</collection>".format(row_dict["SPIEX"]) \
            if row_dict["SPIEX"] != "-1" else ""
    identif = "<idno type=\"identif\">{}</idno>".format(row_dict["IDENTIF"]) \
            if row_dict["IDENTIF"] != "-1" else ""
    inv = "<idno type=\"inv\">{}</idno>".format(row_dict["INV"]) \
        if row_dict["INV"] != "-1" else ""
    category = "<category>{}</category>".format(row_dict["SPIRUBR"]) \
            if row_dict["SPIRUBR"] != "-1" else ""
    publisher = "<publisher>{}</publisher>".format(row_dict["VLAD"]) \
            if row_dict["VLAD"] != "-1" else ""
    publicationStmt = "<publicationStmt>{}{}{}{}{}{}{}</publicationStmt>".format(acquisition, 
                        nfond, spiex, identif, inv, category, publisher)
    return publicationStmt

Проверка:

In [134]:
stmt = assemble_publication_stmt(sample_dict)
root = etree.fromstring(stmt)
print(etree.tostring(root, pretty_print=True, encoding="utf-8").decode())

<publicationStmt>
  <acquisition>Закупка</acquisition>
  <collection type="nfond">Фотофонд</collection>
  <idno type="identif">ГМТ КП-10419</idno>
  <idno type="inv">Ф-9603</idno>
  <category>Дороги от станции к Усадьбе</category>
  <publisher>Шмелькин М. Г.</publisher>
</publicationStmt>



# `<sourceDesc>`

In [100]:
def extract_creat(row_dict):
    if row_dict["CREAT"] == "-1":
        dateCreat = ""
        if row_dict["CREAT1"] != "-1":
            dateCreat += "<date type=\"created notBefore\">{}</date>".format(row_dict["CREAT1"])
        if row_dict["CREAT2"] != "-1":
            dateCreat += "<date type=\"created notAfter\">{}</date>".format(row_dict["CREAT2"])
    else:
        dateCreat = "<date type=\"created\">{}</date>".format(row_dict["CREAT"])
    return dateCreat

In [59]:
def extract_sizes(sizes_str, reg_sizes):
    extraction = re.search(reg_sizes, line)
    width = extraction.group(1)
    height = extraction.group(2)
    return width, height

In [141]:
def assemble_source_desc(row_dict, reg_sizes):
    origPlace = "<origPlace>{}</origPlace>".format(row_dict["GEOGR"]) \
        if row_dict["GEOGR"] != "-1" else ""
    dateCreat = extract_creat(row_dict)
    distributor = "<distributor>{}</distributor>".format(row_dict["IZGOT"]) \
        if row_dict["IZGOT"] != "-1" else ""
    source = "<source>{}</source>".format(row_dict["NCOMP"]) \
        if row_dict["NCOMP"] != "-1" else ""
    if row_dict["SIZES"] != "-1":
        width, height = extract_sizes(row_dict["SIZES"], reg_sizes)
        dimensions = "<dimensions><width>{} см</width><height>{} см</height></dimensions>".format(width, height)
    else:
        dimensions = ""
    desc = "<desc>{}</desc>".format(row_dict["COMPNAM"]) \
        if row_dict["COMPNAM"] != "-1" else ""
    metamark = "<metamark>{}</metamark>".format(row_dict["INSCR"]) \
        if row_dict["INSCR"] != "-1" else ""
    sourceDesc = "<sourceDesc>{}{}{}{}{}{}{}</sourceDesc>".format(origPlace, 
                dateCreat, distributor, source, dimensions, desc, metamark)
    return sourceDesc

Проверка:

In [143]:
stmt = assemble_source_desc(sample_dict, reg_sizes)
root = etree.fromstring(stmt)
print(etree.tostring(root, pretty_print=True, encoding="utf-8").decode())

<sourceDesc>
  <origPlace>Ясная Поляна</origPlace>
  <date type="created">1903 г.</date>
  <dimensions>
    <width>7,4 см</width>
    <height>7,1 см</height>
  </dimensions>
  <desc>Увеличенный переснимок</desc>
</sourceDesc>



# `<revisionDesc>`

In [83]:
def assemble_revision_desc(row_dict):
    return "<revisionDesc><change when=\"2020-04-15\">конвертация в TEI из КАМИС</change></revisionDesc>"

## Всё вместе

In [139]:
# fileDesc
title = assemble_title_stmt(sample_dict)
publication = assemble_publication_stmt(sample_dict)
source = assemble_source_desc(sample_dict)

fileDesc = "<fileDesc>{}{}{}</fileDesc>".format(title, publication, source)
revisionDesc = assemble_revision_desc(sample_dict)

XMLHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
overallTEI = "<TEI xmlns=\"http://www.tei-c.org/ns/1.0\" xml:lang=\"rus\">\
<teiHeader>{}{}\
</teiHeader></TEI>".format(fileDesc, revisionDesc)

overallXML = XMLHeader + overallTEI

# MAIN

In [156]:
for row in tqdm(range(len(df))):
    row_dict = df.iloc[row].to_dict()

    photo_id = row_dict["id"]
    title = assemble_title_stmt(row_dict)
    source = assemble_source_desc(row_dict, reg_sizes)
    publication = assemble_publication_stmt(row_dict)

    fileDesc = "<fileDesc>{}{}{}</fileDesc>".format(title, publication, source)
    revisionDesc = assemble_revision_desc(row_dict)

    XMLHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
    overallTEI = "<TEI xmlns=\"http://www.tei-c.org/ns/1.0\" xml:lang=\"rus\"><teiHeader>{}{}</teiHeader></TEI>".format(fileDesc, revisionDesc)
    overallXML = XMLHeader + overallTEI

    soup = BeautifulSoup(overallXML, "lxml")
    with open("./data/tei/{}.xml".format(photo_id), "w", encoding="utf-8") as f:
        f.write(soup.prettify())

100%|██████████| 4077/4077 [00:07<00:00, 533.88it/s]
