In [1]:
!pip install -U sqlalchemy



In [2]:
from sqlalchemy import Column
from sqlalchemy import ForeignKey
from sqlalchemy import Integer
from sqlalchemy import String
from sqlalchemy import JSON
from sqlalchemy import Date
from sqlalchemy.orm import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from pymarc import MARCReader
import zipfile
import json
from datetime import datetime

# SQLite database file, addressed relative to the notebook's working directory.
DATABASE_URL = 'sqlite:///record_data.db'
engine = create_engine(DATABASE_URL)

# Declarative base that the ORM model classes in this notebook inherit from.
Base = declarative_base()

class Record(Base):
    """ORM model for the "record" table: a handful of fields from one MARC record.

    Only ``id`` is required. All other columns rely on SQLAlchemy's default of
    ``nullable=True`` for non-primary-key columns, so any of them may be NULL.
    """

    __tablename__ = "record"

    id = Column(Integer, primary_key=True)
    title = Column(String)

    # MARC 998 subfields
    location = Column(String)    # 998 subfield a - "Location"
    cat_date = Column(Date)      # 998 subfield b - "Cat. Date"
    bib_level = Column(String)   # 998 subfield c - "Bib Level"
    mat_type = Column(String)    # 998 subfield d - "Format (MatType)"

    # MARC 907 subfields
    create_date = Column(Date)   # 907 subfield c - "Created"
    last_update = Column(Date)   # 907 subfield b - "Last Updated"

    # Presumably meant to be derived from (record['773']['a'], record['773']['g'])
    # - TODO confirm the intended source and format.
    pub_date = Column(Date)
    citation = Column(String)

    # Disabled for now:
    # record_json = Column(JSON)
    # subjects = relationship(
    #     "Subject", back_populates="record", cascade="all, delete-orphan"
    # )

# class Subject(Base):
#     __tablename__ = "subject"

#     id = Column(Integer, primary_key=True)
#     value = Column(String, nullable=False)
#     occ_num = Column(Integer)
#     record_id = Column(Integer, ForeignKey("record.id"), nullable=False)

#     record = relationship("Record", back_populates="subjects")

#     # def __repr__(self):
#     #     return f"Address(id={self.id!r}, email_address={self.email_address!r})"


# Create the schema: emit CREATE TABLE for every mapped table that is not
# already present in the database behind `engine`.
Base.metadata.create_all(bind=engine)

In [3]:
# --- Import configuration -------------------------------------------------
MARC_ARCHIVE = '2023-01-24_newsdex-full-marc-export.zip'
MARC_DATE_FORMAT = '%m-%d-%y'   # e.g. "03-06-20"
BATCH_SIZE = 10000              # commit (and report progress) every N records


def _subfield(marc_record, tag, code):
    """Return the value of subfield `code` in field `tag`, or None if unavailable."""
    try:
        return marc_record[tag][code]
    except (KeyError, TypeError, AttributeError):
        # A missing field or subfield surfaces either as a KeyError or as a
        # None that cannot be indexed further (TypeError); which one depends
        # on the installed pymarc version.
        return None


def _marc_date(marc_record, tag, code):
    """Return subfield `tag`$`code` parsed as a `date`, or None if missing/malformed."""
    raw = _subfield(marc_record, tag, code)
    if raw is None:
        return None
    try:
        # The columns are declared as Date, so store a date rather than a datetime.
        return datetime.strptime(raw, MARC_DATE_FORMAT).date()
    except (ValueError, TypeError):
        return None


def _title(marc_record):
    """Return the record's title, or None if it cannot be determined."""
    try:
        title = marc_record.title
        # NOTE(review): older pymarc releases expose title() as a method while
        # newer ones appear to expose it as a property - handle both.
        return title() if callable(title) else title
    except (KeyError, TypeError, AttributeError):
        return None


# The loop deliberately runs at cell (module) level: later cells inspect the
# `record` variable that it leaves bound to the last record read.
with Session(engine) as session:
    with zipfile.ZipFile(MARC_ARCHIVE) as marc_data_file:
        # The archive is expected to hold a single MARC file; read the first member.
        with marc_data_file.open(marc_data_file.filelist[0]) as fh:
            record_list = []
            loaded = 0
            skipped = 0
            # was getting this error
            # https://github.com/edsu/pymarc/issues/129
            reader = MARCReader(fh, to_unicode=True, force_utf8=True)
            for i, record in enumerate(reader):
                if record is None:
                    # The reader could not decode this record; do not store an
                    # all-NULL placeholder row for it.
                    skipped += 1
                else:
                    # NOTE: Record.pub_date and Record.citation are not populated yet.
                    record_list.append(
                        Record(
                            title=_title(record),
                            location=_subfield(record, '998', 'a'),      # "Location"
                            cat_date=_marc_date(record, '998', 'b'),     # "Cat. Date"
                            create_date=_marc_date(record, '907', 'c'),  # "Created"
                            last_update=_marc_date(record, '907', 'b'),  # "Last Updated"
                            bib_level=_subfield(record, '998', 'c'),     # "Bib Level"
                            mat_type=_subfield(record, '998', 'd'),      # "Format (MatType)"
                        )
                    )
                    loaded += 1

                # Commit in batches so pending objects do not pile up in memory.
                if i % BATCH_SIZE == 0:
                    session.add_all(record_list)
                    session.commit()
                    record_list = []
                    print(i, '.', end='')

            # Flush whatever is left over from the last partial batch.
            session.add_all(record_list)
            session.commit()
            # Report counts rather than the loop index, which is undefined when
            # the archive yields no records at all.
            print(f' done: {loaded} loaded, {skipped} unreadable skipped')

0 .10000 .20000 .30000 .40000 .50000 .60000 .70000 .80000 .90000 .100000 .110000 .120000 .130000 .140000 .150000 .160000 .170000 .180000 .190000 .200000 .210000 .220000 .230000 .240000 .250000 .260000 .270000 .280000 .290000 .300000 .310000 .320000 .330000 .340000 .350000 .360000 .370000 .380000 .390000 .400000 .410000 .420000 .430000 .440000 .450000 .460000 .470000 .480000 .490000 .500000 .510000 .520000 .530000 .540000 .550000 .560000 .570000 .580000 .590000 .600000 .610000 .620000 .630000 .640000 .650000 .660000 .670000 .680000 .690000 .700000 .710000 .720000 .730000 .740000 .750000 .760000 .770000 .780000 .790000 .800000 .810000 .820000 .830000 .840000 .850000 .860000 .870000 .880000 .890000 .900000 .910000 .920000 .930000 .940000 .950000 .960000 .970000 .980000 .990000 .1000000 .1010000 .1020000 .1030000 .1040000 .1050000 .1060000 .1070000 .1080000 .1090000 .1100000 .1110000 .1120000 .1130000 .1140000 .1150000 .1160000 .1170000 .1180000 .1190000 .1200000 .1210000 .1220000 .1230000

In [4]:
# Dump every field of `record` (the last record left bound by the import
# loop) as "<tag> <value>", one field per line.
for field in record.fields:
    print(f"{field.tag} {field.value()}")

008                                         
245 Public library staff: Instructive talks heard and a social hour enjoyed by employees
260 1900
650 Public Library
650 Public Library, Personnel
650 Hodges, Nathaniel Dana Carlisle
773 Commercial Tribune 10/02/1900 6:6
907 .b28911532 03-06-20 03-06-20
998 2ma 03-06-20 m - - eng 0


In [5]:
# Inspect the 907 subfield c ("Created") of `record`, the last record left
# bound by the import loop; the lookup yields a list of values for that code.
record['907'].subfields_as_dict()['c']

['03-06-20']

In [6]:
# [subfield for subfield in record['773'].get_subfields]

In [7]:
# with Session(engine) as session:
#     with zipfile.ZipFile('2023-01-24_newsdex-full-marc-export.zip') as marc_data_file:
#         # should only contain one file ...
#         print(marc_data_file.filelist[0].filename)
#         # get a file handle for the marc data file
#         with marc_data_file.open(marc_data_file.filelist[0]) as fh:
#             reader = MARCReader(fh)
#             for i, record in enumerate(reader):
#                 subjects = []
#                 for j, s in enumerate([subject.value() for subject in record.subjects()]):
#                     subjects.append(Subject(value=s, occ_num=j))
#                 r = Record(
#                     title=record.title(),
#                     record_json=record.as_json(),
#                     subjects=subjects
#                 )

#                 session.add(r)
#                 if (i % 100000 == 0 ):
#                     print(f'{i}.', end='')
#                     session.commit()

#                 # print(i, record['001'].value(), record.title(), [subject.value() for subject in record.subjects()])
#                 # if i>=100:
#                 #     break
#     session.commit()