# Exploring the Papers Past archive

In this notebook I explore the Papers Past newspaper archive data.

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Let pandas render up to 100 rows before it truncates a displayed frame.
pd.options.display.max_rows = 100

In [2]:
metadata = pd.read_csv('../data/NLNZ_newspaperData.csv')
# downloadSize is read as text containing ',' separators. Strip them as a literal
# (regex=False) and cast to int64: int32 overflows at 2 GiB, which is too tight
# a ceiling for a byte count.
metadata['downloadSize'] = (
    metadata.downloadSize
    .str.replace(',', '', regex=False)
    .astype(np.int64)
)
# Size in bytes of each archive as it currently sits under ../data/raw/.
metadata['filesize'] = ('../data/raw/' + metadata.packageName).apply(os.path.getsize)
# Advertised size divided by size on disk. A value well above 1 means the local
# file is smaller than advertised (possibly a truncated download - TODO confirm).
metadata['size_ratio'] = metadata.downloadSize / metadata.filesize

In [3]:
# Show the metadata table, including the derived filesize and size_ratio columns.
metadata

Unnamed: 0,packageName,title,year,region,issues,pages,downloadSize,link,filesize,size_ratio
0,ALG_1862.tar.gz,Albertland Gazette,1862,Auckland,2,8,230632,https://paperspast.natlib.govt.nz/opendata/tit...,230653,0.999909
1,ALG_1863.tar.gz,Albertland Gazette,1863,Auckland,5,20,869397,https://paperspast.natlib.govt.nz/opendata/tit...,869413,0.999982
2,ALG_1864.tar.gz,Albertland Gazette,1864,Auckland,4,16,670881,https://paperspast.natlib.govt.nz/opendata/tit...,670901,0.999970
3,AG_1879.tar.gz,Ashburton Guardian,1879,Canterbury,36,214,29531453,https://paperspast.natlib.govt.nz/opendata/tit...,29531469,0.999999
4,AG_1880.tar.gz,Ashburton Guardian,1880,Canterbury,188,763,132103901,https://paperspast.natlib.govt.nz/opendata/tit...,132103917,1.000000
...,...,...,...,...,...,...,...,...,...,...
1651,WOODEX_1895.tar.gz,Woodville Examiner,1895,Manawatu-Wanganui,150,600,93508924,https://paperspast.natlib.govt.nz/opendata/tit...,93508943,1.000000
1652,WOODEX_1896.tar.gz,Woodville Examiner,1896,Manawatu-Wanganui,151,605,94384096,https://paperspast.natlib.govt.nz/opendata/tit...,94384112,1.000000
1653,WOODEX_1897.tar.gz,Woodville Examiner,1897,Manawatu-Wanganui,139,560,82942874,https://paperspast.natlib.govt.nz/opendata/tit...,82942891,1.000000
1654,WOODEX_1898.tar.gz,Woodville Examiner,1898,Manawatu-Wanganui,152,615,85277935,https://paperspast.natlib.govt.nz/opendata/tit...,85277954,1.000000


In [4]:
# Largest size_ratio first: archives whose local file is smallest relative to
# the advertised download size come out on top.
metadata.sort_values(by='size_ratio', ascending=False)

Unnamed: 0,packageName,title,year,region,issues,pages,downloadSize,link,filesize,size_ratio
607,LT_1891.tar.gz,Lyttelton Times,1891,Canterbury,311,2488,531014362,https://paperspast.natlib.govt.nz/opendata/tit...,196280320,2.705388
606,LT_1890.tar.gz,Lyttelton Times,1890,Canterbury,311,2488,539441499,https://paperspast.natlib.govt.nz/opendata/tit...,440270848,1.225249
608,LT_1892.tar.gz,Lyttelton Times,1892,Canterbury,313,2504,516499883,https://paperspast.natlib.govt.nz/opendata/tit...,516499883,1.000000
609,LT_1893.tar.gz,Lyttelton Times,1893,Canterbury,310,2480,497683348,https://paperspast.natlib.govt.nz/opendata/tit...,497683348,1.000000
940,ODT_1884.tar.gz,Otago Daily Times,1884,Otago,311,1373,485122619,https://paperspast.natlib.govt.nz/opendata/tit...,485122627,1.000000
...,...,...,...,...,...,...,...,...,...,...
349,GBARG_1889.tar.gz,Golden Bay Argus,1889,Nelson,1,4,155154,https://paperspast.natlib.govt.nz/opendata/tit...,155175,0.999865
350,GBARG_1890.tar.gz,Golden Bay Argus,1890,Nelson,4,17,640284,https://paperspast.natlib.govt.nz/opendata/tit...,640377,0.999855
731,MTBM_1895.tar.gz,Mt Benger Mail,1895,Otago,1,4,120111,https://paperspast.natlib.govt.nz/opendata/tit...,120130,0.999842
344,GBARG_1884.tar.gz,Golden Bay Argus,1884,Nelson,4,16,501376,https://paperspast.natlib.govt.nz/opendata/tit...,501472,0.999809


In [5]:
%%time
xml_files = []
for root, dirs, files in os.walk('../data/unzipped'):
    for f in files:
        fp = os.path.join(root, f)
        if f.endswith('.xml') and not f.endswith("mets.xml"):
            xml_files.append(fp)

CPU times: user 9.32 s, sys: 3.58 s, total: 12.9 s
Wall time: 13 s


In [6]:
# Report how many xml paths the directory walk collected.
print(f'Found {len(xml_files)} xml files in the papers past data')

Found 1 xml files in the papers past data


In [7]:
# Peek at the first ten collected paths (fewer if the list is shorter).
xml_files[:10]

['../data/unzipped/ODT/1898/ODT_18981020/MM_01/0005.xml']

In [8]:
def build_corpus_data(filelist, root='../data/unzipped'):
    """Tabulate which newspaper and archive each file path belongs to.

    The first directory level below ``root`` is taken as the newspaper id, and
    the first two levels joined by ``_`` as the archive name, e.g.
    ``../data/unzipped/ODT/1898/...`` gives ``ODT`` and ``ODT_1898``.

    Parameters
    ----------
    filelist : list of str
        File paths containing ``root`` followed by at least two directory levels.
    root : str, optional
        Directory prefix that precedes the newspaper directory in each path.
        Defaults to ``'../data/unzipped'``.

    Returns
    -------
    pandas.DataFrame
        One row per input path with columns ``filepath``, ``newspaper_id`` and
        ``archive_name``. Paths in which the prefix is not found get NaN in the
        two derived columns.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    # Escape the prefix so the dots in '../' match literally instead of acting
    # as regex wildcards.
    prefix = re.escape(root.rstrip('/')) + '/'

    corpus_data = pd.DataFrame({'filepath': filelist})
    paths = corpus_data.filepath
    corpus_data['newspaper_id'] = paths.str.extract(prefix + '([^/]+)', expand=False)
    corpus_data['archive_name'] = (
        paths
        .str.extract(prefix + '([^/]+/[^/]+)', expand=False)
        # Literal replacement; do not depend on the pandas-version-specific
        # default of `regex`.
        .str.replace('/', '_', regex=False)
    )
    return corpus_data

In [9]:
# Derive newspaper_id and archive_name for every collected xml path.
xml_data = build_corpus_data(xml_files)

In [10]:
# Show the xml paths together with their derived newspaper and archive labels.
xml_data

Unnamed: 0,filepath,newspaper_id,archive_name
0,../data/unzipped/ODT/1898/ODT_18981020/MM_01/0...,ODT,ODT_1898


In [11]:
# One hidden '.<archive_name>.done' path per archive that contributed an xml file.
# NOTE(review): presumably these markers flag an archive as already unzipped, so
# deleting them would force a re-extraction - confirm against the unzip step.
archives = [
    f'../data/unzipped/.{archive}.done'
    for archive in xml_data.archive_name.unique()
]
# Delete whichever of those marker files currently exist.
for marker in filter(os.path.exists, archives):
    os.remove(marker)

In [12]:
# List the marker paths that were checked (and removed where present) above.
archives

['../data/unzipped/.ODT_1898.done']

In [13]:
# Count the non-null entries of every remaining column for each newspaper.
xml_data.groupby(by='newspaper_id').count()

Unnamed: 0_level_0,filepath,archive_name
newspaper_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ODT,1,1


In [15]:
%%time
text_files = []
for root, dirs, files in os.walk('../data/unzipped'):
    for f in files:
        fp = os.path.join(root, f)
        if f.endswith('.txt') and not f.endswith('README.txt'):
            text_files.append(fp)

CPU times: user 9.81 s, sys: 3.53 s, total: 13.3 s
Wall time: 13.4 s


In [16]:
# Report how many text paths the directory walk collected.
print(f'Found {len(text_files)} text files in the papers past data')

Found 1466407 text files in the papers past data


In [19]:
# Derive newspaper_id and archive_name for every collected text path.
corpus_data = build_corpus_data(text_files)

In [21]:
# Show the text paths together with their derived newspaper and archive labels.
corpus_data

Unnamed: 0,filepath,newspaper_id,archive_name
0,../data/unzipped/NOT/1887/NOT_18870317/MM_01/0...,NOT,NOT_1887
1,../data/unzipped/NOT/1887/NOT_18870317/MM_01/0...,NOT,NOT_1887
2,../data/unzipped/NOT/1887/NOT_18870317/MM_01/0...,NOT,NOT_1887
3,../data/unzipped/NOT/1887/NOT_18870317/MM_01/0...,NOT,NOT_1887
4,../data/unzipped/NOT/1887/NOT_18871217/MM_01/0...,NOT,NOT_1887
...,...,...,...
1466402,../data/unzipped/GBARG/1897/GBARG_18970610/MM_...,GBARG,GBARG_1897
1466403,../data/unzipped/GBARG/1897/GBARG_18970610/MM_...,GBARG,GBARG_1897
1466404,../data/unzipped/GBARG/1897/GBARG_18970610/MM_...,GBARG,GBARG_1897
1466405,../data/unzipped/GBARG/1897/GBARG_18970610/MM_...,GBARG,GBARG_1897


In [23]:
# Number of text files per newspaper: count the non-null archive_name entries
# within each newspaper_id group.
corpus_data.groupby('newspaper_id')['archive_name'].count()

newspaper_id
AG         21728
ALG           44
AS         53661
BA          4365
BH         20169
CHARG         48
CHP        64229
CL         10721
CROMARG     9405
DSC        32496
DTN        20484
DUNST       8058
ESD        40622
FS         14608
GBARG       3274
GLOBE      10304
GRA        38357
HAST        4432
HBH        33343
HBT         6622
HBWT         548
HLC          420
HNS        17298
IT         14484
KUMAT      12950
LCP         5876
LT         78291
LTCBG        304
LWM        10487
ME          8670
MEX        31794
MH          7577
MIC        11491
MS          4096
MT          4248
MTBM         294
NA          1914
NEM        42411
NOT        35281
NZABIG       117
NZCPNA       423
NZGWS       1512
NZSCSG      4067
NZTIM      38423
OAM        28418
ODT        60227
OG          2487
OO          1714
OPUNT       2264
OW         70804
PATM       12113
PBH        24633
PGAMA       4221
SCANT      22515
SOCR        5733
ST         30890
TAN         3817
TC         35695
T