# Sentence tokenisation

The aim of this notebook is to construct a crude sentence tokenizer to apply to the Papers Past documents.

In [1]:
import os
import numpy as np
import pandas as pd
%matplotlib inline

from utils import multicore_apply

In [2]:
%%time
# Collect the path of every .txt file under the unzipped data directory,
# leaving out any file whose name ends with "README.txt".
txt_files = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk('../data/unzipped')
    for filename in filenames
    if filename.endswith('.txt') and not filename.endswith("README.txt")
]

CPU times: user 9.4 s, sys: 3.51 s, total: 12.9 s
Wall time: 12.9 s


In [3]:
def build_corpus_data(filelist):
    """Build a metadata table with one row per file path.

    Parameters
    ----------
    filelist : list of str
        File paths containing ``../data/unzipped/<newspaper>/<year>/...``.

    Returns
    -------
    pandas.DataFrame
        Columns:
        ``filepath``     - the path as given;
        ``newspaper_id`` - first path component after ``../data/unzipped/``;
        ``archive_name`` - first two path components after
                           ``../data/unzipped/`` joined with ``_``.
        Rows whose path does not match the pattern get NaN in the derived
        columns.
    """
    corpus_data = pd.DataFrame({'filepath': filelist})
    # expand=False on both extractions so each assignment receives a Series
    # (the default expand=True returns a one-column DataFrame instead).
    corpus_data['newspaper_id'] = (corpus_data
        .filepath
        .str.extract('../data/unzipped/([^/]+)', expand = False)
    )
    corpus_data['archive_name'] = (corpus_data
        .filepath
        .str.extract('../data/unzipped/([^/]+/[^/]+)', expand = False)
        # Literal replacement: do not depend on the pandas-version-specific
        # default of the `regex` argument.
        .str.replace('/', '_', regex = False)
    )
    return corpus_data

In [4]:
# Build the per-file metadata table (filepath, newspaper_id, archive_name)
# from the .txt paths collected above.
corpus_data = build_corpus_data(txt_files)

In [5]:
# Preview the first rows of the metadata table.
corpus_data.head()

Unnamed: 0,filepath,newspaper_id,archive_name
0,../data/unzipped/NOT/1887/NOT_18870317/MM_01/0...,NOT,NOT_1887
1,../data/unzipped/NOT/1887/NOT_18870317/MM_01/0...,NOT,NOT_1887
2,../data/unzipped/NOT/1887/NOT_18870317/MM_01/0...,NOT,NOT_1887
3,../data/unzipped/NOT/1887/NOT_18870317/MM_01/0...,NOT,NOT_1887
4,../data/unzipped/NOT/1887/NOT_18871217/MM_01/0...,NOT,NOT_1887


In [6]:
def read_file(filepath):
    """Return the full text content of the file at ``filepath``.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the locale of the machine running the notebook (the corpus contains
    non-ASCII characters).

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()

# Work on a random subset of (at most) 1000 files.
# - random_state pins the sample so re-running the notebook gives the same rows;
# - min(...) keeps the cell from raising when the corpus has fewer than 1000 files;
# - .copy() makes the subset independent of corpus_data before columns are added.
sample_data = corpus_data.sample(n=min(1000, len(corpus_data)), random_state=42).copy()

# Read each sampled file's content into a new 'text' column.
# NOTE(review): multicore_apply comes from utils; presumably it applies
# read_file to every path in parallel and preserves order - confirm.
sample_data['text'] = multicore_apply(sample_data.filepath, read_file)

100%|██████████| 1.00k/1.00k [00:00<00:00, 3.86kit/s]
1000it [00:00, 481218.91it/s]


In [10]:
! pip3 install nltk pytest

Collecting pytest
  Downloading pytest-6.2.3-py3-none-any.whl (280 kB)
[K     |████████████████████████████████| 280 kB 25.0 MB/s eta 0:00:01
Collecting py>=1.8.2
  Downloading py-1.10.0-py2.py3-none-any.whl (97 kB)
[K     |████████████████████████████████| 97 kB 6.3 MB/s  eta 0:00:011
Collecting pluggy<1.0.0a1,>=0.12
  Downloading pluggy-0.13.1-py2.py3-none-any.whl (18 kB)
Collecting iniconfig
  Downloading iniconfig-1.1.1-py2.py3-none-any.whl (5.0 kB)
Collecting toml
  Downloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: toml, py, pluggy, iniconfig, pytest
Successfully installed iniconfig-1.1.1 pluggy-0.13.1 py-1.10.0 pytest-6.2.3 toml-0.10.2


In [11]:
# Download NLTK's 'punkt' tokenizer package via the nltk.downloader module.
# NOTE(review): presumably needed by sent_tokenize, which is imported in a later cell.
! python3 -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
from nltk.tokenize import sent_tokenize

In [15]:
# Run NLTK's sent_tokenize over every document text and keep the result in a
# new 'sentence' column.
# NOTE(review): presumably each entry is the list of sentences for that
# document (a later cell takes len() of each entry) - confirm against multicore_apply.
sample_data['sentence'] = multicore_apply(sample_data['text'], sent_tokenize)

100%|██████████| 1.00k/1.00k [00:01<00:00, 665it/s]
1000it [00:00, 436542.88it/s]


In [18]:
# Number of entries in each document's 'sentence' value.
sample_data['num_sentences'] = sample_data['sentence'].map(len)

In [69]:
import random

# Pick one document at random and print its sentences separated by blank
# lines, with the line breaks inside each sentence turned into spaces.
chosen_sentences = random.choice(sample_data.sentence.values)
single_line_sentences = [sentence.replace('\n', ' ') for sentence in chosen_sentences]
print('\n\n'.join(single_line_sentences))

SHIPPING INTELLIGENCE.

High.Watbr at Nbtsok.

a,m: p.m. Thunday, June 1, 858 920 Friday, June 2, 941 10-1 Saturday, June 3, 10-23 10-44 Sunday, June 4, 114 1123 ENTBBHD INWAEDS.

May 31— Kennedy, es, 136, Williams, from West Coast.

Passengers—Mieß Lange, Mrs Carter, Judge Broad, Dr Collies, Messrs Hodgson, Larebin, M'G-arrity, Thotnaß, For man, Rankin, (LJoates, Fraeer, and Burton's Circus Company.

31—Maid oi Italy, cutter, 15, Clarke, from Riwaka.

31—Planet, cutter, 12, Thorne, from Mo tueka.

* CLEABED OUTWAEDS.

May 31—Murray, ss, 78, Scotf, for Wan ganui.

Passengers—Miaa Barnicoat, Messrs Max, Anderson, and White.

31—Wallace, oe, BS, Bruce, for West Coast.

Passengers—Miss Sharpj and Mrs Parkhill and child.

The Wanaka, with the English mails, will arrive early this momicg, having left Wol lineton at 6 o'clook last night.

She leave* at 4-30 p.m for Picton and Sout;h, taking Mel bourne passengers and cargo to ss Te Aimu.

The Wallace left for West Coast Ports at *J o'clook la

In [47]:
# Imported here so this cell runs on a fresh kernel; neither name is
# brought into scope by any earlier cell.
import glob
import re

# These targets don't work
exceptions=['ODT_1898', 'LT_1890', 'LT_1891']

# Pattern that matches any path mentioning one of the excluded archive names.
# NOTE(review): `is_exception` was not defined anywhere in the notebook; it is
# reconstructed here from `exceptions` - confirm this matches the original intent.
is_exception = re.compile('|'.join(re.escape(name) for name in exceptions))

# Every raw .tar.gz archive except the excluded ones.
archives = [f for f in glob.glob('../data/raw/*.tar.gz') if not is_exception.search(f)]

In [None]:
#!/usr/bin/python
import tarfile,os
import sys

# For every gzipped tar archive named in dean.txt (one path per line), print
# the archive name and then every line, in any member file, that contains all
# of 'alter ', ' system ' and 'kill' (case-insensitively).
#
# Ported to Python 3: print() calls instead of print statements, and member
# content (bytes) is decoded before the substring tests.
with open('dean.txt', 'r') as list_of_tar:
    for tar_name in list_of_tar:
        tar_name = tar_name.rstrip()
        if not tar_name:
            # A blank line in the list is not an archive path.
            continue
        print(tar_name)
        # The context manager closes each archive, not only the last one.
        with tarfile.open(tar_name,'r:gz') as tar:
            for member in tar.getnames():
                file_name=tar.extractfile(member)
                if file_name is None:
                    # extractfile() returns None for members that are not
                    # regular files or links (e.g. directories).
                    continue
                for line in file_name:
                    # Undecodable bytes are replaced rather than aborting the scan.
                    line = line.decode('utf-8', errors='replace').lower()
                    if 'alter ' in line and ' system ' in line and 'kill' in line:
                        print(tar_name, member, line)

In [70]:
import sys
import tarfile

In [83]:
# Open one raw archive for inspection and list the names of its members.
# The handle is kept in `tar` because later cells query it.
tar = tarfile.open(name='../data/raw/AG_1880.tar.gz', mode='r:gz')

members = [member_name for member_name in tar.getnames()]

In [80]:
?tar.getnames

[0;31mSignature:[0m [0mtar[0m[0;34m.[0m[0mgetnames[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return the members of the archive as a list of their names. It has
the same order as the list returned by getmembers().
[0;31mFile:[0m      /usr/lib/python3.6/tarfile.py
[0;31mType:[0m      method


In [85]:
?tar.getmembers

[0;31mSignature:[0m [0mtar[0m[0;34m.[0m[0mgetmembers[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return the members of the archive as a list of TarInfo objects. The
list has the same order as the members in the archive.
[0;31mFile:[0m      /usr/lib/python3.6/tarfile.py
[0;31mType:[0m      method


In [88]:
# Among the first 100 member names, keep those ending in '.xml' but not in
# 'mets.xml', and show at most the first 10 of them.
[m for m in members[:100] if m.endswith('.xml') and not m.endswith('mets.xml')][:10]

['AG/1880/AG_18800101/MM_01/0001.xml',
 'AG/1880/AG_18800101/MM_01/0002.xml',
 'AG/1880/AG_18800101/MM_01/0003.xml',
 'AG/1880/AG_18800101/MM_01/0004.xml',
 'AG/1880/AG_18800101/MM_01/0005.xml',
 'AG/1880/AG_18800101/MM_01/0006.xml',
 'AG/1880/AG_18800103/MM_01/0001.xml',
 'AG/1880/AG_18800103/MM_01/0002.xml',
 'AG/1880/AG_18800103/MM_01/0003.xml',
 'AG/1880/AG_18800103/MM_01/0004.xml']

In [2]:
import re

def split_date_body(data):
    """Split ``data`` at the first ``Date, Body...`` occurrence.

    Looks for the literal word ``Date`` followed by a comma, optional
    whitespace and text starting with ``Body`` (up to the end of that line),
    and returns the two captured parts as a ``(date, body)`` tuple.

    Raises ``IndexError`` when ``data`` contains no such occurrence.
    """
    first_match = re.findall(r'(Date),\s*(Body.*)', data)[0]
    date_part, body_part = first_match
    return date_part, body_part

def get_newspaper_date(path):
    """Return the ``<LETTERS>_<digits>`` path component that directly
    precedes ``/MM_01`` in ``path`` (first occurrence).

    Raises ``IndexError`` when ``path`` contains no such component.
    """
    matches = re.findall(r'([A-Z]+_[0-9]+)/MM_01', path)
    return matches[0]

def get_article(path):
    """Return the digits of the ``<digits>.xml`` file name that directly
    follows ``MM_01/`` in ``path`` (first occurrence), as a string.

    Raises ``IndexError`` when ``path`` contains no such file name.
    """
    matches = re.findall(r'MM_01/([0-9]+)\.xml', path)
    return matches[0]



In [None]:
import os
import re
import tarfile
from tqdm import tqdm
import itertools as it
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed

# Detect if filepath corresponds to a newspaper article, i.e. its name ends
# in "<digits>.xml". The dot is escaped and the pattern is anchored at the end
# so that names such as ".../1880/xmlstuff" or "0001.xml.bak" are not matched.
is_newspaper_xml = re.compile(r'[0-9]+\.xml$')


# Archive name = file name of the path, cut at its first dot
def get_basename(fp):
    """Return the final path component of ``fp`` up to (not including) its
    first ``.``; the whole component when it contains no dot."""
    stem, _dot, _rest = os.path.basename(fp).partition('.')
    return stem


def list_tar(tar):
    '''
    Returns a list of all articles under the given `tar`,
    which is the path of a tar.gz file.

    A member counts as an article when its name matches the
    module-level `is_newspaper_xml` pattern. Member order is
    the order reported by the archive.
    '''
    # The context manager closes the archive handle once the names are read.
    with tarfile.open(tar, 'r:gz') as archive:
        return [member for member in archive.getnames()
                if is_newspaper_xml.search(member)]


def list_articles(tar_files):
    '''
    Returns a single flat list of all the articles found in
    the tar.gz files listed in `tar_files`.

    `list_tar` is run on every entry of `tar_files` through
    `multicore_apply`, and the per-archive results are
    concatenated.
    '''
    # NOTE(review): assumes multicore_apply returns one list per archive,
    # in an iterable - confirm against utils.multicore_apply.
    per_archive = multicore_apply(tar_files, list_tar)
    return list(it.chain.from_iterable(per_archive))


def get_newspaper_date(path):
    """Return the ``<LETTERS>_<digits>`` path component that directly
    precedes ``/MM_01`` in ``path`` (first occurrence).

    Raises ``IndexError`` when ``path`` contains no such component.
    """
    matches = re.findall(r'([A-Z]+_[0-9]+)/MM_01', path)
    return matches[0]


def get_article(path):
    """Return the digits of the ``<digits>.xml`` file name that directly
    follows ``MM_01/`` in ``path`` (first occurrence), as a string.

    Raises ``IndexError`` when ``path`` contains no such file name.
    """
    matches = re.findall(r'MM_01/([0-9]+)\.xml', path)
    return matches[0]


NameError: name 'cpu_count' is not defined