In [1]:
# load necessary libraries
import numpy as np
from scipy import ndimage
from scipy import misc
from scipy import stats
from scipy.ndimage.filters import gaussian_filter as gf

import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.patches import Rectangle as Rec

from skimage import filters
from skimage import transform as tf

import csv
import json
import random
import urllib
import cStringIO
# import cv2
from PIL import Image

import sys, os
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import json
from datetime import datetime

from projEdgeBreaks import *
from imageModifiers import *
from saveImages import *
from plottingFuncs import *

# make every figure 15 x 15 inches by default
mpl.rc("figure", figsize=(15, 15))

# DCW (Decoding the Civil War) images

In [2]:
# Load the Decoding-the-Civil-War subject export and narrow it down to the
# ledger pages that are worth splitting into lines.
subjFile = "data/decoding-the-civil-war-subjects-9-29-17.csv"
subj = pd.read_csv(subjFile)
# get only the workflow from the live project; .copy() so that the column
# assignments below write to an independent frame instead of a slice of the
# raw table (avoids pandas' SettingWithCopyWarning)
subj = subj[subj['workflow_id'] == 1874].copy()

# get metadata in dictionary format
subj["meta_json"] = [json.loads(q) for q in subj["metadata"]]
# get hdl_id from metadata ("mssF" is a sentinel for "no hdl_id key")
subj["hdl_id"] = [q.get("hdl_id", "mssF") for q in subj["meta_json"]]
# get image url (the entry stored under key "0" of the locations JSON)
subj["url"] = [json.loads(q).get("0") for q in subj["locations"]]

# remove images without ids
subj = subj[subj["hdl_id"] != "mssF"]

# The alternations below are wrapped in a non-capturing group so that the
# "mssEC_" prefix applies to every alternative.  Without the group,
# "mssEC_3[6-9]|[4-6][0-9]" also matched any id that merely contained two
# digits in 40-69 anywhere (e.g. a page number), dropping unrelated pages.

# remove codebook images (mssEC_36-67)
# NOTE(review): the pattern covers ledgers 36-69 while the comment says
# 36-67 -- confirm which range is intended.
filt = subj["hdl_id"].str.contains(r"mssEC_(?:3[6-9]|[4-6][0-9])")
subj = subj[~filt]
# remove ledgers that seemed weird (only easy stuff for now): 26-33
filt = subj["hdl_id"].str.contains(r"mssEC_(?:3[0-3]|2[6-9])")
subj = subj[~filt]
# remove the first few pages (001-006 of every ledger) because they tended
# to be blank; raw string so that \d is a regex escape, not a string escape
filt = subj["hdl_id"].str.contains(r"mssEC_\d\d_00[1-6]")
subj = subj[~filt]

In [3]:
# draw 50 row positions at random; the fixed seed keeps the draw reproducible
# (random.sample raises ValueError if fewer than 50 rows are left)
random.seed(53)
picLocs = random.sample(list(range(len(subj))), 50)
# sort the positions so the sample keeps the table's row order
picLocs = np.sort(picLocs)
subjSamp = subj.iloc[picLocs]

In [5]:
# save images locally so I can take a quick peek
# The target folder is kept in `imgDir` rather than `tf`: `tf` is the alias
# of skimage.transform from the import cell, and rebinding it to a string
# would break any later use of that module in this kernel.
imgDir = "DCW_images/"
for i in range(len(subjSamp)):
    fn1 = subjSamp["url"].iloc[i]         # source url of the image
    fn2 = subjSamp["subject_id"].iloc[i]  # subject id, passed as third arg
    readAndSave(fn1, imgDir, fn2)

In [6]:
# create manifest files
# every output of this cell (both manifests and whatever saveLinesdc writes)
# goes under this folder
outDir = "DCW_slope_splits1/"
# open(..., "w") below raises if the folder is missing (e.g. on a fresh
# checkout), so make sure it exists first
if not os.path.isdir(outDir):
    os.makedirs(outDir)

# manifest 1 header: every column of the sampled subject table + two extras
cols = list(subjSamp.keys())
cols.append("row_loc")
cols.append("imgLoc")

manifest1 = outDir + "myManifest.csv"
with open(manifest1, "w") as f:
    writeit = csv.writer(f)
    writeit.writerow(cols)

# manifest 2 header: reduced column set
manifest2 = outDir + "manifest.csv"
cols = ["subject_id_orig", "hdl_id", "origin", "telegrams", "row_loc", 
        "img_loc"]
with open(manifest2, "w") as f:
    writeit = csv.writer(f)
    writeit.writerow(cols)

# create all images
# Only the header rows are written above; the data rows are presumably
# appended by saveLinesdc, which receives both manifest paths -- TODO confirm.
for i in range(len(subjSamp)):
    fname = subjSamp["url"].iloc[i]

    # read in and do all pre-processing #################################
    let, grey = readImg(fname, plotIt=False)
    let, grey = removeEdges(let, grey, rmThresh=0)
    grey = whitenEdgesFilter(grey)
    # NOTE(review): greyBi is not used anywhere else in this cell; drop this
    # call if binarizeImg has no side effects.
    greyBi = binarizeImg(grey, biThresh="otsu", plotIt=False)
    greySm = smoothImg(grey, smoothSigma=5.0, plotIt=False)

    # get linebreaks ####################################################
    # breaks are computed from the smoothed grey image along "y"
    lb = projBreaks(greySm, "y")
    # plotBoxes(let, lb)
    # lines = 4: presumably the number of text lines per saved crop --
    # TODO confirm against saveLinesdc
    saveLinesdc(img=let, lb=lb, rw=subjSamp.iloc[i],
                folder=outDir, manifest1 = manifest1,
                manifest2 = manifest2, prefix = subjSamp["hdl_id"].iloc[i],
                lines = 4)

# BPL (anti-slavery manuscripts) images

In [7]:
# read the anti-slavery-manuscripts subject export
subjFile = "data/anti-slavery-manuscripts-subjects.csv"
subj = pd.read_csv(subjFile)

# parse each row's metadata JSON string into a dictionary
subj["meta_json"] = list(map(json.loads, subj["metadata"]))

# the image url is the entry stored under key "0" of the locations JSON
subj["url"] = [q.get("0") for q in map(json.loads, subj["locations"])]

In [8]:
# pick 50 row positions at random (seeded, so re-running gives the same
# sample); random.sample raises ValueError if the table has fewer than 50 rows
random.seed(53)
picLocs = random.sample(list(range(len(subj))), 50)
# keep the sampled rows in their original table order
picLocs = np.sort(picLocs)
subjSamp = subj.iloc[picLocs]

In [9]:
# save images locally so I can take a quick peek
# The target folder is kept in `imgDir` rather than `tf`: `tf` is the alias
# of skimage.transform from the import cell, and rebinding it to a string
# would break any later use of that module in this kernel.
imgDir = "BPL_images/"
for i in range(len(subjSamp)):
    fn1 = subjSamp["url"].iloc[i]         # source url of the image
    fn2 = subjSamp["subject_id"].iloc[i]  # subject id, passed as third arg
    readAndSave(fn1, imgDir, fn2)

In [10]:
# create manifest files
# every output of this cell (both manifests and whatever saveLinesbp writes)
# goes under this folder
outDir = "BPL_slope_splits1/"
# open(..., "w") below raises if the folder is missing (e.g. on a fresh
# checkout), so make sure it exists first
if not os.path.isdir(outDir):
    os.makedirs(outDir)

# manifest 1 header: every column of the sampled subject table + two extras
cols = list(subjSamp.keys())
cols.append("row_loc")
cols.append("imgLoc")

manifest1 = outDir + "myManifest.csv"
with open(manifest1, "w") as f:
    writeit = csv.writer(f)
    writeit.writerow(cols)

# manifest 2 header: reduced column set
manifest2 = outDir + "manifest.csv"
cols = ["subject_id_orig", "creator", "row_loc", "img_loc"]
with open(manifest2, "w") as f:
    writeit = csv.writer(f)
    writeit.writerow(cols)

# create all images
# Only the header rows are written above; the data rows are presumably
# appended by saveLinesbp, which receives both manifest paths -- TODO confirm.
for i in range(len(subjSamp)):
    fname = subjSamp["url"].iloc[i]

    # read in and do all pre-processing #################################
    let, grey = readImg(fname, plotIt=False)
    let, grey = removeEdges(let, grey, rmThresh=0)
    grey = whitenEdgesFilter(grey)
    # NOTE(review): greyBi is not used anywhere else in this cell; drop this
    # call if binarizeImg has no side effects.
    greyBi = binarizeImg(grey, biThresh="otsu", plotIt=False)
    greySm = smoothImg(grey, smoothSigma=7.0, plotIt=False)

    # get linebreaks ####################################################
    # breaks are computed from the smoothed grey image along "y"
    lb = projBreaks(greySm, "y")
#     plotBoxes(let, lb)
    # the file prefix is the subject id with a leading "a";
    # lines = 4: presumably the number of text lines per saved crop --
    # TODO confirm against saveLinesbp
    saveLinesbp(img=let, lb=lb, rw=subjSamp.iloc[i],
                folder=outDir, manifest1 = manifest1,
                manifest2 = manifest2,
                prefix = "a" + str(subjSamp["subject_id"].iloc[i]),
                lines = 4)