In [1]:
# load necessary libraries
import numpy as np
from scipy import ndimage
from scipy import misc
from scipy import stats
from scipy.ndimage.filters import gaussian_filter as gf

import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.patches import Rectangle as Rec

from skimage import filters
from skimage import transform as tf

import csv
import json
import random
import urllib
from PIL import Image

import sys, os
import pandas as pd
import pickle
from datetime import datetime
import glob

from projEdgeBreaks import *
from imageModifiers import *
from saveImages import *
from plottingFuncs import *

# change pandas parameter
pd.options.mode.chained_assignment = None
# change plot size
mpl.rcParams["figure.figsize"] = (15, 15)

In [2]:
def gaussBreaks(chunk, nu=3.5, biThresh=2, shear=0.6, fix=15, plotIt=False):
    # get smoothing factors
    sigYs = np.arange(1, 8, 0.3)
    sigXs = sigYs * nu
    # shear the image
    oldrg = (np.max(chunk) - np.min(chunk))
    newrg = 2
    ch2 = (((chunk - np.min(chunk)) * newrg) / oldrg) - 1
    mytf = tf.AffineTransform(shear=shear)
    chunk = tf.warp(ch2, inverse_map=mytf)
    
    # choose which smoothing factor to use based on minimizing
    # the white space
    extents = []
    count = 0
    for j in range(len(sigYs)):
        filt = gf(input=chunk, sigma=(sigYs[j],sigXs[j]), order=0)
        if count < biThresh:
            th = filters.threshold_otsu(filt)
            count += 1
        binfilt = binarizeImg(filt, th)
        extents.append(np.sum(binfilt))
    j = np.argmin(extents)
    
    filt = ndimage.filters.gaussian_filter(input=chunk, sigma=(sigYs[j],sigXs[j]))
    binfilt = 1 - binarizeImg(filt, th)

    # find connect components
    labels, nrObj = ndimage.label(binfilt)
    osli = ndimage.find_objects(labels)
    
    # find the word boxes
    rec = []
    bounds = []
    sh = np.max(labels.shape)
    for sl in osli:
        sl0 = sl[0].indices(sh)
        sl1 = sl[1].indices(sh)
        
        xLeng = sl1[1]-sl1[0]
        yLeng = sl0[1]-sl0[0]
        if xLeng*yLeng > 100:
            bounds.append([sl1[0], sl1[1]])
            rec.append([[sl1[0], sl0[0]], xLeng, yLeng])
    # combine those that are surrounded by others
    bounds = sorted(bounds)
    newbounds = []
    skipnext = False
    if len(bounds) <= 0:
        return [0, chunk.shape[1]]
    bPrev = bounds[0]
    for i in range(1, len(bounds)):
        bCur = bounds[i]
        if bPrev[1] > bCur[0]:
            bPrev = [bPrev[0], bCur[1]]
        else:
            newbounds.append(bPrev)
            bPrev = [x for x in bCur]
    newbounds.append(bPrev)
    try:
        wbLine = [newbounds[0][0]]
    except:
        wbLine = [0]
    for i in range(1, len(newbounds)):
        b1 = newbounds[i-1]
        b2 = newbounds[i]
        wbLine.append(np.mean([b1[1], b2[0]])-fix)
    try:
        wbLine.append(b2[1])
    except:
        wbLine.append(chunk.shape[1])
#         pass
    
    # plot connected components
    if plotIt:
        fit,ax = plt.subplots(1)
        ax.imshow(labels, cmap='nipy_spectral')
        for i in range(len(rec)):
            rect = mpl.patches.Rectangle(rec[i][0], rec[i][1], rec[i][2], linewidth=1, edgecolor="r", facecolor="none")
            ax.add_patch(rect)
        plt.show()
    
    return np.array(wbLine).astype("int")

## Get data from classification

In [3]:
clExp = pd.read_csv("data/letter-slope-classifications_10nov2017.csv")
clExp = clExp.loc[clExp["workflow_version"] == 16.28]
# len(clExp.loc[:,"subject_ids"].drop_duplicates())
# len(clExp.loc)
clExp["subj_json"] = [json.loads(q) for q in clExp["subject_data"]]
clExp["hdl_id"] = [q.get(list(q.keys())[0]).get("hdl_id", "") for q in clExp["subj_json"]]
clExp = clExp.loc[clExp["hdl_id"] != ""]

## Specify file and folder locations

In [4]:
consensusFolder = "C:/Users/danny/Repos/text_segmentation/consensus/consensus/"
# consensusFile = consensusFolder + "decoding-the-civil-war-consensus-linewise_{mss_label}.csv"
subjFile = "data/decoding-the-civil-war-subjects-9-29-17.csv"
savefile = "accuracy.pkl"

consensusCsvFiles = glob.glob(
    '{}/*.csv'.format(consensusFolder))
consensusCsvFiles = [i for i in consensusCsvFiles if "linewise" in i]

### Read data files

In [5]:
for consensusFile in consensusCsvFiles:
    ##############################################################################################
    # Read data files
    # subject file
    subj = pd.read_csv(subjFile)
    # get only the workflow from the live project
    subj = subj.loc[subj['workflow_id'] == 1874]

    # get metadata in dictionary format
    subj["meta_json"] = [json.loads(q) for q in subj["metadata"]]
    # get hdl_id from metadata
    subj["hdl_id"] = [q.get("hdl_id", "mssF") for q in subj["meta_json"]]
    # get image url
    subj["url"] = [json.loads(q).get("0") for q in subj["locations"]]

    # remove images without ids
    subj = subj[subj["hdl_id"] != "mssF"]
    # remove codebook images (mssEC_36-67)
    filt = subj["hdl_id"].str.contains("mssEC_3[6-9]|[4-6][0-9]")
    subj = subj[~filt]
    # remove ledgers that seemed weird (only easy stuff for now)
    filt = subj["hdl_id"].str.contains("mssEC_3[0-3]|2[6-9]")
    subj = subj[~filt]
    # remove the first few pages because they tended to be blank
    filt = subj["hdl_id"].str.contains("mssEC_\d\d_00[1-6]")
    subj = subj[~filt]

    # consensus file (by line)
    cons = pd.read_csv(consensusFile, sep="@@", engine="python").drop_duplicates()

    # combine the two and sift out unneeded columns
    allTelegramInfo = pd.merge(cons, subj, on="hdl_id", suffixes=["_cons", "_subj"])
    idAndUrl = allTelegramInfo.loc[:,["hdl_id", "url_cons"]].drop_duplicates()
    transcriptionsByLine = allTelegramInfo.loc[:,["hdl_id", "bestLineIndex", "consensus_text", 
                                           "y_loc", "len_wordlist"]]

    # use only the data that has slant information
    idAndUrl = idAndUrl.loc[idAndUrl['hdl_id'].isin(clExp['hdl_id'])]
    transcriptionsByLine = transcriptionsByLine.loc[transcriptionsByLine['hdl_id'].isin(clExp['hdl_id'])]
    
    ##############################################################################################
    # collect the data for the wordbreaks
    data = {}

    nuOpt = np.arange(0.5, 6.5, 0.5) # 0.5:6.5
    biOpt = list(range(1, 10)) # 1:10

    for im in list(idAndUrl.index):
        hdl_id = idAndUrl.loc[im, "hdl_id"]
        data[hdl_id] = {}
        print(hdl_id)

        data[hdl_id]["url"] = idAndUrl.loc[im, "url_cons"]
        let, grey = readImg(idAndUrl.loc[im, "url_cons"])#, True)
        linesForTele = transcriptionsByLine.loc[transcriptionsByLine["hdl_id"] == hdl_id]
        linesForTele.loc[:,"y1"] = [eval(l)[0] for l in linesForTele.loc[:,"y_loc"]]


        # read in and do all pre-processing #################################
        let, grey = removeEdges(let, grey, rmThresh=0)
        grey = whitenEdgesProject(grey)
        greyBi = binarizeImg(grey, biThresh="otsu")#, plotIt=True)
        greySm = smoothImg(grey, smoothSigma=10.0)#, plotIt=True)
        greyBiSm = smoothImg(greyBi, smoothSigma=5.0)#, plotIt=True)


        # get linebreaks ####################################################
        matchlim = 30
        lbold = projBreaks(greySm, "y")
        lb = []
        lb.append(lbold[0])
        cur = 0
        for i in range(1, len(lbold)):
            if np.abs(lb[cur] - lbold[i]) < matchlim:
                lb[cur] = np.mean([lb[cur], lbold[i]])
            else:
                cur += 1
                lb.append(lbold[i])
        lb = np.array(lb).astype("int")
        data[hdl_id]["lb"] = lb

        # get matching lines with actual lines ##############################
        actuallb = linesForTele.loc[:, "y1"]
        matches = []
        for i in range(len(lb)):
            closest = np.argmin(np.abs(np.subtract(lb[i], actuallb)))
            if np.abs(lb[i] - actuallb[closest]) < matchlim:
                matches.append(closest)
            else:
                matches.append(-1)
        data[hdl_id]["matches"] = matches

        # get wordbreaks ####################################################
        accurL = []

        for i in range(1, len(lb)):
            print(i, end=" ")
            if matches[i] == -1:
                continue
            chunk = grey[lb[i-1]:lb[i],]
            rw = linesForTele.loc[matches[i],"consensus_text"]
            # remove leading and trailing "
            if rw[0] == '"':
                rw = rw[1:]
            if rw[-1] == '"':
                rw = rw[:-1]
            nr = len(rw.split(" "))
            for nu in nuOpt:
                for bi in biOpt:
                    br = gaussBreaks(chunk, nu=nu, biThresh=bi, shear=0, fix=0)
                    accurL.append({"i":i, "nu":nu, "bi":bi, "nr":nr, "br":br})
        accur = pd.DataFrame(accurL)
        data[hdl_id]["segment"] = accur
        print()
        
        # save object
        with open(savefile, "wb") as f:
            pickle.dump(data, f)

mssEC_02_007
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 
mssEC_02_032
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 
mssEC_02_131
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 
mssEC_02_175
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 
mssEC_04_089
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 
mssEC_11_085
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
mssEC_11_092
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 
mssEC_11_120
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 
mssEC_11_194
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 
mssEC_11_386
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 
ms

# OOOOOOOOOOOOOOOOOOOOOOOOOOOLLLLLLLLLLLD

## Get data from classification

In [4]:
clExp = pd.read_csv("data/letter-slope-classifications_10nov2017.csv")
clExp = clExp.loc[clExp["workflow_version"] == 16.28]
# len(clExp.loc[:,"subject_ids"].drop_duplicates())
# len(clExp.loc)
clExp["subj_json"] = [json.loads(q) for q in clExp["subject_data"]]
clExp["hdl_id"] = [q.get(list(q.keys())[0]).get("hdl_id", "") for q in clExp["subj_json"]]
clExp = clExp.loc[clExp["hdl_id"] != ""]

## Specify file and folder locations

In [5]:
consensusFolder = "C:/Users/danny/Repos/text_segmentation/consensus/consensus/"
consensusFile = consensusFolder + "decoding-the-civil-war-consensus-linewise_{mss_label}.csv"
subjFile = "dcw_data/decoding-the-civil-war-subjects-9-29-17.csv"

## This will late be in a loop - get hdl_id

In [6]:
mssLabel = "fication_export_mssEC_02_05_25_17"
consensusFile = consensusFile.format(mss_label=mssLabel)

### Read data files

In [7]:
# subject file
subj = pd.read_csv(subjFile)
# get only the workflow from the live project
subj = subj.loc[subj['workflow_id'] == 1874]

# get metadata in dictionary format
subj["meta_json"] = [json.loads(q) for q in subj["metadata"]]
# get hdl_id from metadata
subj["hdl_id"] = [q.get("hdl_id", "mssF") for q in subj["meta_json"]]
# get image url
subj["url"] = [json.loads(q).get("0") for q in subj["locations"]]

# remove images without ids
subj = subj[subj["hdl_id"] != "mssF"]
# remove codebook images (mssEC_36-67)
filt = subj["hdl_id"].str.contains("mssEC_3[6-9]|[4-6][0-9]")
subj = subj[~filt]
# remove ledgers that seemed weird (only easy stuff for now)
filt = subj["hdl_id"].str.contains("mssEC_3[0-3]|2[6-9]")
subj = subj[~filt]
# remove the first few pages because they tended to be blank
filt = subj["hdl_id"].str.contains("mssEC_\d\d_00[1-6]")
subj = subj[~filt]

# consensus file (by line)
cons = pd.read_csv(consensusFile, sep="@@", engine="python").drop_duplicates()

# combine the two and sift out unneeded columns
allTelegramInfo = pd.merge(cons, subj, on="hdl_id", suffixes=["_cons", "_subj"])
idAndUrl = allTelegramInfo.loc[:,["hdl_id", "url_cons"]].drop_duplicates()
transcriptionsByLine = allTelegramInfo.loc[:,["hdl_id", "bestLineIndex", "consensus_text", 
                                       "y_loc", "len_wordlist"]]

In [8]:
# use only the data that has slant information
idAndUrl = idAndUrl.loc[idAndUrl['hdl_id'].isin(clExp['hdl_id'])]
transcriptionsByLine = transcriptionsByLine.loc[transcriptionsByLine['hdl_id'].isin(clExp['hdl_id'])]

## Collect the data for the wordbreaks

In [10]:
data = {}

nuOpt = np.arange(3, 5, 0.5) # 0.5:6.5
biOpt = list(range(1, 4)) # 1:10

for im in list(idAndUrl.index):
    hdl_id = idAndUrl.loc[im, "hdl_id"]
    data[hdl_id] = {}
    print(hdl_id)
    
    let, grey = readImg(idAndUrl.loc[im, "url_cons"])#, True)
    linesForTele = transcriptionsByLine.loc[transcriptionsByLine["hdl_id"] == hdl_id]
    linesForTele.loc[:,"y1"] = [eval(l)[0] for l in linesForTele.loc[:,"y_loc"]]


    # read in and do all pre-processing #################################
    let, grey = removeEdges(let, grey, rmThresh=0)
    grey = whitenEdgesProject(grey)
    greyBi = binarizeImg(grey, biThresh="otsu")#, plotIt=True)
    greySm = smoothImg(grey, smoothSigma=10.0)#, plotIt=True)
    greyBiSm = smoothImg(greyBi, smoothSigma=5.0)#, plotIt=True)


    # get linebreaks ####################################################
    matchlim = 30
    lbold = projBreaks(greySm, "y")
    lb = []
    lb.append(lbold[0])
    cur = 0
    for i in range(1, len(lbold)):
        if np.abs(lb[cur] - lbold[i]) < matchlim:
            lb[cur] = np.mean([lb[cur], lbold[i]])
        else:
            cur += 1
            lb.append(lbold[i])
    lb = np.array(lb).astype("int")
    data[hdl_id]["lb"] = lb
    
    # get matching lines with actual lines ##############################
    actuallb = linesForTele.loc[:, "y1"]
    matches = []
    for i in range(len(lb)):
        closest = np.argmin(np.abs(np.subtract(lb[i], actuallb)))
        if np.abs(lb[i] - actuallb[closest]) < matchlim:
            matches.append(closest)
        else:
            matches.append(-1)
    data[hdl_id]["matches"] = matches
    
    # get wordbreaks ####################################################
    accurL = []

    for i in range(1, len(lb)):
        print(i, end=" ")
        if matches[i] == -1:
            continue
        chunk = grey[lb[i-1]:lb[i],]
        nr = len(eval(linesForTele.loc[matches[i],"consensus_text"]).split(" "))
        for nu in nuOpt:
            for bi in biOpt:
                br = gaussBreaks(chunk, nu=nu, biThresh=bi, shear=0, fix=0)
                accurL.append({"i":i, "nu":nu, "bi":bi, "nr":nr, "br":br})
    accur = pd.DataFrame(accurL)
    data[hdl_id]["segment"] = accur
    print()

mssEC_02_007
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 
mssEC_02_032
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 
mssEC_02_131
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 
mssEC_02_175
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 
