Skip to content

Commit

Permalink
Add TopMargin & BottomMargin transformations
Browse files Browse the repository at this point in the history
  • Loading branch information
filak committed Jan 24, 2020
1 parent 6c43b7c commit 61bb10e
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 18 deletions.
24 changes: 21 additions & 3 deletions alto__hocr.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,17 @@ License: MIT
<!--
  Converts an ALTO Page into an hOCR div of class ocr_page.

  id    : mf:getId called with the Page ID attribute, the literal 'page' and the Page node.
  title : image "<fileName>"; bbox 0 0 WIDTH HEIGHT; ppageno 0
          fileName is read from Description/sourceImageInformation/fileName,
          WIDTH and HEIGHT from the Page attributes; ppageno is always 0.

  The image name is written inside double quotes. hOCR expects file names as
  quoted strings, and the hOCR to ALTO stylesheets of this project recover the
  name by searching the title for the text: image "
  Without the quotes a file produced here could not be read back by them.

  Children are emitted in the order TopMargin, PrintSpace, BottomMargin.
-->
<xsl:template match="*:Page">
  <xsl:variable name="fname"><xsl:value-of select="//*:alto/*:Description/*:sourceImageInformation/*:fileName"/></xsl:variable>
  <div class="ocr_page" id="{mf:getId(@ID,'page',.)}" title="image &quot;{$fname}&quot;; bbox 0 0 {@WIDTH} {@HEIGHT}; ppageno 0">
    <xsl:apply-templates select="*:TopMargin"/>
    <xsl:apply-templates select="*:PrintSpace"/>
    <xsl:apply-templates select="*:BottomMargin"/>
  </div>
</xsl:template>


<!--
  Maps an ALTO TopMargin onto an hOCR div of class ocr_header.
  id    : mf:getId called with the ID attribute, the literal 'block' and the context node.
  title : mf:getBox called with HEIGHT, WIDTH, VPOS, HPOS and WC.
  ComposedBlock children are processed first, then TextBlock children, so the
  output order differs from document order when the two kinds are interleaved.
-->
<xsl:template match="*:TopMargin">
<div class="ocr_header" id="{mf:getId(@ID,'block',.)}" title="{mf:getBox(@HEIGHT,@WIDTH,@VPOS,@HPOS,@WC)}">
<xsl:apply-templates select="*:ComposedBlock"/>
<xsl:apply-templates select="*:TextBlock"/>
</div>
</xsl:template>

Expand All @@ -80,9 +90,17 @@ License: MIT
<xsl:apply-templates select="*:ComposedBlock"/>
<xsl:apply-templates select="*:TextBlock"/>
</xsl:template>


<xsl:template match="*:ComposedBlock">


<!--
  Maps an ALTO BottomMargin onto an hOCR div of class ocr_footer.
  id    : mf:getId called with the ID attribute, the literal 'block' and the context node.
  title : mf:getBox called with HEIGHT, WIDTH, VPOS, HPOS and WC.
  ComposedBlock children are processed first, then TextBlock children, so the
  output order differs from document order when the two kinds are interleaved.
-->
<xsl:template match="*:BottomMargin">
<div class="ocr_footer" id="{mf:getId(@ID,'block',.)}" title="{mf:getBox(@HEIGHT,@WIDTH,@VPOS,@HPOS,@WC)}">
<xsl:apply-templates select="*:ComposedBlock"/>
<xsl:apply-templates select="*:TextBlock"/>
</div>
</xsl:template>


<xsl:template match="*:ComposedBlock">
<div class="ocr_carea" id="{mf:getId(@ID,'block',.)}" title="{mf:getBox(@HEIGHT,@WIDTH,@VPOS,@HPOS,@WC)}">
<xsl:apply-templates select="*:TextBlock"/>
</div>
Expand Down
49 changes: 41 additions & 8 deletions hocr__alto2.0.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ License: MIT
</alto>
</xsl:template>


<xsl:template match="*:head">
<Description>
<MeasurementUnit>pixel</MeasurementUnit>
Expand All @@ -53,16 +54,43 @@ License: MIT
<xsl:apply-templates select="*:div[@class='ocr_page']"/>
</Layout>
</xsl:template>


<!--
  Converts an hOCR div of class ocr_page into an ALTO Page.

  The page box is the second "; " separated property of the title (see
  mf:getBoxPage). Split on spaces, its tokens are:
    1 = keyword, 2 = left, 3 = top, 4 = right, 5 = bottom.
  Page and PrintSpace both take HEIGHT from token 5 and WIDTH from token 4;
  PrintSpace is anchored at VPOS 0 / HPOS 0.

  Child order follows the ALTO schema, whose Page content model is the fixed
  sequence TopMargin, LeftMargin, RightMargin, BottomMargin, PrintSpace.
  Both margins are therefore written before PrintSpace; writing BottomMargin
  after PrintSpace yields a document that fails schema validation.

  Inside PrintSpace all ocr_carea divs are processed first, then all ocr_par
  paragraphs that are direct children of the page.
-->
<xsl:template match="*:div[@class='ocr_page']">
  <!-- bbox 552 999 1724 1141 x1-L2-T3-R4-B5 -->
  <xsl:variable name="box" select="tokenize(mf:getBoxPage(@title), ' ')"/>
  <Page ID="{@id}" PHYSICAL_IMG_NR="1" HEIGHT="{$box[5]}" WIDTH="{$box[4]}">

    <xsl:apply-templates select="*:div[@class='ocr_header']"/>

    <xsl:apply-templates select="*:div[@class='ocr_footer']"/>

    <PrintSpace HEIGHT="{$box[5]}" WIDTH="{$box[4]}" VPOS="0" HPOS="0">
      <xsl:apply-templates select="*:div[@class='ocr_carea']"/>
      <xsl:apply-templates select="*:p[@class='ocr_par']"/>
    </PrintSpace>

  </Page>
</xsl:template>


<!--
  Turns an hOCR div of class ocr_header into an ALTO TopMargin.

  The string returned by mf:getBox(@title) is split on spaces; positions 2 to 5
  are read as left, top, right and bottom.
    HEIGHT = bottom - top      WIDTH = right - left
    VPOS   = top (as written)  HPOS  = left (as written)
  Assumes the bbox property comes first in the title so that position 1 is the
  keyword. TODO confirm against the hOCR producers in use.
  All child nodes of the div are processed by the remaining templates.
-->
<xsl:template match="*:div[@class='ocr_header']">
  <xsl:variable name="coords" select="tokenize(mf:getBox(@title), ' ')"/>
  <xsl:variable name="boxHeight" select="number($coords[5]) - number($coords[3])"/>
  <xsl:variable name="boxWidth" select="number($coords[4]) - number($coords[2])"/>
  <TopMargin ID="{@id}"
             HEIGHT="{$boxHeight}"
             WIDTH="{$boxWidth}"
             VPOS="{$coords[3]}"
             HPOS="{$coords[2]}">
    <xsl:apply-templates/>
  </TopMargin>
</xsl:template>


<!--
  Turns an hOCR div of class ocr_footer into an ALTO BottomMargin.

  The string returned by mf:getBox(@title) is split on spaces; positions 2 to 5
  are read as left, top, right and bottom.
    HEIGHT = bottom - top      WIDTH = right - left
    VPOS   = top (as written)  HPOS  = left (as written)
  Assumes the bbox property comes first in the title so that position 1 is the
  keyword. TODO confirm against the hOCR producers in use.
  All child nodes of the div are processed by the remaining templates.
-->
<xsl:template match="*:div[@class='ocr_footer']">
  <xsl:variable name="coords" select="tokenize(mf:getBox(@title), ' ')"/>
  <xsl:variable name="boxHeight" select="number($coords[5]) - number($coords[3])"/>
  <xsl:variable name="boxWidth" select="number($coords[4]) - number($coords[2])"/>
  <BottomMargin ID="{@id}"
                HEIGHT="{$boxHeight}"
                WIDTH="{$boxWidth}"
                VPOS="{$coords[3]}"
                HPOS="{$coords[2]}">
    <xsl:apply-templates/>
  </BottomMargin>
</xsl:template>


Expand Down Expand Up @@ -153,24 +181,28 @@ License: MIT
</xsl:choose>
</xsl:template>




<!--
  Extracts the image file name from an hOCR page title.

  titleString : the raw title attribute.
  Steps: normalise whitespace, take the first "; " separated property, keep
  what follows the text  image "  , drop every remaining double quote, split
  on backslashes and return the last segment.
  Only backslash is treated as a path separator; a path written with forward
  slashes is returned whole. Returns an empty string when the first property
  does not contain  image "  .
-->
<xsl:function name="mf:getFname">
  <xsl:param name="titleString"/>
  <xsl:variable name="quoteMark">"</xsl:variable>
  <xsl:variable name="imageProperty" select="tokenize(normalize-space($titleString),'; ')[1]"/>
  <xsl:variable name="quotedPath" select="substring-after($imageProperty,'image &quot;')"/>
  <xsl:variable name="pathSegments" select="tokenize(replace($quotedPath,$quoteMark,''),'\\')"/>
  <xsl:value-of select="$pathSegments[last()]"/>
</xsl:function>



<!--
  Returns the second "; " separated property of a whitespace-normalised title.
  The ocr_page template splits this value on spaces and reads its tokens 4 and 5
  as the page width and height.
  titleString : the raw title attribute.
-->
<xsl:function name="mf:getBoxPage">
  <xsl:param name="titleString"/>
  <xsl:variable name="properties" select="tokenize(normalize-space($titleString),'; ')"/>
  <xsl:value-of select="$properties[2]"/>
</xsl:function>


<!--
  Splits a whitespace-normalised title on "; " and returns the pieces through
  xsl:value-of. Under XSLT 2.0 rules that joins the pieces with single spaces,
  so callers can split the result on ' ' and index the tokens directly.
  NOTE(review): the stylesheet version attribute is not visible here; confirm
  it is 2.0 or later, otherwise only the first piece would be returned.
  titleString : the raw title attribute.
-->
<xsl:function name="mf:getBox">
  <xsl:param name="titleString"/>
  <xsl:variable name="properties" select="tokenize(normalize-space($titleString),'; ')"/>
  <xsl:value-of select="$properties"/>
</xsl:function>


<xsl:function name="mf:getConfidence">
<xsl:param name="titleString"/>
<xsl:variable name="wconfString" select="tokenize(normalize-space($titleString),'; ')[2]" />
Expand All @@ -186,5 +218,6 @@ License: MIT
</xsl:choose>

</xsl:function>



</xsl:stylesheet>
37 changes: 35 additions & 2 deletions hocr__alto2.1.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ License: MIT
<xsl:apply-templates/>
</alto>
</xsl:template>


<xsl:template match="*:head">
<Description>
Expand All @@ -53,17 +54,44 @@ License: MIT
<xsl:apply-templates select="*:div[@class='ocr_page']"/>
</Layout>
</xsl:template>


<!--
  Converts an hOCR div of class ocr_page into an ALTO Page.

  The page box is the second "; " separated property of the title (see
  mf:getBoxPage). Split on spaces, its tokens are:
    1 = keyword, 2 = left, 3 = top, 4 = right, 5 = bottom.
  Page and PrintSpace both take HEIGHT from token 5 and WIDTH from token 4;
  PrintSpace is anchored at VPOS 0 / HPOS 0.

  Child order follows the ALTO schema, whose Page content model is the fixed
  sequence TopMargin, LeftMargin, RightMargin, BottomMargin, PrintSpace.
  Both margins are therefore written before PrintSpace; writing BottomMargin
  after PrintSpace yields a document that fails schema validation.

  Inside PrintSpace all ocr_carea divs are processed first, then all ocr_par
  paragraphs that are direct children of the page.
-->
<xsl:template match="*:div[@class='ocr_page']">
  <!-- bbox 552 999 1724 1141 x1-L2-T3-R4-B5 -->
  <xsl:variable name="box" select="tokenize(mf:getBoxPage(@title), ' ')"/>
  <Page ID="{@id}" PHYSICAL_IMG_NR="1" HEIGHT="{$box[5]}" WIDTH="{$box[4]}">

    <xsl:apply-templates select="*:div[@class='ocr_header']"/>

    <xsl:apply-templates select="*:div[@class='ocr_footer']"/>

    <PrintSpace HEIGHT="{$box[5]}" WIDTH="{$box[4]}" VPOS="0" HPOS="0">
      <xsl:apply-templates select="*:div[@class='ocr_carea']"/>
      <xsl:apply-templates select="*:p[@class='ocr_par']"/>
    </PrintSpace>

  </Page>
</xsl:template>


<!--
  Turns an hOCR div of class ocr_header into an ALTO TopMargin.

  The string returned by mf:getBox(@title) is split on spaces; positions 2 to 5
  are read as left, top, right and bottom.
    HEIGHT = bottom - top      WIDTH = right - left
    VPOS   = top (as written)  HPOS  = left (as written)
  Assumes the bbox property comes first in the title so that position 1 is the
  keyword. TODO confirm against the hOCR producers in use.
  All child nodes of the div are processed by the remaining templates.
-->
<xsl:template match="*:div[@class='ocr_header']">
  <xsl:variable name="coords" select="tokenize(mf:getBox(@title), ' ')"/>
  <xsl:variable name="boxHeight" select="number($coords[5]) - number($coords[3])"/>
  <xsl:variable name="boxWidth" select="number($coords[4]) - number($coords[2])"/>
  <TopMargin ID="{@id}"
             HEIGHT="{$boxHeight}"
             WIDTH="{$boxWidth}"
             VPOS="{$coords[3]}"
             HPOS="{$coords[2]}">
    <xsl:apply-templates/>
  </TopMargin>
</xsl:template>


<!--
  Turns an hOCR div of class ocr_footer into an ALTO BottomMargin.

  The string returned by mf:getBox(@title) is split on spaces; positions 2 to 5
  are read as left, top, right and bottom.
    HEIGHT = bottom - top      WIDTH = right - left
    VPOS   = top (as written)  HPOS  = left (as written)
  Assumes the bbox property comes first in the title so that position 1 is the
  keyword. TODO confirm against the hOCR producers in use.
  All child nodes of the div are processed by the remaining templates.
-->
<xsl:template match="*:div[@class='ocr_footer']">
  <xsl:variable name="coords" select="tokenize(mf:getBox(@title), ' ')"/>
  <xsl:variable name="boxHeight" select="number($coords[5]) - number($coords[3])"/>
  <xsl:variable name="boxWidth" select="number($coords[4]) - number($coords[2])"/>
  <BottomMargin ID="{@id}"
                HEIGHT="{$boxHeight}"
                WIDTH="{$boxWidth}"
                VPOS="{$coords[3]}"
                HPOS="{$coords[2]}">
    <xsl:apply-templates/>
  </BottomMargin>
</xsl:template>


<xsl:template match="*:div[@class='ocr_carea']">
Expand Down Expand Up @@ -154,23 +182,27 @@ License: MIT
</xsl:template>



<!--
  Extracts the image file name from an hOCR page title.

  titleString : the raw title attribute.
  Steps: normalise whitespace, take the first "; " separated property, keep
  what follows the text  image "  , drop every remaining double quote, split
  on backslashes and return the last segment.
  Only backslash is treated as a path separator; a path written with forward
  slashes is returned whole. Returns an empty string when the first property
  does not contain  image "  .
-->
<xsl:function name="mf:getFname">
  <xsl:param name="titleString"/>
  <xsl:variable name="quoteMark">"</xsl:variable>
  <xsl:variable name="imageProperty" select="tokenize(normalize-space($titleString),'; ')[1]"/>
  <xsl:variable name="quotedPath" select="substring-after($imageProperty,'image &quot;')"/>
  <xsl:variable name="pathSegments" select="tokenize(replace($quotedPath,$quoteMark,''),'\\')"/>
  <xsl:value-of select="$pathSegments[last()]"/>
</xsl:function>


<!--
  Returns the second "; " separated property of a whitespace-normalised title.
  The ocr_page template splits this value on spaces and reads its tokens 4 and 5
  as the page width and height.
  titleString : the raw title attribute.
-->
<xsl:function name="mf:getBoxPage">
  <xsl:param name="titleString"/>
  <xsl:variable name="properties" select="tokenize(normalize-space($titleString),'; ')"/>
  <xsl:value-of select="$properties[2]"/>
</xsl:function>


<!--
  Splits a whitespace-normalised title on "; " and returns the pieces through
  xsl:value-of. Under XSLT 2.0 rules that joins the pieces with single spaces,
  so callers can split the result on ' ' and index the tokens directly.
  NOTE(review): the stylesheet version attribute is not visible here; confirm
  it is 2.0 or later, otherwise only the first piece would be returned.
  titleString : the raw title attribute.
-->
<xsl:function name="mf:getBox">
  <xsl:param name="titleString"/>
  <xsl:variable name="properties" select="tokenize(normalize-space($titleString),'; ')"/>
  <xsl:value-of select="$properties"/>
</xsl:function>


<xsl:function name="mf:getConfidence">
<xsl:param name="titleString"/>
<xsl:variable name="wconfString" select="tokenize(normalize-space($titleString),'; ')[2]" />
Expand All @@ -187,4 +219,5 @@ License: MIT

</xsl:function>


</xsl:stylesheet>
39 changes: 36 additions & 3 deletions hocr__alto3.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@ License: MIT
<!-- Global variable holding every "code" element found under the "codes" root of the external file codes_lookup.xml (namespace ignored). Presumably a language code lookup table; its consumers are not visible in this excerpt. -->
<xsl:variable name="langcodes" select="document('codes_lookup.xml')/*:codes/*:code" />

<!-- Root template: wraps the whole result in an alto element carrying an xsi:schemaLocation hint, then hands the children of the document node to the other templates. -->
<!-- NOTE(review): the two alto start tags below appear to be the before (v3) and after (v4) lines of the diff view; only one of them can exist in the real stylesheet. Confirm that pointing a stylesheet named hocr__alto3.xsl at the ALTO v4 namespace and schema is intended. -->
<xsl:template match="/">
<alto xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v3# https://www.loc.gov/standards/alto/v3/alto.xsd">
<alto xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v4# https://www.loc.gov/standards/alto/v4/alto.xsd">
<xsl:apply-templates/>
</alto>
</xsl:template>


<xsl:template match="*:head">
<Description>
<MeasurementUnit>pixel</MeasurementUnit>
Expand All @@ -53,17 +54,44 @@ License: MIT
<xsl:apply-templates select="*:div[@class='ocr_page']"/>
</Layout>
</xsl:template>


<!--
  Converts an hOCR div of class ocr_page into an ALTO Page.

  The page box is the second "; " separated property of the title (see
  mf:getBoxPage). Split on spaces, its tokens are:
    1 = keyword, 2 = left, 3 = top, 4 = right, 5 = bottom.
  Page and PrintSpace both take HEIGHT from token 5 and WIDTH from token 4;
  PrintSpace is anchored at VPOS 0 / HPOS 0.

  Child order follows the ALTO schema, whose Page content model is the fixed
  sequence TopMargin, LeftMargin, RightMargin, BottomMargin, PrintSpace.
  Both margins are therefore written before PrintSpace; writing BottomMargin
  after PrintSpace yields a document that fails schema validation.

  Inside PrintSpace all ocr_carea divs are processed first, then all ocr_par
  paragraphs that are direct children of the page.
-->
<xsl:template match="*:div[@class='ocr_page']">
  <!-- bbox 552 999 1724 1141 x1-L2-T3-R4-B5 -->
  <xsl:variable name="box" select="tokenize(mf:getBoxPage(@title), ' ')"/>
  <Page ID="{@id}" PHYSICAL_IMG_NR="1" HEIGHT="{$box[5]}" WIDTH="{$box[4]}">

    <xsl:apply-templates select="*:div[@class='ocr_header']"/>

    <xsl:apply-templates select="*:div[@class='ocr_footer']"/>

    <PrintSpace HEIGHT="{$box[5]}" WIDTH="{$box[4]}" VPOS="0" HPOS="0">
      <xsl:apply-templates select="*:div[@class='ocr_carea']"/>
      <xsl:apply-templates select="*:p[@class='ocr_par']"/>
    </PrintSpace>

  </Page>
</xsl:template>


<!--
  Turns an hOCR div of class ocr_header into an ALTO TopMargin.

  The string returned by mf:getBox(@title) is split on spaces; positions 2 to 5
  are read as left, top, right and bottom.
    HEIGHT = bottom - top      WIDTH = right - left
    VPOS   = top (as written)  HPOS  = left (as written)
  Assumes the bbox property comes first in the title so that position 1 is the
  keyword. TODO confirm against the hOCR producers in use.
  All child nodes of the div are processed by the remaining templates.
-->
<xsl:template match="*:div[@class='ocr_header']">
  <xsl:variable name="coords" select="tokenize(mf:getBox(@title), ' ')"/>
  <xsl:variable name="boxHeight" select="number($coords[5]) - number($coords[3])"/>
  <xsl:variable name="boxWidth" select="number($coords[4]) - number($coords[2])"/>
  <TopMargin ID="{@id}"
             HEIGHT="{$boxHeight}"
             WIDTH="{$boxWidth}"
             VPOS="{$coords[3]}"
             HPOS="{$coords[2]}">
    <xsl:apply-templates/>
  </TopMargin>
</xsl:template>


<!--
  Turns an hOCR div of class ocr_footer into an ALTO BottomMargin.

  The string returned by mf:getBox(@title) is split on spaces; positions 2 to 5
  are read as left, top, right and bottom.
    HEIGHT = bottom - top      WIDTH = right - left
    VPOS   = top (as written)  HPOS  = left (as written)
  Assumes the bbox property comes first in the title so that position 1 is the
  keyword. TODO confirm against the hOCR producers in use.
  All child nodes of the div are processed by the remaining templates.
-->
<xsl:template match="*:div[@class='ocr_footer']">
  <xsl:variable name="coords" select="tokenize(mf:getBox(@title), ' ')"/>
  <xsl:variable name="boxHeight" select="number($coords[5]) - number($coords[3])"/>
  <xsl:variable name="boxWidth" select="number($coords[4]) - number($coords[2])"/>
  <BottomMargin ID="{@id}"
                HEIGHT="{$boxHeight}"
                WIDTH="{$boxWidth}"
                VPOS="{$coords[3]}"
                HPOS="{$coords[2]}">
    <xsl:apply-templates/>
  </BottomMargin>
</xsl:template>


<xsl:template match="*:div[@class='ocr_carea']">
Expand Down Expand Up @@ -154,23 +182,27 @@ License: MIT
</xsl:template>



<!--
  Extracts the image file name from an hOCR page title.

  titleString : the raw title attribute.
  Steps: normalise whitespace, take the first "; " separated property, keep
  what follows the text  image "  , drop every remaining double quote, split
  on backslashes and return the last segment.
  Only backslash is treated as a path separator; a path written with forward
  slashes is returned whole. Returns an empty string when the first property
  does not contain  image "  .
-->
<xsl:function name="mf:getFname">
  <xsl:param name="titleString"/>
  <xsl:variable name="quoteMark">"</xsl:variable>
  <xsl:variable name="imageProperty" select="tokenize(normalize-space($titleString),'; ')[1]"/>
  <xsl:variable name="quotedPath" select="substring-after($imageProperty,'image &quot;')"/>
  <xsl:variable name="pathSegments" select="tokenize(replace($quotedPath,$quoteMark,''),'\\')"/>
  <xsl:value-of select="$pathSegments[last()]"/>
</xsl:function>


<!--
  Returns the second "; " separated property of a whitespace-normalised title.
  The ocr_page template splits this value on spaces and reads its tokens 4 and 5
  as the page width and height.
  titleString : the raw title attribute.
-->
<xsl:function name="mf:getBoxPage">
  <xsl:param name="titleString"/>
  <xsl:variable name="properties" select="tokenize(normalize-space($titleString),'; ')"/>
  <xsl:value-of select="$properties[2]"/>
</xsl:function>


<!--
  Splits a whitespace-normalised title on "; " and returns the pieces through
  xsl:value-of. Under XSLT 2.0 rules that joins the pieces with single spaces,
  so callers can split the result on ' ' and index the tokens directly.
  NOTE(review): the stylesheet version attribute is not visible here; confirm
  it is 2.0 or later, otherwise only the first piece would be returned.
  titleString : the raw title attribute.
-->
<xsl:function name="mf:getBox">
  <xsl:param name="titleString"/>
  <xsl:variable name="properties" select="tokenize(normalize-space($titleString),'; ')"/>
  <xsl:value-of select="$properties"/>
</xsl:function>


<xsl:function name="mf:getConfidence">
<xsl:param name="titleString"/>
<xsl:variable name="wconfString" select="tokenize(normalize-space($titleString),'; ')[2]" />
Expand All @@ -187,4 +219,5 @@ License: MIT

</xsl:function>


</xsl:stylesheet>
Loading

0 comments on commit 61bb10e

Please sign in to comment.