Skip to content

Commit

Permalink
Merge pull request #1061 from thinkbeforecoding/master
Browse files Browse the repository at this point in the history
Fix #1022: HtmlProvider Self closing table tags
  • Loading branch information
Gustavo Guerra committed Jul 21, 2017
2 parents ee4f12e + 6153202 commit 4e3c63c
Show file tree
Hide file tree
Showing 4 changed files with 216 additions and 1 deletion.
2 changes: 1 addition & 1 deletion docs/content/library/HtmlProvider.fsx
Expand Up @@ -129,7 +129,7 @@ let doctorWho = new HtmlProvider<DrWho>()
// Get the average number of viewers for each doctor's series run
let viewersByDoctor =
doctorWho.Tables.``Series overview``.Rows
|> Seq.groupBy (fun season -> season.``Doctor(s)``)
|> Seq.groupBy (fun season -> season.Doctor)
|> Seq.map (fun (doctor, seasons) ->
let averaged =
seasons
Expand Down
44 changes: 44 additions & 0 deletions src/Html/HtmlParser.fs
Expand Up @@ -704,6 +704,43 @@ module internal HtmlParser =
| "area" | "base" | "br" | "col" | "embed"| "hr" | "img" | "input" | "keygen" | "link" | "menuitem" | "meta" | "param"
| "source" | "track" | "wbr" -> true
| _ -> false

/// Decides whether meeting the start tag `startTag` while the element named
/// `expectedTagEnd` is still open implicitly terminates that open element.
///   * an open cell (td/th) is terminated by a new row or a new cell;
///   * an open row (tr) or list item (li) is terminated by a sibling of the same name.
/// Every other combination yields false.
let isImplicitlyClosedByStartTag expectedTagEnd startTag =
    match expectedTagEnd with
    | "td" | "th" ->
        // a cell ends as soon as a new row, cell or header cell begins
        startTag = "tr" || startTag = "td" || startTag = "th"
    | "tr" | "li" ->
        // rows and list items only end on a sibling with the very same name
        startTag = expectedTagEnd
    | _ -> false

/// Prepends to `tokens` the end-tag tokens that were left out of the markup
/// when the start tag `startTag` arrives while `expectedTagEnd` is still open.
/// `tokens` is returned untouched for any pair that is not an implicit close.
let implicitlyCloseByStartTag expectedTagEnd startTag tokens =
    let isCell tag = tag = "td" || tag = "th"
    // a new row ends both the open cell and the row that contains it
    let closesRowToo = isCell expectedTagEnd && startTag = "tr"
    // cell -> cell, tr -> tr and li -> li are siblings: one level to close
    let closesSameLevel =
        (isCell expectedTagEnd && isCell startTag)
        || (expectedTagEnd = startTag && (startTag = "tr" || startTag = "li"))
    if closesRowToo then
        TagEnd expectedTagEnd :: TagEnd "tr" :: tokens
    elif closesSameLevel then
        TagEnd expectedTagEnd :: tokens
    else
        tokens

/// Decides whether meeting the end tag `endTag` while the element named
/// `expectedTagEnd` is still open implicitly terminates that open element.
///   * an open cell (td/th) or row (tr) is terminated by the end of the
///     enclosing table section (thead/tbody/tfoot) or of the table itself;
///   * an open list item (li) is terminated by the end of its enclosing list,
///     whether unordered (ul) or ordered (ol).
/// Every other combination yields false.
let isImplicitlyClosedByEndTag expectedTagEnd endTag =
    match expectedTagEnd, endTag with
    | ("td"|"th"|"tr"), ("thead"|"tbody"|"tfoot"|"table") -> true
    // the closing tag of li may be omitted in ordered lists just as in unordered ones
    | "li", ("ul"|"ol") -> true
    | _ -> false

/// Prepends to `tokens` the end-tag tokens that were left out of the markup
/// for the still-open element `expectedTagEnd`. `tokens` is returned untouched
/// when `expectedTagEnd` is not td, th, tr or li.
let implicitlyCloseByEndTag expectedTagEnd tokens =
    if expectedTagEnd = "td" || expectedTagEnd = "th" then
        // an open cell sits inside an open row: both levels are closed
        TagEnd expectedTagEnd :: TagEnd "tr" :: tokens
    elif expectedTagEnd = "tr" || expectedTagEnd = "li" then
        // only one level needs to be closed
        TagEnd expectedTagEnd :: tokens
    else
        tokens

let rec parse' docType elements expectedTagEnd parentTagName (tokens:HtmlToken list) =
match tokens with
| DocType dt :: rest -> parse' (dt.Trim()) elements expectedTagEnd parentTagName rest
Expand All @@ -716,6 +753,13 @@ module internal HtmlParser =
| Tag(false, name, attributes) :: rest when canNotHaveChildren name ->
let e = HtmlElement(name, attributes, [])
parse' docType (e :: elements) expectedTagEnd parentTagName rest
| Tag(_, name, _) :: _ when isImplicitlyClosedByStartTag expectedTagEnd name ->
// insert missing </tr> </td> or </th> when starting new row/cell/header
parse' docType elements expectedTagEnd parentTagName (implicitlyCloseByStartTag expectedTagEnd name tokens)
| TagEnd(name) :: _ when isImplicitlyClosedByEndTag expectedTagEnd name ->
// insert missing </li>, </tr>, </td> or </th> when the end tag of an enclosing list or table element is reached
parse' docType elements expectedTagEnd parentTagName (implicitlyCloseByEndTag expectedTagEnd tokens)

| Tag(_, name, attributes) :: rest ->
let dt, tokens, content = parse' docType [] name expectedTagEnd rest
let e = HtmlElement(name, attributes, content)
Expand Down
Expand Up @@ -28,6 +28,9 @@ class HtmlProvider : FDR.BaseTypes.HtmlDocument
static member Parse: text:string -> HtmlProvider
HtmlDocument.Create(false, ((new StringReader(text)) :> TextReader))

member Tables: HtmlProvider+TablesContainer with get
this


class HtmlProvider+ListsContainer : FDR.BaseTypes.HtmlDocument
member ``2002 ROVER 75,CLUB SE.A NICE ONE.!!!!!!!!.``: HtmlProvider+2002Rover75ClubSeANiceOne with get
Expand Down Expand Up @@ -91,10 +94,22 @@ class HtmlProvider+ListsContainer : FDR.BaseTypes.HtmlDocument
HtmlList<_>.Create(rowConverter, this, "See also")


class HtmlProvider+TablesContainer : FDR.BaseTypes.HtmlDocument
member ``Additional site navigation``: HtmlProvider+AdditionalSiteNavigation with get
let rowConverter = new Func<_,_>(fun (row:string[]) ->
let value = TextConversions.AsString(row.[0])
TextRuntime.GetNonOptionalValue("Column1", TextRuntime.ConvertString(value), value),
let value = TextConversions.AsString(row.[1])
TextRuntime.GetNonOptionalValue("Column2", TextRuntime.ConvertString(value), value))
HtmlTable<_>.Create(rowConverter, this, "Additional site navigation", false)


class HtmlProvider+2002Rover75ClubSeANiceOne : FDR.BaseTypes.HtmlList<string>

class HtmlProvider+2002Rover75ClubSeANiceOne2 : FDR.BaseTypes.HtmlList<string>

class HtmlProvider+AdditionalSiteNavigation : FDR.BaseTypes.HtmlTable<HtmlProvider+AdditionalSiteNavigation+Row>

class HtmlProvider+GfL : FDR.BaseTypes.HtmlList<string>

class HtmlProvider+GhEb : FDR.BaseTypes.HtmlList<string>
Expand All @@ -111,3 +126,11 @@ class HtmlProvider+Menu5 : FDR.BaseTypes.HtmlList<int>

class HtmlProvider+SeeAlso : FDR.BaseTypes.HtmlList<string>

class HtmlProvider+AdditionalSiteNavigation+Row : string * string
member Column1: string with get
(let t1,_ = this in t1)

member Column2: string with get
(let _,t2 = this in t2)


148 changes: 148 additions & 0 deletions tests/FSharp.Data.Tests/HtmlParser.fs
Expand Up @@ -323,6 +323,135 @@ let ``Extracts data and headers with thead and tbody``() =
[ "January"; "$100" ]
[ "February"; "$80" ] ]

[<Test>]
let ``Extracts data and headers with unclosed tr th and td``() =
    // Table with thead/tfoot/tbody sections in which no tr, th or td is ever closed.
    let html = """<table id="savings_table">
<thead>
<tr>
<th>Month
<th>Savings
</thead>
<tfoot>
<tr>
<td>Sum
<td>$180
</tfoot>
<tbody>
<tr>
<td>January
<td>$100
<tr>
<td>February
<td>$80
</tbody>
</table>"""

    // Rows are expected in document order: header, footer, then the two body rows.
    let expectedRows =
        [ [ "Month"; "Savings" ]
          [ "Sum"; "$180" ]
          [ "January"; "$100" ]
          [ "February"; "$80" ] ]

    let tables = getTables true (HtmlDocument.Parse html)
    tables.Length |> should equal 1

    let table = tables.[0]
    table.Name |> should equal "savings_table"
    table.HasHeaders |> should equal (Some true)
    table.Rows |> should equal expectedRows

[<Test>]
let ``Extracts data and headers with unclosed tr``() =
    // Table with thead/tfoot/tbody sections: cells are closed, rows are not.
    let html = """<table id="savings_table">
<thead>
<tr>
<th>Month</th>
<th>Savings</th>
</thead>
<tfoot>
<tr>
<td>Sum</td>
<td>$180</td>
</tfoot>
<tbody>
<tr>
<td>January</td>
<td>$100</td>
<tr>
<td>February</td>
<td>$80</td>
</tbody>
</table>"""

    // Rows are expected in document order: header, footer, then the two body rows.
    let expectedRows =
        [ [ "Month"; "Savings" ]
          [ "Sum"; "$180" ]
          [ "January"; "$100" ]
          [ "February"; "$80" ] ]

    let tables = getTables true (HtmlDocument.Parse html)
    tables.Length |> should equal 1

    let table = tables.[0]
    table.Name |> should equal "savings_table"
    table.HasHeaders |> should equal (Some true)
    table.Rows |> should equal expectedRows

[<Test>]
let ``Extracts data and headers with unclosed tr th and td without tbody``() =
    // Flat table (no thead/tfoot/tbody) in which no tr, th or td is ever closed.
    let html = """<table id="savings_table">
<tr>
<th>Month
<th>Savings
<tr>
<td>Sum
<td>$180
<tr>
<td>January
<td>$100
<tr>
<td>February
<td>$80
</table>"""

    // One expected row per <tr>, in document order.
    let expectedRows =
        [ [ "Month"; "Savings" ]
          [ "Sum"; "$180" ]
          [ "January"; "$100" ]
          [ "February"; "$80" ] ]

    let tables = getTables true (HtmlDocument.Parse html)
    tables.Length |> should equal 1

    let table = tables.[0]
    table.Name |> should equal "savings_table"
    table.HasHeaders |> should equal (Some true)
    table.Rows |> should equal expectedRows

[<Test>]
let ``Extracts data and headers with unclosed tr without tbody``() =
    // Flat table (no thead/tfoot/tbody): cells are closed, rows are not.
    let html = """<table id="savings_table">
<tr>
<th>Month</th>
<th>Savings</th>
<tr>
<td>Sum</td>
<td>$180</td>
<tr>
<td>January</td>
<td>$100</td>
<tr>
<td>February</td>
<td>$80</td>
</table>"""

    // One expected row per <tr>, in document order.
    let expectedRows =
        [ [ "Month"; "Savings" ]
          [ "Sum"; "$180" ]
          [ "January"; "$100" ]
          [ "February"; "$80" ] ]

    let tables = getTables true (HtmlDocument.Parse html)
    tables.Length |> should equal 1

    let table = tables.[0]
    table.Name |> should equal "savings_table"
    table.HasHeaders |> should equal (Some true)
    table.Rows |> should equal expectedRows


[<Test>]
let ``Extracts tables in malformed html``() =
let html = """<html>
Expand Down Expand Up @@ -552,6 +681,25 @@ let ``Can parse nested lists correctly when continues on recurse``() =
|> Seq.toList
result |> should equal [ "12"; "1"; "2"; "3"; "4" ]

[<Test>]
let ``Can parse nested lists correctly when continues closing tags are missing``() =
    // Nested unordered lists in which no <li> is ever closed.
    let html = """
<ul>
<li>
<ul><li>1<li>2</ul>
<li>3
<li>4
</ul>
"""

    let document = HtmlDocument.Parse html
    let listItems = document |> HtmlDocument.descendantsNamed true [ "li" ]
    // inner text of every li: the outer item wrapping the nested list comes first
    let result = listItems |> Seq.map HtmlNode.innerText |> Seq.toList
    result |> should equal [ "12"; "1"; "2"; "3 "; "4 " ]


[<Test>]
let ``Can parse pre blocks``() =
let html = "<pre>\r\n This code should be indented and\r\n have line feeds in it</pre>"
Expand Down

0 comments on commit 4e3c63c

Please sign in to comment.