/
indeed_scraper.go
65 lines (59 loc) · 1.44 KB
/
indeed_scraper.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
package main
import (
"errors"
"fmt"
"github.com/ECLabs/Eaton-Feeder/mapping"
"github.com/PuerkitoBio/goquery"
"net/url"
)
// IndeedScraper retrieves the full job summary text for job results whose
// URL points at www.indeed.com. It carries no state, so the zero value is
// ready to use.
type IndeedScraper struct{}
// doGetFullJobSumary downloads the page at jobResult.Url and stores the HTML
// of its "span#job_summary" element in FullJobSummary.
//
// jobResult is received by value, so the returned pointer refers to a copy
// and the caller's value is left untouched.
//
// An error is returned when the URL cannot be parsed, when its host is not
// exactly "www.indeed.com", when the page cannot be loaded, when extracting
// the HTML fails, or when no non-empty summary is found.
func (i *IndeedScraper) doGetFullJobSumary(jobResult mapping.JobResult) (*mapping.JobResult, error) {
	parsed, err := url.Parse(jobResult.Url)
	if err != nil {
		return nil, err
	}
	// Refuse to fetch anything that is not an Indeed job page.
	if host := parsed.Host; host != "www.indeed.com" {
		return nil, errors.New(fmt.Sprintf("unknown job summary host: %s", host))
	}

	page, err := goquery.NewDocument(jobResult.Url)
	if err != nil {
		return nil, err
	}

	var (
		summary string
		htmlErr error
	)
	// Every matching element overwrites both variables, so the values left
	// after the walk are those produced by the last match.
	page.Find("span#job_summary").Each(func(_ int, sel *goquery.Selection) {
		summary, htmlErr = sel.Html()
	})

	switch {
	case htmlErr != nil:
		return nil, htmlErr
	case summary == "":
		return nil, errors.New(fmt.Sprintf("couldn't find full summary for jobKey: %s", jobResult.JobKey))
	}

	jobResult.FullJobSummary = summary
	return &jobResult, nil
}
// GetFullJobSummary starts a goroutine that reads job results from input,
// fills in each one's full job summary via doGetFullJobSumary, and forwards
// it on the returned result channel. A job whose summary cannot be retrieved
// is dropped and its error is sent on the returned error channel instead.
//
// A job result for which IsLast reports true is forwarded unchanged and ends
// processing. Both returned channels are closed when the goroutine exits,
// either after such a result or once input is closed. Both channels are
// unbuffered, so the goroutine blocks until each value it sends is received.
func (i *IndeedScraper) GetFullJobSummary(input <-chan mapping.JobResult) (<-chan error, <-chan mapping.JobResult) {
	errs := make(chan error)
	results := make(chan mapping.JobResult)

	go func() {
		// Deferred calls run last-in-first-out: errs is closed first,
		// then results.
		defer close(results)
		defer close(errs)

		for job := range input {
			if job.IsLast() {
				// Pass the terminal marker through untouched and stop.
				results <- job
				return
			}
			if enriched, err := i.doGetFullJobSumary(job); err != nil {
				errs <- err
			} else {
				results <- *enriched
			}
		}
	}()

	return errs, results
}