crawler.go
package services

import (
	"log"
	"net/url"
	"sync"
	"time"

	"github.com/cleitonmarx/webcrawler/datatypes"
)
// UrlFetcher is an interface for URL fetching. ParseUrl fetches the given
// URL and returns the links and the asset URLs found on the page, or an error.
type UrlFetcher interface {
	ParseUrl(newUrl string) ([]string, []string, error)
}
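
// The snippet below is not part of the original file: it is a minimal,
// hypothetical UrlFetcher sketch that serves links from an in-memory map,
// useful for exercising the Crawler without network access. The type and
// field names are illustrative assumptions.
type staticFetcher struct {
	pages map[string][]string // page URL -> links found on that page
}

func (f staticFetcher) ParseUrl(newUrl string) ([]string, []string, error) {
	// Unknown pages simply yield no links; a real fetcher would perform an
	// HTTP GET here and return asset URLs (scripts, images, stylesheets)
	// as the second slice, plus any fetch error.
	return f.pages[newUrl], nil, nil
}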
// Crawler represents a recursive URL fetcher.
type Crawler struct {
	mainURL  *url.URL               // root URL; only links on the same host are followed
	LinkList *datatypes.SitemapList // accumulates every visited link
	fetcher  UrlFetcher
	depth    int // maximum recursion depth
	logger   *log.Logger
	timeout  time.Time // deadline after which no new fetches start
}
// GenerateSitemap recursively fetches a URL and generates a sitemap data structure.
func (c *Crawler) GenerateSitemap(urlToFetch *url.URL, timeout time.Duration) datatypes.Sitemap {
	c.mainURL = urlToFetch
	wg := &sync.WaitGroup{}
	wg.Add(1)
	c.timeout = time.Now().Add(timeout)
	c.fetchLink(urlToFetch.String(), c.depth, wg)
	wg.Wait() // block until every spawned goroutine has finished
	return c.LinkList.CreateSitemap()
}
// fetchLink recursively and concurrently fetches a URL and the links it contains.
func (c *Crawler) fetchLink(linkToFetch string, depth int, waitGroup *sync.WaitGroup) {
	defer func() {
		waitGroup.Done()
		c.logger.Printf("Done | Depth: %d | link: %s\n", depth, linkToFetch)
	}()

	// Stop descending once the crawl deadline has passed.
	if time.Now().After(c.timeout) {
		c.logger.Printf("Timeout | Depth: %d | link: %s\n", depth, linkToFetch)
		return
	}

	c.logger.Printf("Fetching | Depth: %d | link: %s\n", depth, linkToFetch)
	links, dependencies, err := c.fetcher.ParseUrl(linkToFetch)

	// Record the visit even when the fetch fails, so the link is not retried.
	newItem := &datatypes.SitemapItem{
		Location: linkToFetch,
		Updated:  time.Now(),
	}
	c.LinkList.Add(newItem)
	if err != nil {
		newItem.ErrorInFetch = true
		newItem.ErrorMessage = err.Error()
		return
	}
	newItem.Links = links
	newItem.Assets = dependencies

	if depth <= 0 {
		return
	}
	for _, link := range links {
		if time.Now().After(c.timeout) {
			c.logger.Printf("Timeout | Depth: %d | link: %s\n", depth, linkToFetch)
			return
		}
		// Follow only unvisited links on the same host.
		if foundLink := c.LinkList.Search(link); foundLink == nil && c.allowedLink(link) {
			waitGroup.Add(1)
			go c.fetchLink(link, depth-1, waitGroup)
			time.Sleep(100 * time.Millisecond) // throttle goroutine creation
		} else {
			c.logger.Printf("Skipped | Depth: %d - %s\n", depth, link)
		}
	}
}
// allowedLink allows only links on the same host as the main URL,
// skipping subdomains and different hosts.
func (c *Crawler) allowedLink(link string) bool {
	linkURL, err := url.Parse(link)
	return err == nil && linkURL.Host == c.mainURL.Host
}
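
// For example (an illustration, not in the original file): with mainURL set
// to "https://example.com", the link "https://example.com/about" is allowed,
// while "https://blog.example.com/" and "https://other.com/" are skipped
// because their Host differs from the main URL's.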
// NewCrawler creates a new Crawler instance.
func NewCrawler(urlFetcher UrlFetcher, depth int, logger *log.Logger) *Crawler {
	sitemapList := datatypes.NewSitemapList()
	return &Crawler{
		LinkList: sitemapList,
		fetcher:  urlFetcher,
		depth:    depth,
		logger:   logger,
	}
}
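
For context, here is a hedged usage sketch, not part of the repository: it wires a stub UrlFetcher into NewCrawler and runs GenerateSitemap with an assumed depth of 2 and a 30-second timeout. The services import path, the no-op fetcher, and the chosen values are assumptions for illustration.

package main

import (
	"fmt"
	"log"
	"net/url"
	"os"
	"time"

	"github.com/cleitonmarx/webcrawler/services" // assumed import path
)

// nopFetcher is a stand-in; swap in a real HTTP-backed UrlFetcher.
type nopFetcher struct{}

func (nopFetcher) ParseUrl(newUrl string) ([]string, []string, error) {
	return nil, nil, nil // fetches nothing: no links, no assets, no error
}

func main() {
	logger := log.New(os.Stdout, "crawler ", log.LstdFlags)
	crawler := services.NewCrawler(nopFetcher{}, 2, logger) // depth 2 is an example value

	target, err := url.Parse("https://example.com")
	if err != nil {
		logger.Fatal(err)
	}

	// Crawl with a 30-second overall budget and print the resulting sitemap.
	sitemap := crawler.GenerateSitemap(target, 30*time.Second)
	fmt.Printf("%+v\n", sitemap)
}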