/
generator.go
135 lines (121 loc) · 3.85 KB
/
generator.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package sitemapgen
import (
"fmt"
"log"
"net/http"
"net/url"
"sync"
"time"
"github.com/eapache/channels"
"github.com/maciekmm/sitemap-generator/config"
"github.com/maciekmm/sitemap-generator/filegen"
"github.com/maciekmm/sitemap-generator/limit"
"github.com/temoto/robotstxt"
)
//Generator drives the sitemap crawl: it owns the shared worker queue, the
//wait group used to track outstanding work, and the crawl configuration.
type Generator struct {
	//WorkerQueue is the unbounded queue of work items consumed by workers.
	WorkerQueue *channels.InfiniteChannel
	//waitGroup tracks outstanding work across validator, workers and filegen.
	waitGroup *sync.WaitGroup
	//config is the crawl configuration supplied to NewGenerator.
	config *config.Config
}
//NewGenerator constructs a new sitemap generator instance.
//Call Start() in order to start the process.
func NewGenerator(config *config.Config) *Generator {
	gen := &Generator{
		WorkerQueue: channels.NewInfiniteChannel(),
		waitGroup:   &sync.WaitGroup{},
		config:      config,
	}
	return gen
}
//Start gives the whole machine a spin: it parses the root URL, wires up the
//robots.txt rules, the sitemap file generator, the validator, the HTTP client
//pool and the workers, then blocks until the crawl completes.
//Returns an error if the configured URL is invalid or the file generator
//cannot be created; robots.txt failures are logged and treated as non-fatal.
//TODO: Divide and conquer :>
func (sg *Generator) Start() error {
	parsed, err := url.Parse(sg.config.URL)
	if err != nil {
		return err
	}
	//Fetch robots.txt only when the config asks for it; a failure is
	//non-fatal and the crawl proceeds with robs == nil.
	var robs *robotstxt.RobotsData
	if sg.config.Parsing.RespectRobots {
		robs, err = GetRobots(parsed)
		if err != nil {
			log.Println(err.Error())
		}
	}
	//Create the sitemap file generator that consumes accepted URLs.
	sitemapgen, err := filegen.New(*sg.config, sg.waitGroup)
	if err != nil {
		log.Println("Generator: " + err.Error())
		return err
	}
	go sitemapgen.Start()
	//Create the validator and seed it with the root URL; the Add(1) accounts
	//for that first in-flight item.
	log.Println("Generator: Creating validator.")
	validator := NewValidator(*sg.config, sg.WorkerQueue, sg.waitGroup, robs, sitemapgen.Input)
	go validator.start()
	sg.waitGroup.Add(1)
	validator.Input <- parsed
	//Build the pool of rate-limited HTTP clients (direct and/or proxied).
	var httpCls []*limit.Client
	if sg.config.Parsing.NoProxyClient {
		//Give the direct client the same timeout as the proxy clients so a
		//hung request cannot stall a worker forever (http.Client defaults to
		//no timeout at all).
		client := &http.Client{Timeout: 5 * time.Second}
		httpCls = append(httpCls, limit.NewClient(client, limit.NewRateLimiter(sg.config.Parsing.RequestsPerSecond, sg.config.Parsing.Burst), sg.config.Parsing.UserAgent))
	}
	for _, proxy := range sg.config.Parsing.Proxies {
		proxyURL, err := url.Parse(proxy.Address)
		if err != nil {
			//Skip the broken entry: previously this fell through and
			//dereferenced the nil proxyURL below, panicking.
			log.Println("Generator: Invalid proxy url: ", proxy.Address)
			continue
		}
		proxyURL.User = url.UserPassword(proxy.Username, proxy.Password)
		client := &http.Client{
			Transport: &http.Transport{Proxy: http.ProxyURL(proxyURL)},
			Timeout:   5 * time.Second,
		}
		httpCls = append(httpCls, limit.NewClient(client, limit.NewRateLimiter(sg.config.Parsing.RequestsPerSecond, sg.config.Parsing.Burst), sg.config.Parsing.UserAgent))
	}
	//Feed the clients into a buffered channel the workers share as a pool.
	log.Println("Generator: Finished creating proxies, total: ", len(httpCls))
	httpClients := make(chan *limit.Client, len(httpCls))
	for _, cli := range httpCls {
		httpClients <- cli
	}
	//Spawn the crawl workers.
	for i := 0; i < sg.config.Parsing.Workers; i++ {
		log.Println("Generator: Creating worker no. ", i)
		worker := NewWorker(sg.WorkerQueue, validator.Input, sg.waitGroup, sitemapgen.Input, httpClients)
		go worker.Start()
	}
	//Wait for the crawl to finish, then tear the pipeline down. Channels are
	//closed here (the sending side) once no more work can be produced.
	sg.waitGroup.Wait()
	log.Println("Generator: All work's done, closing channels.")
	sg.WorkerQueue.Close()
	close(httpClients)
	close(validator.Input)
	close(sitemapgen.Input)
	//Sitemap generator cleanup: closing sitemapgen.Input signals the file
	//generator to flush and (presumably) call Done on the wait group.
	//NOTE(review): Add(1) AFTER the close races with that Done — confirm
	//filegen only calls Done once this Add has happened, or move the Add
	//above the close.
	sg.waitGroup.Add(1)
	sg.waitGroup.Wait()
	return nil
}
//GetRobots fetches and parses robots.txt for the host of the given URL.
//It reuses the URL's scheme (falling back to "http" when empty) so https
//sites are queried over https rather than being forced onto plain http.
//Returns an error on network failure, a non-200 status code, or an
//unparseable robots.txt body.
func GetRobots(u *url.URL) (*robotstxt.RobotsData, error) {
	scheme := u.Scheme
	if scheme == "" {
		scheme = "http"
	}
	//Use a dedicated client with a timeout: http.DefaultClient has none, so
	//a hung robots.txt lookup would block startup indefinitely.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Get(scheme + "://" + u.Host + "/robots.txt")
	if err != nil {
		return nil, fmt.Errorf("Generator: robots.txt lookup yield an error %s", err.Error())
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("Generator: robots.txt returned an invalid http code: %d", resp.StatusCode)
	}
	rob, err := robotstxt.FromResponse(resp)
	if err != nil {
		return nil, fmt.Errorf("Generator: Parsing robots.txt yield an error %s", err)
	}
	return rob, nil
}