forked from richyen/walker
-
Notifications
You must be signed in to change notification settings - Fork 1
/
interfaces.go
169 lines (128 loc) · 5.17 KB
/
interfaces.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
package cassandra
import (
"net/http"
"time"
"github.com/gocql/gocql"
"github.com/iParadigms/walker"
)
// ModelDatastore defines additional methods for querying and modifying domains
// and links in walker, and includes the walker.Datastore interface (which is
// intended only for the bare minimum that fetchers in the walker package
// need). This interface is good for use by the console and other tools that
// need CRUD-like capabilities.
type ModelDatastore interface {
walker.Datastore
// FindDomain returns the DomainInfo for the specified domain.
// NOTE(review): the result for an unknown domain (nil vs. an error) is not
// specified by this interface -- confirm against the implementation.
FindDomain(domain string) (*DomainInfo, error)
// ListDomains returns a slice of DomainInfo structs populated according to
// the specified DQ (domain query).
ListDomains(query DQ) ([]*DomainInfo, error)
// UpdateDomain updates the given domain with fields from `info`. Which
// fields of the DomainInfo argument are persisted to the store is selected
// by the DomainInfoUpdateConfig argument. For example, to persist the
// Priority field of the info struct, one would pass
// DomainInfoUpdateConfig{Priority: true} as the cfg argument to
// UpdateDomain.
UpdateDomain(domain string, info *DomainInfo, cfg DomainInfoUpdateConfig) error
// FindLink returns a LinkInfo matching the given URL. The arguments are:
// (a) u, the URL to find; (b) collectContent, which, if true, indicates
// that the Body and Headers fields of the LinkInfo will be populated.
FindLink(u *walker.URL, collectContent bool) (*LinkInfo, error)
// ListLinks fetches links for the given domain according to the given LQ
// (link query).
ListLinks(domain string, query LQ) ([]*LinkInfo, error)
// ListLinkHistorical gets the crawl history of a specific link.
ListLinkHistorical(u *walker.URL) ([]*LinkInfo, error)
// InsertLink inserts the given link into the database, adding its domain
// if it does not exist. If excludeDomainReason is not empty, the domain
// will be excluded from crawling and marked with the given reason.
InsertLink(link string, excludeDomainReason string) error
// InsertLinks does the same as InsertLink for many links at once, and so
// may produce many errors. It will insert as many links as it can (it
// won't stop once it hits a bad link) and only return errors for
// problematic links or domains.
InsertLinks(links []string, excludeDomainReason string) []error
}
// LQ is a link query struct used for getting links from cassandra.
// Zero-values mean use default behavior.
type LQ struct {
// When listing links, the seed should be the URL preceding the queried
// set. When paginating, use the last URL of the previous set as the seed.
// Default: select from the beginning
Seed *walker.URL
// Limit the returned results, used for pagination.
// Default: no limit
Limit int
// FilterRegex presumably restricts the returned links to those matching
// this regular expression, with the empty string meaning no filtering.
// TODO(review): confirm the regex syntax and what it is matched against
// with the implementation; neither is visible in this file.
FilterRegex string
}
// LinkInfo defines a row from the link or segment table. It is the result
// type of the FindLink, ListLinks and ListLinkHistorical methods of
// ModelDatastore.
type LinkInfo struct {
// URL of the link
URL *walker.URL
// Status of the fetch
Status int
// CrawlTime is when this link was crawled
CrawlTime time.Time
// Error holds any error reported when attempting to fetch the URL
Error string
// RobotsExcluded reports whether this link was excluded by robots
RobotsExcluded bool
// RedirectedTo is the URL this link redirected to, if it was a redirect
RedirectedTo string
// GetNow reports whether this link was flagged for immediate fetching
GetNow bool
// Mime type (or Content-Type) of the returned data
Mime string
// FnvFingerprint is the FNV hash of the contents
FnvFingerprint int64
// FnvTextFingerprint is the FNV hash of the text extracted from the page
FnvTextFingerprint int64
// Body of request (if configured to be stored). FindLink fills this in
// when it is called with collectContent set to true.
Body string
// Headers of request (if configured to be stored). FindLink fills this in
// when it is called with collectContent set to true.
Headers http.Header
}
// DQ is a domain query struct used for getting domains from cassandra (see
// ModelDatastore.ListDomains). Zero-values mean use default behavior.
type DQ struct {
// When listing domains, the seed should be the domain preceding the
// queried set. When paginating, use the last domain of the previous set as
// the seed.
// Default: select from the beginning
Seed string
// Limit the returned results, used for pagination.
// Default: no limit
Limit int
// Working, when set to true, restricts the results to dispatched domains
// only.
// Default: get all domains
Working bool
}
// DomainInfo defines a row from the domain_info table.
type DomainInfo struct {
// Domain name, in TLD+1 form
Domain string
// Excluded reports whether this domain is excluded from the crawl
Excluded bool
// ExcludeReason says why this domain got excluded; empty if not excluded
ExcludeReason string
// ClaimTime is when this domain last got queued to be crawled;
// ClaimTime.IsZero() is true if it has not been crawled
ClaimTime time.Time
// ClaimToken is the UUID of the crawler that last crawled the domain
ClaimToken gocql.UUID
// Number of (unique) links found in this domain
NumberLinksTotal int
// Number of (unique) links queued to be processed for this domain
NumberLinksQueued int
// Number of links not yet crawled
NumberLinksUncrawled int
// Priority of this domain
Priority int
}
// DomainInfoUpdateConfig is used to configure the method
// ModelDatastore.UpdateDomain: each field set to true selects a part of the
// DomainInfo argument to be persisted.
type DomainInfoUpdateConfig struct {
// Setting Exclude to true indicates that the ExcludeReason field of the
// DomainInfo passed to UpdateDomain should be persisted to the database.
// NOTE(review): whether the Excluded flag is persisted alongside
// ExcludeReason is not stated here -- confirm against the implementation.
Exclude bool
// Setting Priority to true indicates that the Priority field of the
// DomainInfo passed to UpdateDomain should be persisted to the database.
Priority bool
}