forked from CorentinB/warc
/
write.go
177 lines (149 loc) · 4.07 KB
/
write.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
package warc
import (
"bufio"
"bytes"
"compress/gzip"
"fmt"
"io"
"io/ioutil"
"os"
"strconv"
"strings"
"time"
"github.com/klauspost/compress/zstd"
uuid "github.com/satori/go.uuid"
)
// Writer writes WARC records to WARC files.
// All record bytes go through FileWriter, which WriteRecord flushes after
// every record.
type Writer struct {
// FileName is the name of the WARC file. WriteInfoRecord strips a trailing
// ".open" suffix from it to build the WARC-Filename header.
FileName string
// Compression names the compression scheme in use.
// NOTE(review): the accepted values are not visible in this file — confirm
// against the code that constructs the Writer.
Compression string
// GZIPWriter and ZSTDWriter are the compressing writers.
// NOTE(review): presumably FileWriter is layered on top of one of them when
// compression is enabled — confirm against the constructor.
GZIPWriter *gzip.Writer
ZSTDWriter *zstd.Encoder
// FileWriter is the buffered writer that records are written to.
FileWriter *bufio.Writer
}
// RecordBatch is a structure that contains a bunch of
// records to be written at the same time, and a common
// capture timestamp.
type RecordBatch struct {
// Records are the records that make up the batch.
Records []*Record
// Done is a signalling channel for the batch.
// NOTE(review): presumably the writer sends on it once the whole batch has
// been written — confirm against the code that consumes batches.
Done chan bool
// CaptureTime is the capture timestamp shared by the records of the batch.
// NOTE(review): the expected format is not visible here; WriteRecord itself
// formats WARC-Date as RFC 3339 — confirm they agree.
CaptureTime string
}
// Record represents a WARC record.
// The payload comes either from Content (in memory / streamed reader) or from
// a file on disk designated by PayloadPath.
type Record struct {
// Header holds the WARC header fields of the record.
Header Header
// Content is the record payload. WriteRecord reads it in full, and only
// when PayloadPath is empty.
Content io.Reader
// PayloadPath, when non-empty, is the path of a file holding the payload;
// WriteRecord then copies the payload from that file instead of Content.
PayloadPath string
}
// WriteRecord writes a record to the underlying WARC file and flushes the
// buffered writer.
//
// A record consists of a version line, the record headers, a blank line, the
// content block and two trailing CRLFs:
//
//	WARC/1.0 CRLF
//	Header-Key: Header-Value CRLF
//	CRLF
//	Content
//	CRLF
//	CRLF
//
// WARC-Date (now, UTC, RFC 3339), WARC-Type ("resource") and WARC-Record-ID
// are filled in only when the header does not already carry them.
// Content-Length is always overwritten with the payload size. The payload is
// copied from the file at r.PayloadPath when that field is set; otherwise
// r.Content is read fully into memory (a nil Content is treated as an empty
// payload). Headers are emitted in map iteration order.
//
// The returned recordID is the UUID generated by this call. It is not the
// value of a WARC-Record-ID header that was already present on r, and it is
// returned alongside a non-nil error as well. The payload is resolved before
// anything is written, so a missing file or unreadable Content does not leave
// a partial record in the output.
func (w *Writer) WriteRecord(r *Record) (recordID string, err error) {
	recordID = uuid.NewV4().String()

	// Fill in the mandatory headers the caller did not provide.
	if r.Header.Get("WARC-Date") == "" {
		r.Header.Set("WARC-Date", time.Now().UTC().Format(time.RFC3339))
	}
	if r.Header.Get("WARC-Type") == "" {
		r.Header.Set("WARC-Type", "resource")
	}
	if r.Header.Get("WARC-Record-ID") == "" {
		r.Header.Set("WARC-Record-ID", "<urn:uuid:"+recordID+">")
	}

	// Resolve the payload and the headers derived from it before emitting
	// any byte of the record.
	var (
		payloadFile *os.File
		data        []byte
	)
	if r.PayloadPath != "" {
		// The payload lives on disk.
		payloadFile, err = os.Open(r.PayloadPath)
		if err != nil {
			return recordID, err
		}
		defer payloadFile.Close()

		var fileStats os.FileInfo
		if fileStats, err = payloadFile.Stat(); err != nil {
			return recordID, err
		}
		// FormatInt keeps the full int64 size; converting through int would
		// truncate large files on 32-bit platforms.
		r.Header.Set("Content-Length", strconv.FormatInt(fileStats.Size(), 10))

		// The block digest is best-effort: when it cannot be computed the
		// record is still written, just without updating the digest header.
		if digest, digestErr := GetSHA1FromFile(r.PayloadPath); digestErr == nil {
			r.Header.Set("WARC-Block-Digest", "sha1:"+digest)
		}
	} else {
		// The payload comes from r.Content; a nil reader means "no payload".
		if r.Content != nil {
			if data, err = ioutil.ReadAll(r.Content); err != nil {
				return recordID, err
			}
		}
		r.Header.Set("Content-Length", strconv.Itoa(len(data)))
		r.Header.Set("WARC-Block-Digest", "sha1:"+GetSHA1(data))
	}

	// Version line.
	if _, err = io.WriteString(w.FileWriter, "WARC/1.0\r\n"); err != nil {
		return recordID, err
	}

	// Header fields, followed by the blank line that ends the header block.
	for key, value := range r.Header {
		if _, err = io.WriteString(w.FileWriter, strings.Title(key)+": "+value+"\r\n"); err != nil {
			return recordID, err
		}
	}
	if _, err = io.WriteString(w.FileWriter, "\r\n"); err != nil {
		return recordID, err
	}

	// Content block: stream from disk, or write the in-memory bytes directly
	// (no intermediate string copy).
	if payloadFile != nil {
		_, err = io.Copy(w.FileWriter, payloadFile)
	} else {
		_, err = w.FileWriter.Write(data)
	}
	if err != nil {
		return recordID, err
	}

	// Record terminator.
	if _, err = io.WriteString(w.FileWriter, "\r\n\r\n"); err != nil {
		return recordID, err
	}

	// Buffered writes only reach the underlying writer on Flush, so its error
	// is the one that tells whether the record was actually written out.
	if err = w.FileWriter.Flush(); err != nil {
		return recordID, err
	}
	return recordID, nil
}
// WriteInfoRecord writes a warcinfo record to the WARC file.
//
// The record carries WARC-Date (now, UTC, RFC 3339), WARC-Filename (the
// writer's FileName without a trailing ".open"), WARC-Type "warcinfo" and
// Content-Type "application/warc-fields". Each payload entry becomes one
// "key: value" line terminated by CRLF; entries are emitted in map iteration
// order, which Go leaves unspecified.
//
// It returns the record ID reported by WriteRecord together with any error
// from writing or flushing the record.
func (w *Writer) WriteInfoRecord(payload map[string]string) (recordID string, err error) {
	infoRecord := NewRecord()

	// Headers specific to a warcinfo record.
	infoRecord.Header.Set("WARC-Date", time.Now().UTC().Format(time.RFC3339))
	infoRecord.Header.Set("WARC-Filename", strings.TrimSuffix(w.FileName, ".open"))
	infoRecord.Header.Set("WARC-Type", "warcinfo")
	infoRecord.Header.Set("Content-Type", "application/warc-fields")

	// Serialise the payload as warc-fields lines straight into the buffer.
	warcInfoContent := new(bytes.Buffer)
	for k, v := range payload {
		fmt.Fprintf(warcInfoContent, "%s: %s\r\n", k, v)
	}
	infoRecord.Content = warcInfoContent

	// Digest of the serialised payload block.
	infoRecord.Header.Set("WARC-Block-Digest", "sha1:"+GetSHA1(warcInfoContent.Bytes()))

	recordID, err = w.WriteRecord(infoRecord)
	if err != nil {
		return recordID, err
	}

	// Report flush failures to the caller: a buffered write error only
	// becomes visible when the buffer is flushed.
	if err = w.FileWriter.Flush(); err != nil {
		return recordID, err
	}
	return recordID, nil
}