-
Notifications
You must be signed in to change notification settings - Fork 2
/
parser.go
179 lines (154 loc) · 4.74 KB
/
parser.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
package main
import (
"crypto/tls"
"fmt"
"io"
"io/ioutil"
"log"
"math/rand"
"net/http"
"net/url"
"regexp"
"strings"
)
// urlregex matches URL-like tokens in free text. A match starts either with a
// 3-9 letter scheme followed by ":" (and optionally "//"), or with "www" plus
// any one character, or with a "user@" style prefix; a host made of letters,
// digits, dots and dashes follows. An optional port, path, query string and
// fragment may come after the host.
var urlregex = regexp.MustCompile(`((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((:\d{1,5})?(?:\/[\+~%\/.#@\w-_]*)?\??(?:[-\+=&;%#@.\w_]*)#?(?:[\w]*))?)`) // 19.5.2016 by moob

// Earlier revisions of the pattern, kept for reference:
//urlregex *regexp.Regexp = regexp.MustCompile(`((([A-Za-z]{3,9}:(?:\/\/)?)+(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;!:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~!#%\/.\w-_]*)?\??(?:[-\+!=&;%@.\w_]*)[#:]?(?:[\w]*))?)`)
// urlregex = regexp.MustCompile(`((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)`) // 19.5.2016 by aidskrebs bot
// Attempt at crafting a readable URL regex:
// urlregex *regexp.Regexp = regexp.MustCompile(`[a-zA-Z]{3,9}:\/\/((.*)\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,9}(:[0-9]{1,5})?(/(.*)?)?`)
// sprueche holds the fmt format strings used to tease a user who posts a link
// that is already known. StartParser formats them with two string arguments:
// first the timestamp of the original post, then the name of the original
// poster. An entry that mentions the poster before the timestamp therefore
// has to use explicit argument indexes (%[2]s / %[1]s).
var sprueche = []string{
	"Obacht! %[2]s hat es am %[1]s schon gepostet.",
	"Aufmerksamkeitsspanne wie ne Fruchtfliege (%s von %s)",
	"AAAALT! (%s von %s)",
	"Dududu! (%s von %s)",
	"Langsam wie ne Omma (%s von %s)",
	"Kennen wir schon. (%s von %s)",
	"Ein Zitat vom %s von %s",
	"Frei nach Goethe: AAALT (%s von %s)",
	"Es war einmal vor nicht allzu geraumer Zeit (%s) da gab es einen Helden (%s), der sich zur Aufgabe gemacht hat Informationen unters Volk zu bringen.",
	"Ein Plagiat!!! (%s von %s)",
}
// StartParser receives posts from postReceiver in an endless loop and records
// every link found in a post's message.
//
// Posts whose user is "g0bot" are ignored. For every other post each extracted
// link is normalized (a missing scheme defaults to "http://") and looked up
// with checkDuplicate. For a duplicate, a random line from getSpruch is sent
// to the configured IRC channel, unless the message already contains the name
// of the original poster. A new link is probed with a HEAD request to learn
// its Content-Length and is then handed to addLink.
//
// The loop never exits, so the error result is never actually returned.
func StartParser() error {
	// A single client for the whole loop lets the transport reuse connections
	// instead of building a new connection pool for every link.
	// Certificate verification is skipped; the response is only used for its
	// ContentLength.
	// NOTE(review): the client has no timeout, so an unresponsive host stalls
	// the parser loop - consider setting http.Client.Timeout.
	client := &http.Client{
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		},
	}

	for {
		post := <-postReceiver
		// Never record links posted by the bot itself.
		if post.User == "g0bot" {
			continue
		}

		for _, l := range extractLink(post.Message) {
			x := &Link{User: post.User, Url: l, Post: post.Message, Timestamp: post.Timestamp}

			u, err := url.Parse(x.Url)
			if err != nil {
				log.Println("unable to parse URL", x.Url)
				continue
			}
			if u.Scheme == "" {
				// Assume a sane default. Without a scheme url.Parse treats the
				// whole string as a path and leaves Host empty, so parse again
				// after adding the prefix to obtain the domain.
				x.Url = "http://" + x.Url
				if u, err = url.Parse(x.Url); err != nil {
					log.Println("unable to parse URL", x.Url)
					continue
				}
			}
			x.Domain = u.Host

			// Known links are not stored again; the poster only gets teased.
			result, dup := checkDuplicate(x)
			if dup {
				if !strings.Contains(post.Message, result.User) {
					ircMessage(*cfgChannel, fmt.Sprintf(getSpruch(), result.Timestamp.Format("02.01.2006 15:04"), result.User))
				}
				continue
			}

			resp, err := client.Head(x.Url)
			if err != nil {
				log.Println("unable to get HEAD from " + x.Url + ": " + err.Error())
				continue
			}
			x.Size = resp.ContentLength
			// Drain and close the body so the connection can be reused; a HEAD
			// response should carry no body, so the copy result is ignored.
			io.Copy(ioutil.Discard, resp.Body)
			resp.Body.Close()

			if addLink(x) {
				log.Printf("%s: %s\n", post.User, x.Url)
			} else {
				log.Printf("Could not insert link (%s) into the database.\n", x.Url)
			}
		}
	}
}
// trimSuffix returns s with one trailing occurrence of suffix removed. When s
// does not end in suffix it is returned unchanged.
func trimSuffix(s, suffix string) string {
	return strings.TrimSuffix(s, suffix)
}
// extractLink returns every substring of data matched by urlregex, each with
// one trailing "/" removed. The result is nil when nothing matches.
func extractLink(data string) []string {
	var found []string
	// FindAllString yields nil when there is no match, so the loop body simply
	// does not run in that case.
	for _, match := range urlregex.FindAllString(data, -1) {
		found = append(found, trimSuffix(match, "/"))
	}
	return found
}
// addLink inserts link (user, url, time, post and domain) into the links table
// and stores the id of the new row in link.Id.
//
// It returns false, after logging the error, when preparing or executing the
// insert fails or when the new id cannot be read. Otherwise it returns true.
// Links whose Size is below 10000000 bytes are additionally sent to
// crawlReceiver; larger ones are only logged.
func addLink(link *Link) bool {
	// "Crawl" (i.e. download) only what is smaller than 10 MB.
	const maxCrawlSize = 10000000

	db := Db{}
	db.Open()
	defer db.Close()

	if err := db.Prepare("Insert into links(user, url, time, post, domain) values($1, $2, $3, $4, $5)"); err != nil {
		log.Println("addLink: " + err.Error())
		return false
	}

	if err := db.ExecuteStmt(link.User, link.Url, link.Timestamp, link.Post, link.Domain); err != nil {
		log.Println(err.Error())
		return false
	}

	id, err := db.Result.LastInsertId()
	link.Id = id
	if err != nil {
		log.Println(err.Error())
		return false
	}

	if link.Size >= maxCrawlSize {
		log.Printf("Size exceeds limit, %d Bytes \n", link.Size)
		return true
	}

	crawlReceiver <- link
	return true
}
// checkDuplicate looks up link.Url in the links table.
//
// It returns the stored entry (id, user, url and time) together with true when
// a row with the same URL exists. When no row matches, or when preparing,
// running or scanning the query fails, it returns a zero Link and false; such
// failures are logged.
func checkDuplicate(link *Link) (Link, bool) {
	var result Link

	db := Db{}
	db.Open()
	defer db.Close()

	if err := db.Prepare("Select id, user, url, time from links where url = $1 limit 0, 1"); err != nil {
		log.Println("checkDuplicate: " + err.Error())
		return result, false
	}

	if err := db.QueryStmt(link.Url); err != nil {
		log.Println("checkDuplicate: " + err.Error())
		return result, false
	}
	defer db.ResultRows.Close()

	// Track whether a row was read explicitly instead of inferring it from an
	// empty user name, and scan into a temporary so that a failed Scan never
	// leaves a half-filled result behind.
	found := false
	for db.ResultRows.Next() {
		var row Link
		if err := db.ResultRows.Scan(&row.Id, &row.User, &row.Url, &row.Timestamp); err != nil {
			log.Println("checkDuplicate: " + err.Error())
			continue
		}
		result, found = row, true
	}

	// found is true if the link is already in the database, false otherwise.
	return result, found
}
// getSpruch returns one entry of sprueche, chosen with math/rand.
func getSpruch() string {
	idx := rand.Intn(len(sprueche))
	return sprueche[idx]
}