diff --git a/cmd/distr/wscan.go b/cmd/distr/wscan.go index a782b2a..1900f5a 100644 --- a/cmd/distr/wscan.go +++ b/cmd/distr/wscan.go @@ -3,13 +3,11 @@ package main import ( "flag" "fmt" - "github.com/PuerkitoBio/goquery" - "github.com/digital-technology-agency/web-scan/pkg/models" "github.com/digital-technology-agency/web-scan/pkg/services/generators" "github.com/digital-technology-agency/web-scan/pkg/services/json" + "github.com/digital-technology-agency/web-scan/pkg/services/page" "github.com/digital-technology-agency/web-scan/pkg/utils" "github.com/zenthangplus/goccm" - "net/http" "runtime" ) @@ -17,19 +15,10 @@ var ( coreCount = flag.String(`core_count`, "1", `Example 1`) alphabet = flag.String(`alphabet`, "", `Example abcdefg`) urlLen = flag.String(`len`, "", `Example 2`) - concurrencyCount = flag.String(`concurrency`, "10", `Example 10`) + concurrencyCount = flag.String(`concurrency`, "5", `Example 5`) protocols = []string{"http", "https"} ) -func genWritersProtocols(names []string) map[string]*json.EachRowWriter { - result := map[string]*json.EachRowWriter{} - for _, name := range names { - writer, _ := json.NewEachRowWriter(fmt.Sprintf("%s.txt", name)) - result[name] = writer - } - return result -} - func main() { flag.Parse() /*check flags*/ @@ -45,7 +34,7 @@ func main() { Alphabet: *alphabet, Len: utils.Int(*urlLen), } - protocolWriters := genWritersProtocols(protocols) + protocolWriters := json.NewEachRowWriters(protocols) for domenName := range gen.Gen() { cuncurency.Wait() total += 1 @@ -53,30 +42,17 @@ func main() { go func(protokol, domen string, w *json.EachRowWriter) { defer cuncurency.Done() url := fmt.Sprintf("%s://%s.ru", protokol, domen) - res, err := http.Get(url) - if err != nil { - fmt.Printf("Err:[%s]\n", err.Error()) - return + pageService := page.PageService{ + Url: url, } - defer res.Body.Close() - if res.StatusCode != 200 { - fmt.Printf("Status code [%d] error [%s]", res.StatusCode, res.Status) + item, err := pageService.ReadPage() + if 
err != nil { return } - doc, err := goquery.NewDocumentFromReader(res.Body) - if err != nil { - fmt.Printf("Err:[%s]\n", err.Error()) + if item == nil { + fmt.Printf("Page is nil\n") return } - item := models.Page{} - doc.Find("title").Each(func(i int, s *goquery.Selection) { - item.Title = s.Text() - }) - doc.Find("meta").Each(func(i int, s *goquery.Selection) { - if s.AttrOr("name", "") == "description" { - item.Description = s.AttrOr("content", "") - } - }) err = w.WriteLine(item) if err != nil { fmt.Printf("Write line err:[%s]\n", err.Error()) @@ -88,7 +64,4 @@ func main() { } cuncurency.WaitAllDone() println(fmt.Sprintf("Total size:[%d] Result:[%d]", total, domenNames)) - /* for key, value := range list { - fmt.Printf("Domen:[%s] Title:[%s] Description:[%s]\n", key, value.Title, value.Description) - }*/ } diff --git a/pkg/models/page.go b/pkg/models/page.go index bb16140..8ffeef3 100644 --- a/pkg/models/page.go +++ b/pkg/models/page.go @@ -2,6 +2,8 @@ package models /*Page type of page*/ type Page struct { - Title string - Description string + Title string `json:"title"` + Description string `json:"description"` + Url string `json:"url"` + Robots string `json:"robots"` } diff --git a/pkg/services/json/file-each-row-writer.go b/pkg/services/json/file-each-row-writer.go index 5b7b302..e273538 100644 --- a/pkg/services/json/file-each-row-writer.go +++ b/pkg/services/json/file-each-row-writer.go @@ -11,6 +11,16 @@ type EachRowWriter struct { file *os.File } +// NewEachRowWriters create new writers. +func NewEachRowWriters(names []string) map[string]*EachRowWriter { + result := map[string]*EachRowWriter{} + for _, name := range names { + writer, _ := NewEachRowWriter(fmt.Sprintf("%s.txt", name)) + result[name] = writer + } + return result +} + // NewEachRowWriter new writer. 
func NewEachRowWriter(path string) (*EachRowWriter, error) { create, err := os.Create(path)
diff --git a/pkg/services/page/page-service.go b/pkg/services/page/page-service.go
new file mode 100644
index 0000000..6831ed8
--- /dev/null
+++ b/pkg/services/page/page-service.go
@@ -0,0 +1,83 @@
+package page
+
+import (
+	"fmt"
+	"github.com/PuerkitoBio/goquery"
+	"github.com/digital-technology-agency/web-scan/pkg/models"
+	"io/ioutil"
+	"net/http"
+	"time"
+)
+
+// requestTimeout bounds each HTTP request so that an unresponsive host cannot
+// block the calling goroutine indefinitely.
+const requestTimeout = 30 * time.Second
+
+// httpClient is shared by every request ReadPage makes; unlike
+// http.DefaultClient it enforces a timeout.
+var httpClient = &http.Client{Timeout: requestTimeout}
+
+// PageService reads a single page identified by Url.
+type PageService struct {
+	// Url is the address of the page to read.
+	Url string
+}
+
+// ReadPage downloads s.Url and s.Url + "/robots.txt" and returns the page's
+// title, meta description and raw robots.txt content.
+//
+// Every failure (transport error, non-200 status, unparsable HTML, unreadable
+// body) is printed to stdout and returned as a non-nil error together with a
+// nil page; on success the error is nil and the page is non-nil.
+func (s PageService) ReadPage() (*models.Page, error) {
+	url := s.Url
+	item := models.Page{
+		Url: url,
+	}
+	res, err := httpClient.Get(url)
+	if err != nil {
+		fmt.Printf("Err:[%s]\n", err.Error())
+		return nil, err
+	}
+	defer res.Body.Close()
+	if res.StatusCode != http.StatusOK {
+		fmt.Printf("Status code [%d] error [%s]\n", res.StatusCode, res.Status)
+		// err is nil here, so build a real error instead of returning (nil, nil).
+		return nil, fmt.Errorf("read page %s: unexpected status %s", url, res.Status)
+	}
+	doc, err := goquery.NewDocumentFromReader(res.Body)
+	if err != nil {
+		fmt.Printf("Err:[%s]\n", err.Error())
+		return nil, err
+	}
+	// When several matching elements exist, the last one visited wins.
+	doc.Find("title").Each(func(i int, sel *goquery.Selection) {
+		item.Title = sel.Text()
+	})
+	doc.Find("meta").Each(func(i int, sel *goquery.Selection) {
+		if sel.AttrOr("name", "") == "description" {
+			item.Description = sel.AttrOr("content", "")
+		}
+	})
+	urlRobotTxt := fmt.Sprintf("%s/robots.txt", url)
+	resRobots, err := httpClient.Get(urlRobotTxt)
+	if err != nil {
+		fmt.Printf("Err:[%s]\n", err.Error())
+		return nil, err
+	}
+	// Close the robots.txt body too so its connection is released.
+	defer resRobots.Body.Close()
+	if resRobots.StatusCode != http.StatusOK {
+		fmt.Printf("Robots txt. Status:[%d]\n", resRobots.StatusCode)
+		// NOTE(review): a host without robots.txt is rejected entirely - confirm
+		// that this is intended rather than returning the page without Robots.
+		return nil, fmt.Errorf("read %s: unexpected status %s", urlRobotTxt, resRobots.Status)
+	}
+	allBytesRobotsTxt, err := ioutil.ReadAll(resRobots.Body)
+	if err != nil {
+		fmt.Printf("Err:[%s]\n", err.Error())
+		return nil, err
+	}
+	item.Robots = string(allBytesRobotsTxt)
+	return &item, nil
+}