File renamed without changes
28 changes: 23 additions & 5 deletions handlers/scrape.go
@@ -4,6 +4,7 @@ import (
"crypto/tls"
"fmt"
"net/http"
"net/url"
"strconv"
"strings"

@@ -14,8 +15,10 @@ import (
"scraper/utils"

"github.com/gin-gonic/gin"
"golang.org/x/net/publicsuffix"
)

// This handles the initial scraping request received from the client.
func ScrapeHandler(context *gin.Context) {
baseURL := context.Query("url")
client := &http.Client{
@@ -26,58 +29,73 @@ func ScrapeHandler(context *gin.Context) {

if baseURL == "" {
logger.Debug("URL query parameter is required")
context.JSON(http.StatusBadRequest, gin.H{"error": "url query parameter is required"})
context.JSON(http.StatusBadRequest, utils.BuildErrorResponse("url query parameter is required"))
return
} else {
if !strings.HasPrefix(baseURL, "http://") && !strings.HasPrefix(baseURL, "https://") {
baseURL = "http://" + baseURL
}

baseUrlParsed, _ := url.Parse(baseURL)
_, err := publicsuffix.EffectiveTLDPlusOne(baseUrlParsed.Host)
if err != nil {
logger.Error(err)
context.JSON(http.StatusBadRequest, utils.BuildErrorResponse("Invalid URL format, please provide a valid URL."))
return
}
}

pageInfo, err := services.FetchPageInfo(client, baseURL)
if err != nil {
logger.Error(err)
context.JSON(http.StatusInternalServerError, gin.H{"error": "failed to fetch page info"})
context.JSON(http.StatusInternalServerError, utils.BuildErrorResponse("Failed to fetch page info"))
return
}

// We store scraped page info in-memory to use with pagination later.
// Stored page information is mapped to the returned request ID.
requestID := storage.StorePageInfo(pageInfo)
// Check the status of the first config.PageSize (10) scraped URLs.
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, 0, min(config.PageSize, len(pageInfo.URLs)))
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.PageSize)

context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, 1, totalPages, pageInfo, inaccessibleCount, 0, min(config.PageSize, len(pageInfo.URLs))))
}

// This handles subsequent pagination requests to check status of URLs.
func PageHandler(context *gin.Context) {
client := &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, // Disable TLS verification
},
}
// The request ID is required to fetch information from the in-memory storage.
requestID := context.Param("id")
pageNumStr := context.Param("page")

// Retrieve page information from in-memory storage using the request ID.
pageInfo, exists := storage.RetrievePageInfo(requestID)
if !exists {
logger.Debug(fmt.Sprintf("Requested ID [%s] not found in the local storage", requestID))
context.JSON(http.StatusNotFound, gin.H{"error": "request ID not found"})
context.JSON(http.StatusNotFound, utils.BuildErrorResponse("request ID not found"))
return
}

pageNum, err := strconv.Atoi(pageNumStr)
if err != nil || pageNum < 1 {
logger.Error(err)
context.JSON(http.StatusBadRequest, gin.H{"error": "invalid page number"})
context.JSON(http.StatusBadRequest, utils.BuildErrorResponse("invalid page number"))
return
}

start, end := utils.CalculatePageBounds(pageNum, len(pageInfo.URLs), config.PageSize)
if start >= len(pageInfo.URLs) {
logger.Debug(fmt.Sprintf("Requested page [%d] not found", pageNum))
context.JSON(http.StatusNotFound, gin.H{"error": "page not found"})
context.JSON(http.StatusNotFound, utils.BuildErrorResponse("page not found"))
return
}

// Check the URL status for URLs on the given pagination page.
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, start, end)
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.PageSize)

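The main new behaviour in this handler is the scheme defaulting plus the eTLD+1 check before anything is fetched. A minimal, self-contained sketch of that flow, assuming the same `publicsuffix` call; the `validateURL` helper and the sample inputs are illustrative and not part of the PR:

```go
package main

import (
	"fmt"
	"net/url"
	"strings"

	"golang.org/x/net/publicsuffix"
)

// validateURL mirrors the new check in ScrapeHandler: default the scheme,
// then reject hosts that do not resolve to a valid effective TLD+1.
func validateURL(raw string) (string, error) {
	if !strings.HasPrefix(raw, "http://") && !strings.HasPrefix(raw, "https://") {
		raw = "http://" + raw
	}
	parsed, err := url.Parse(raw)
	if err != nil {
		return "", err
	}
	if _, err := publicsuffix.EffectiveTLDPlusOne(parsed.Host); err != nil {
		return "", fmt.Errorf("invalid URL %q: %w", raw, err)
	}
	return raw, nil
}

func main() {
	for _, candidate := range []string{"example.com", "localhost"} {
		if normalized, err := validateURL(candidate); err != nil {
			fmt.Println(candidate, "->", err)
		} else {
			fmt.Println(candidate, "->", normalized)
		}
	}
}
```

One side effect worth noting: hosts without a registrable domain (for example a bare `localhost`) fail `EffectiveTLDPlusOne`, so the handler will reject those as invalid URLs as well.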
2 changes: 1 addition & 1 deletion handlers/scrape_test.go
@@ -63,7 +63,7 @@ func TestScrapeHandler(test_type *testing.T) {
mockRequestID: "",
expectedStatus: http.StatusInternalServerError,
expectedBody: map[string]interface{}{
"error": "failed to fetch page info",
"error": "Failed to fetch page info",
},
},
}
1 change: 1 addition & 0 deletions logger/logger.go
@@ -5,6 +5,7 @@ import (
"os"
)

// Set custom loggers for each log level
var (
DEBUG = log.New(os.Stdout, "[scraper-DEBUG] ", log.LstdFlags)
INFO = log.New(os.Stdout, "[scraper-INFO] ", log.LstdFlags)
2 changes: 1 addition & 1 deletion models/entity.go
@@ -13,5 +13,5 @@ type PageInfo struct {
type URLStatus struct {
URL string `json:"url"`
HTTPStatus int `json:"http_status"`
Error error `json:"error"`
Error string `json:"error"`
}
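Switching `Error` from `error` to `string` matters for the JSON payload: typical `error` values carry no exported fields, so `encoding/json` renders them as an empty object and the message is lost. A small sketch of the difference; the struct names are made up for the example:

```go
package main

import (
	"encoding/json"
	"errors"
	"fmt"
)

// Before the change: error values usually marshal as "{}" because the
// underlying type (e.g. the one returned by errors.New) has no exported fields.
type withError struct {
	URL   string `json:"url"`
	Error error  `json:"error"`
}

// After the change: the message itself is serialized.
type withString struct {
	URL   string `json:"url"`
	Error string `json:"error"`
}

func main() {
	err := errors.New("connection refused")

	before, _ := json.Marshal(withError{URL: "http://example.com", Error: err})
	after, _ := json.Marshal(withString{URL: "http://example.com", Error: err.Error()})

	fmt.Println(string(before)) // {"url":"http://example.com","error":{}}
	fmt.Println(string(after))  // {"url":"http://example.com","error":"connection refused"}
}
```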
2 changes: 1 addition & 1 deletion readme.md
@@ -4,7 +4,7 @@ An API service to scrape a URL and get a summary.

## High level architecture

![High level diagram](./docs/lucytech.jpg)
![High level diagram](./docs/highlevel_arch_diagram.jpg)

### Components

10 changes: 10 additions & 0 deletions services/htmlparser.go
@@ -12,6 +12,7 @@ import (
"golang.org/x/net/publicsuffix"
)

// This is to fetch the HTML content of the given URL.
func FetchPageInfo(client *http.Client, baseURL string) (*models.PageInfo, error) {
resp, err := client.Get(baseURL)
if err != nil {
@@ -23,6 +24,7 @@
return ParseHTML(resp.Body, baseURL)
}

// This is to parse the HTML content and extract required data.
func ParseHTML(body io.Reader, baseURL string) (*models.PageInfo, error) {
pageInfo := &models.PageInfo{HeadingCounts: make(map[string]int)}
doc, err := html.Parse(body)
@@ -73,6 +75,7 @@ func traverse(node *html.Node, visit func(*html.Node)) {
}
}

// This is to extract links from the HTML content.
func extractHref(node *html.Node) string {
for _, attr := range node.Attr {
if attr.Key == "href" {
@@ -82,12 +85,15 @@ func extractHref(node *html.Node) string {
return ""
}

// This is to build the absolute URL from given baseURL and path.
func resolveURL(baseURL, href string) string {
base, _ := url.Parse(baseURL)
rel, _ := url.Parse(href)
return base.ResolveReference(rel).String()
}

// This is to compare TLDs of found URLs against the scraped page URL
// to determine whether found URLs are internal or external links.
func isInternal(baseUrl, scrappedUrl string) bool {
baseUrlParsed, _ := url.Parse(baseUrl)
scrappedUrlParsed, _ := url.Parse(scrappedUrl)
@@ -98,6 +104,8 @@ func isInternal(baseUrl, scrappedUrl string) bool {
return strings.EqualFold(baseUrlTld, scrappedUrlTld)
}

// This is to check if the scraped HTML content has a password input.
// Based on this we decide whether the page contains a login form.
func containsPasswordInput(node *html.Node) bool {
if node.Type == html.ElementNode && node.Data == "input" {
for _, attr := range node.Attr {
@@ -114,6 +122,7 @@ func containsPasswordInput(node *html.Node) bool {
return false
}

// This is to extract the scraped page title from the HTML title tag.
func extractTitle(node *html.Node) string {
if node.Type == html.ElementNode && node.Data == "title" && node.FirstChild != nil {
return node.FirstChild.Data
@@ -127,6 +136,7 @@ func extractTitle(node *html.Node) string {
return ""
}

// This is to extract the HTML version of the scraped page.
func extractHtmlVersion(node *html.Node) string {
// Check for a "version" attribute
for _, attr := range node.Attr {
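The password-input check is what drives the login-form flag in the response. A stripped-down sketch of the same traversal with `golang.org/x/net/html`; `hasPasswordInput` and the sample markup are illustrative stand-ins for the real `containsPasswordInput`:

```go
package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

// hasPasswordInput walks the parsed tree looking for <input type="password">,
// the same signal used to decide whether the page contains a login form.
func hasPasswordInput(node *html.Node) bool {
	if node.Type == html.ElementNode && node.Data == "input" {
		for _, attr := range node.Attr {
			if attr.Key == "type" && attr.Val == "password" {
				return true
			}
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		if hasPasswordInput(child) {
			return true
		}
	}
	return false
}

func main() {
	doc, err := html.Parse(strings.NewReader(
		`<html><body><form><input type="password" name="pw"></form></body></html>`))
	if err != nil {
		panic(err)
	}
	fmt.Println(hasPasswordInput(doc)) // true
}
```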
10 changes: 1 addition & 9 deletions services/urlstatus.go
@@ -10,14 +10,6 @@ import (
// This is to check the URL status and decide whether it is accessible or not.
// It marks the status of each collected URL.
// Since the URL collection can be huge we check status based on given start and end positions.
// Parameters:
//
// urls - Collected URLs by scrapping a web page.
// start - starting position of the URL set.
// end - Ending position of the URL set.
//
// Returns:
// inaccessible URL count.
func CheckURLStatus(client *http.Client, urls []models.URLStatus, start, end int) int {
var wg sync.WaitGroup
var mu sync.Mutex
@@ -37,7 +29,7 @@ func CheckURLStatus(client *http.Client, urls []models.URLStatus, start, end int
inaccessibleCount++
mu.Unlock()

urls[idx].Error = err
urls[idx].Error = err.Error()
return
}

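The only behavioural change in this file is storing `err.Error()` instead of the `error` value, which matches the new `string` field on `URLStatus`. For context, a self-contained sketch of the fan-out pattern `CheckURLStatus` appears to follow; the local `urlStatus` type, the `checkRange` name, and the sample URLs are assumptions for the example:

```go
package main

import (
	"fmt"
	"net/http"
	"sync"
	"time"
)

// urlStatus stands in for models.URLStatus after the change to a string Error field.
type urlStatus struct {
	URL        string
	HTTPStatus int
	Error      string
}

// checkRange mirrors the shape of CheckURLStatus: one goroutine per URL in
// [start, end), the HTTP status or the error message recorded per URL, and
// the inaccessible count updated under a mutex.
func checkRange(client *http.Client, urls []urlStatus, start, end int) int {
	var wg sync.WaitGroup
	var mu sync.Mutex
	inaccessible := 0

	for idx := start; idx < end; idx++ {
		wg.Add(1)
		go func(idx int) {
			defer wg.Done()
			resp, err := client.Get(urls[idx].URL)
			if err != nil {
				mu.Lock()
				inaccessible++
				mu.Unlock()
				urls[idx].Error = err.Error() // store the message, not the error value
				return
			}
			defer resp.Body.Close()
			urls[idx].HTTPStatus = resp.StatusCode
		}(idx)
	}

	wg.Wait()
	return inaccessible
}

func main() {
	client := &http.Client{Timeout: 5 * time.Second}
	urls := []urlStatus{{URL: "https://example.com"}, {URL: "http://unreachable.invalid"}}
	fmt.Println("inaccessible:", checkRange(client, urls, 0, len(urls)))
}
```

Writes to distinct slice elements are safe across goroutines here, while the shared counter still needs the mutex.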
8 changes: 8 additions & 0 deletions storage/memory.go
@@ -1,3 +1,7 @@
// This is a simple in-memory storage for page info, added to support pagination.
// Each stored page info is mapped to a random unique ID generated when the data is stored.
// To retrieve stored page info, provide the ID that was returned when the data was stored.
// This simple storage supports only store and retrieve operations as of now.
package storage

import (
@@ -12,6 +16,7 @@ var storage = struct {
data map[string]models.PageInfo
}{data: make(map[string]models.PageInfo)}

// This is to store page info.
func StorePageInfo(info *models.PageInfo) string {
storage.Lock()
defer storage.Unlock()
@@ -21,6 +26,7 @@ func StorePageInfo(info *models.PageInfo) string {
return id
}

// This is to retrieve page info by unique ID.
func RetrievePageInfo(id string) (*models.PageInfo, bool) {
storage.RLock()
defer storage.RUnlock()
@@ -29,10 +35,12 @@ func RetrievePageInfo(id string) (*models.PageInfo, bool) {
return &info, exists
}

// This is to generate the random unique ID.
func generateID() string {
return time.Now().Format("20060102150405") + "-" + randomString(8)
}

// This is to generate a random string to append into the random unique ID.
func randomString(size int) string {
const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
result := make([]byte, size)
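The store itself is an anonymous struct wrapping a map, guarded by what looks like an embedded `sync.RWMutex` given the `Lock`/`RLock` calls (the embedding line is collapsed in this diff). A minimal sketch of that pattern, with a stubbed `pageInfo` type standing in for `models.PageInfo`:

```go
package main

import (
	"fmt"
	"sync"
)

// pageInfo is a stub for models.PageInfo, for illustration only.
type pageInfo struct{ Title string }

// A map guarded by an embedded RWMutex, the same pattern storage/memory.go
// appears to use for its in-memory page-info store.
var store = struct {
	sync.RWMutex
	data map[string]pageInfo
}{data: make(map[string]pageInfo)}

func put(id string, info pageInfo) {
	store.Lock()
	defer store.Unlock()
	store.data[id] = info
}

func get(id string) (pageInfo, bool) {
	store.RLock()
	defer store.RUnlock()
	info, ok := store.data[id]
	return info, ok
}

func main() {
	put("req-1", pageInfo{Title: "Example"})
	if info, ok := get("req-1"); ok {
		fmt.Println(info.Title)
	}
}
```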
8 changes: 8 additions & 0 deletions utils/helpers.go
@@ -5,6 +5,8 @@ import (
"math"
"scraper/config"
"scraper/models"

"github.com/gin-gonic/gin"
)

func CalculateTotalPages(totalItems, pageSize int) int {
@@ -24,6 +26,7 @@ func min(a, b int) int {
return b
}

// This is to build the response after a successful scraping.
func BuildPageResponse(requestID string, pageNum, totalPages int, pageInfo *models.PageInfo, inaccessible, start, end int) models.PageResponse {
var prevPage, nextPage *string
if pageNum > 1 {
@@ -59,3 +62,8 @@ },
},
}
}

// This is to build the error response.
func BuildErrorResponse(message string) gin.H {
return gin.H{"error": message}
}
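`BuildErrorResponse` gives every failure path the same `{"error": "<message>"}` shape instead of ad-hoc `gin.H` literals scattered across handlers. A tiny sketch of a handler wired up that way; the route, port, and handler body are assumptions for the example:

```go
package main

import (
	"net/http"

	"github.com/gin-gonic/gin"
)

// buildErrorResponse matches the helper added in utils/helpers.go: one place
// that shapes every error payload as {"error": "<message>"}.
func buildErrorResponse(message string) gin.H {
	return gin.H{"error": message}
}

func main() {
	router := gin.Default()
	router.GET("/scrape", func(c *gin.Context) {
		if c.Query("url") == "" {
			c.JSON(http.StatusBadRequest, buildErrorResponse("url query parameter is required"))
			return
		}
		c.JSON(http.StatusOK, gin.H{"ok": true})
	})
	_ = router.Run(":8080") // assumed port for the sketch
}
```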