File renamed without changes
28 changes: 23 additions & 5 deletions handlers/scrape.go
@@ -4,6 +4,7 @@ import (
"crypto/tls"
"fmt"
"net/http"
"net/url"
"strconv"
"strings"

@@ -14,8 +15,10 @@ import (
"scraper/utils"

"github.com/gin-gonic/gin"
"golang.org/x/net/publicsuffix"
)

// This handles the initial scraping request received from the client.
func ScrapeHandler(context *gin.Context) {
baseURL := context.Query("url")
client := &http.Client{
@@ -26,58 +29,73 @@ func ScrapeHandler(context *gin.Context) {

if baseURL == "" {
logger.Debug("URL query parameter is required")
context.JSON(http.StatusBadRequest, gin.H{"error": "url query parameter is required"})
context.JSON(http.StatusBadRequest, utils.BuildErrorResponse("url query parameter is required"))
return
} else {
if !strings.HasPrefix(baseURL, "http://") && !strings.HasPrefix(baseURL, "https://") {
baseURL = "http://" + baseURL
}

baseUrlParsed, _ := url.Parse(baseURL)
_, err := publicsuffix.EffectiveTLDPlusOne(baseUrlParsed.Host)
if err != nil {
logger.Error(err)
context.JSON(http.StatusBadRequest, utils.BuildErrorResponse("Invalid URL format, please provide a valid URL."))
return
}
}

pageInfo, err := services.FetchPageInfo(client, baseURL)
if err != nil {
logger.Error(err)
context.JSON(http.StatusInternalServerError, gin.H{"error": "failed to fetch page info"})
context.JSON(http.StatusInternalServerError, utils.BuildErrorResponse("Failed to fetch page info"))
return
}

// We store scraped page info in-memory to use with pagination later.
// Stored page information is mapped to the returned request ID.
requestID := storage.StorePageInfo(pageInfo)
// Check the status of the first config.PageSize (10) scraped URLs.
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, 0, min(config.PageSize, len(pageInfo.URLs)))
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.PageSize)

context.JSON(http.StatusOK, utils.BuildPageResponse(requestID, 1, totalPages, pageInfo, inaccessibleCount, 0, min(config.PageSize, len(pageInfo.URLs))))
}

// This handles subsequent pagination requests to check status of URLs.
func PageHandler(context *gin.Context) {
client := &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, // Disable TLS verification
},
}
// The request ID is required to fetch information from the in-memory storage.
requestID := context.Param("id")
pageNumStr := context.Param("page")

// Retrieve page information from in-memory storage using the request ID.
pageInfo, exists := storage.RetrievePageInfo(requestID)
if !exists {
logger.Debug(fmt.Sprintf("Requested ID [%s] not found in the local storage", requestID))
context.JSON(http.StatusNotFound, gin.H{"error": "request ID not found"})
context.JSON(http.StatusNotFound, utils.BuildErrorResponse("request ID not found"))
return
}

pageNum, err := strconv.Atoi(pageNumStr)
if err != nil || pageNum < 1 {
logger.Error(err)
context.JSON(http.StatusBadRequest, gin.H{"error": "invalid page number"})
context.JSON(http.StatusBadRequest, utils.BuildErrorResponse("invalid page number"))
return
}

start, end := utils.CalculatePageBounds(pageNum, len(pageInfo.URLs), config.PageSize)
if start >= len(pageInfo.URLs) {
logger.Debug(fmt.Sprintf("Requested page [%d] not found", pageNum))
context.JSON(http.StatusNotFound, gin.H{"error": "page not found"})
context.JSON(http.StatusNotFound, utils.BuildErrorResponse("page not found"))
return
}

// Check the URL status for URLs on the given pagination page.
inaccessibleCount := services.CheckURLStatus(client, pageInfo.URLs, start, end)
totalPages := utils.CalculateTotalPages(len(pageInfo.URLs), config.PageSize)

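The main new behaviour in this handler is the scheme defaulting plus the eTLD+1 check before anything is fetched. A minimal, self-contained sketch of that flow, assuming the same `publicsuffix` call; the `validateURL` helper and the sample inputs are illustrative and not part of the PR:

```go
package main

import (
	"fmt"
	"net/url"
	"strings"

	"golang.org/x/net/publicsuffix"
)

// validateURL mirrors the new check in ScrapeHandler: default the scheme,
// then reject hosts that do not resolve to a valid effective TLD+1.
func validateURL(raw string) (string, error) {
	if !strings.HasPrefix(raw, "http://") && !strings.HasPrefix(raw, "https://") {
		raw = "http://" + raw
	}
	parsed, err := url.Parse(raw)
	if err != nil {
		return "", err
	}
	if _, err := publicsuffix.EffectiveTLDPlusOne(parsed.Host); err != nil {
		return "", fmt.Errorf("invalid URL %q: %w", raw, err)
	}
	return raw, nil
}

func main() {
	for _, candidate := range []string{"example.com", "localhost"} {
		if normalized, err := validateURL(candidate); err != nil {
			fmt.Println(candidate, "->", err)
		} else {
			fmt.Println(candidate, "->", normalized)
		}
	}
}
```

One side effect worth noting: hosts without a registrable domain (for example a bare `localhost`) fail `EffectiveTLDPlusOne`, so the handler will reject those as invalid URLs as well.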
2 changes: 1 addition & 1 deletion handlers/scrape_test.go
@@ -63,7 +63,7 @@ func TestScrapeHandler(test_type *testing.T) {
mockRequestID: "",
expectedStatus: http.StatusInternalServerError,
expectedBody: map[string]interface{}{
"error": "failed to fetch page info",
"error": "Failed to fetch page info",
},
},
}
1 change: 1 addition & 0 deletions logger/logger.go
@@ -5,6 +5,7 @@ import (
"os"
)

// Set custom loggers for each log level
var (
DEBUG = log.New(os.Stdout, "[scraper-DEBUG] ", log.LstdFlags)
INFO = log.New(os.Stdout, "[scraper-INFO] ", log.LstdFlags)
2 changes: 1 addition & 1 deletion models/entity.go
@@ -13,5 +13,5 @@ type PageInfo struct {
type URLStatus struct {
URL string `json:"url"`
HTTPStatus int `json:"http_status"`
Error error `json:"error"`
Error string `json:"error"`
}
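Switching `Error` from `error` to `string` matters for the JSON payload: typical `error` values carry no exported fields, so `encoding/json` renders them as an empty object and the message is lost. A small sketch of the difference; the struct names are made up for the example:

```go
package main

import (
	"encoding/json"
	"errors"
	"fmt"
)

// Before the change: error values usually marshal as "{}" because the
// underlying type (e.g. the one returned by errors.New) has no exported fields.
type withError struct {
	URL   string `json:"url"`
	Error error  `json:"error"`
}

// After the change: the message itself is serialized.
type withString struct {
	URL   string `json:"url"`
	Error string `json:"error"`
}

func main() {
	err := errors.New("connection refused")

	before, _ := json.Marshal(withError{URL: "http://example.com", Error: err})
	after, _ := json.Marshal(withString{URL: "http://example.com", Error: err.Error()})

	fmt.Println(string(before)) // {"url":"http://example.com","error":{}}
	fmt.Println(string(after))  // {"url":"http://example.com","error":"connection refused"}
}
```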
2 changes: 1 addition & 1 deletion readme.md
@@ -4,7 +4,7 @@ An API service to scrape a URL and get a summary.

## High level architecture

![High level diagram](./docs/lucytech.jpg)
![High level diagram](./docs/highlevel_arch_diagram.jpg)

### Components

10 changes: 10 additions & 0 deletions services/htmlparser.go
@@ -12,6 +12,7 @@ import (
"golang.org/x/net/publicsuffix"
)

// This is to fetch the HTML content of the given URL.
func FetchPageInfo(client *http.Client, baseURL string) (*models.PageInfo, error) {
resp, err := client.Get(baseURL)
if err != nil {
@@ -23,6 +24,7 @@
return ParseHTML(resp.Body, baseURL)
}

// This is to parse the HTML content and extract required data.
func ParseHTML(body io.Reader, baseURL string) (*models.PageInfo, error) {
pageInfo := &models.PageInfo{HeadingCounts: make(map[string]int)}
doc, err := html.Parse(body)
@@ -73,6 +75,7 @@ func traverse(node *html.Node, visit func(*html.Node)) {
}
}

// This is to extract links from the HTML content.
func extractHref(node *html.Node) string {
for _, attr := range node.Attr {
if attr.Key == "href" {
@@ -82,12 +85,15 @@ func extractHref(node *html.Node) string {
return ""
}

// This is to build the absolute URL from given baseURL and path.
func resolveURL(baseURL, href string) string {
base, _ := url.Parse(baseURL)
rel, _ := url.Parse(href)
return base.ResolveReference(rel).String()
}

// This is to compare TLDs of found URLs against the scraped page URL
// to determine whether found URLs are internal or external links.
func isInternal(baseUrl, scrappedUrl string) bool {
baseUrlParsed, _ := url.Parse(baseUrl)
scrappedUrlParsed, _ := url.Parse(scrappedUrl)
@@ -98,6 +104,8 @@ func isInternal(baseUrl, scrappedUrl string) bool {
return strings.EqualFold(baseUrlTld, scrappedUrlTld)
}

// This is to check if the scraped HTML content has a password input.
// Based on this we decide whether the page contains a login form.
func containsPasswordInput(node *html.Node) bool {
if node.Type == html.ElementNode && node.Data == "input" {
for _, attr := range node.Attr {
@@ -114,6 +122,7 @@ func containsPasswordInput(node *html.Node) bool {
return false
}

// This is to extract the scraped page title from the HTML title tag.
func extractTitle(node *html.Node) string {
if node.Type == html.ElementNode && node.Data == "title" && node.FirstChild != nil {
return node.FirstChild.Data
@@ -127,6 +136,7 @@ func extractTitle(node *html.Node) string {
return ""
}

// This is to extract the HTML version of the scraped page.
func extractHtmlVersion(node *html.Node) string {
// Check for a "version" attribute
for _, attr := range node.Attr {
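The password-input check is what drives the login-form flag in the response. A stripped-down sketch of the same traversal with `golang.org/x/net/html`; `hasPasswordInput` and the sample markup are illustrative stand-ins for the real `containsPasswordInput`:

```go
package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

// hasPasswordInput walks the parsed tree looking for <input type="password">,
// the same signal used to decide whether the page contains a login form.
func hasPasswordInput(node *html.Node) bool {
	if node.Type == html.ElementNode && node.Data == "input" {
		for _, attr := range node.Attr {
			if attr.Key == "type" && attr.Val == "password" {
				return true
			}
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		if hasPasswordInput(child) {
			return true
		}
	}
	return false
}

func main() {
	doc, err := html.Parse(strings.NewReader(
		`<html><body><form><input type="password" name="pw"></form></body></html>`))
	if err != nil {
		panic(err)
	}
	fmt.Println(hasPasswordInput(doc)) // true
}
```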
10 changes: 1 addition & 9 deletions services/urlstatus.go
@@ -10,14 +10,6 @@ import (
// This is to check the URL status and decide whether it is accessible or not.
// It marks the status of each collected URL.
// Since the URL collection can be huge we check status based on given start and end positions.
// Parameters:
//
// urls - Collected URLs by scrapping a web page.
// start - starting position of the URL set.
// end - Ending position of the URL set.
//
// Returns:
// inaccessible URL count.
func CheckURLStatus(client *http.Client, urls []models.URLStatus, start, end int) int {
var wg sync.WaitGroup
var mu sync.Mutex
@@ -37,7 +29,7 @@ func CheckURLStatus(client *http.Client, urls []models.URLStatus, start, end int
inaccessibleCount++
mu.Unlock()

urls[idx].Error = err
urls[idx].Error = err.Error()
return
}

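The only behavioural change in this file is storing `err.Error()` instead of the `error` value, which matches the new `string` field on `URLStatus`. For context, a self-contained sketch of the fan-out pattern `CheckURLStatus` appears to follow; the local `urlStatus` type, the `checkRange` name, and the sample URLs are assumptions for the example:

```go
package main

import (
	"fmt"
	"net/http"
	"sync"
	"time"
)

// urlStatus stands in for models.URLStatus after the change to a string Error field.
type urlStatus struct {
	URL        string
	HTTPStatus int
	Error      string
}

// checkRange mirrors the shape of CheckURLStatus: one goroutine per URL in
// [start, end), the HTTP status or the error message recorded per URL, and
// the inaccessible count updated under a mutex.
func checkRange(client *http.Client, urls []urlStatus, start, end int) int {
	var wg sync.WaitGroup
	var mu sync.Mutex
	inaccessible := 0

	for idx := start; idx < end; idx++ {
		wg.Add(1)
		go func(idx int) {
			defer wg.Done()
			resp, err := client.Get(urls[idx].URL)
			if err != nil {
				mu.Lock()
				inaccessible++
				mu.Unlock()
				urls[idx].Error = err.Error() // store the message, not the error value
				return
			}
			defer resp.Body.Close()
			urls[idx].HTTPStatus = resp.StatusCode
		}(idx)
	}

	wg.Wait()
	return inaccessible
}

func main() {
	client := &http.Client{Timeout: 5 * time.Second}
	urls := []urlStatus{{URL: "https://example.com"}, {URL: "http://unreachable.invalid"}}
	fmt.Println("inaccessible:", checkRange(client, urls, 0, len(urls)))
}
```

Writes to distinct slice elements are safe across goroutines here, while the shared counter still needs the mutex.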
8 changes: 8 additions & 0 deletions storage/memory.go
@@ -1,3 +1,7 @@
// This is a simple in-memory storage for page info, added to support pagination.
// Each stored page info is mapped to a random unique ID generated when the data is stored.
// To retrieve stored page info, provide the ID that was returned when the data was stored.
// This simple storage supports only store and retrieve operations as of now.
package storage

import (
@@ -12,6 +16,7 @@ var storage = struct {
data map[string]models.PageInfo
}{data: make(map[string]models.PageInfo)}

// This is to store page info.
func StorePageInfo(info *models.PageInfo) string {
storage.Lock()
defer storage.Unlock()
@@ -21,6 +26,7 @@ func StorePageInfo(info *models.PageInfo) string {
return id
}

// This is to retrieve page info by unique ID.
func RetrievePageInfo(id string) (*models.PageInfo, bool) {
storage.RLock()
defer storage.RUnlock()
@@ -29,10 +35,12 @@ func RetrievePageInfo(id string) (*models.PageInfo, bool) {
return &info, exists
}

// This is to generate the random unique ID.
func generateID() string {
return time.Now().Format("20060102150405") + "-" + randomString(8)
}

// This is to generate a random string to append into the random unique ID.
func randomString(size int) string {
const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
result := make([]byte, size)
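The store itself is an anonymous struct wrapping a map, guarded by what looks like an embedded `sync.RWMutex` given the `Lock`/`RLock` calls (the embedding line is collapsed in this diff). A minimal sketch of that pattern, with a stubbed `pageInfo` type standing in for `models.PageInfo`:

```go
package main

import (
	"fmt"
	"sync"
)

// pageInfo is a stub for models.PageInfo, for illustration only.
type pageInfo struct{ Title string }

// A map guarded by an embedded RWMutex, the same pattern storage/memory.go
// appears to use for its in-memory page-info store.
var store = struct {
	sync.RWMutex
	data map[string]pageInfo
}{data: make(map[string]pageInfo)}

func put(id string, info pageInfo) {
	store.Lock()
	defer store.Unlock()
	store.data[id] = info
}

func get(id string) (pageInfo, bool) {
	store.RLock()
	defer store.RUnlock()
	info, ok := store.data[id]
	return info, ok
}

func main() {
	put("req-1", pageInfo{Title: "Example"})
	if info, ok := get("req-1"); ok {
		fmt.Println(info.Title)
	}
}
```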
8 changes: 8 additions & 0 deletions utils/helpers.go
@@ -5,6 +5,8 @@ import (
"math"
"scraper/config"
"scraper/models"

"github.com/gin-gonic/gin"
)

func CalculateTotalPages(totalItems, pageSize int) int {
@@ -24,6 +26,7 @@ func min(a, b int) int {
return b
}

// This is to build the response after a successful scraping.
func BuildPageResponse(requestID string, pageNum, totalPages int, pageInfo *models.PageInfo, inaccessible, start, end int) models.PageResponse {
var prevPage, nextPage *string
if pageNum > 1 {
@@ -59,3 +62,8 @@ },
},
}
}

// This is to build the error response.
func BuildErrorResponse(message string) gin.H {
return gin.H{"error": message}
}
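`BuildErrorResponse` gives every failure path the same `{"error": "<message>"}` shape instead of ad-hoc `gin.H` literals scattered across handlers. A tiny sketch of a handler wired up that way; the route, port, and handler body are assumptions for the example:

```go
package main

import (
	"net/http"

	"github.com/gin-gonic/gin"
)

// buildErrorResponse matches the helper added in utils/helpers.go: one place
// that shapes every error payload as {"error": "<message>"}.
func buildErrorResponse(message string) gin.H {
	return gin.H{"error": message}
}

func main() {
	router := gin.Default()
	router.GET("/scrape", func(c *gin.Context) {
		if c.Query("url") == "" {
			c.JSON(http.StatusBadRequest, buildErrorResponse("url query parameter is required"))
			return
		}
		c.JSON(http.StatusOK, gin.H{"ok": true})
	})
	_ = router.Run(":8080") // assumed port for the sketch
}
```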