In [3]:

import (
	"fmt"
	"log"
	"regexp"
	"strings"

	"github.com/gocolly/colly"
)



// Scraper accumulates what ScraperWeb collects from a single page.
type Scraper struct {
	// Title is the text of the page's <title> element.
	Title string
	// Text holds the body lines kept after filtering; Links holds the
	// absolute URLs taken from the page's anchors.
	Text, Links []string
}

// ScraperWeb visits https://<website> with a colly collector limited to that
// domain (depth 1) and returns three strings: the page title, the filtered
// body text, and every anchor's absolute URL joined with commas.
//
// Body text is processed line by line. A line is kept only when it is
// non-blank after trimming and matches none of the noise patterns below.
// Each kept line is stored with a trailing newline and the lines are then
// joined with another newline, so kept lines are separated by a blank line.
//
// An error returned by Visit ends the process through log.Fatal.
func ScraperWeb(website string) (string, string, string) {
	collector := colly.NewCollector(
		colly.AllowedDomains(website),
		colly.MaxDepth(1),
	)

	var page Scraper

	// Patterns that mark a line as script residue rather than readable text.
	noise := []*regexp.Regexp{
		// JavaScript fragments.
		regexp.MustCompile(
			`(?i)(function\s*\(|var\s+\w+|window\.\w+|document\.\w+|parentElement|insertBefore|_stq\.push|JSON\.parse|classList\.add)`,
		),
		// JSON-like text.
		regexp.MustCompile(`(?i)\{.*?[:].*?\}`),
		// Any brace, parenthesis, bracket, or semicolon.
		regexp.MustCompile(`[{}()\[\];]+`),
	}
	isNoise := func(line string) bool {
		for _, re := range noise {
			if re.MatchString(line) {
				return true
			}
		}
		return false
	}

	// Page title.
	collector.OnHTML("title", func(el *colly.HTMLElement) {
		page.Title = el.Text
	})

	// Readable body text, one kept line per entry.
	collector.OnHTML("body", func(el *colly.HTMLElement) {
		for _, raw := range strings.Split(el.Text, "\n") {
			line := strings.TrimSpace(raw)
			if line == "" || isNoise(line) {
				continue
			}
			page.Text = append(page.Text, line+"\n")
		}
	})

	// Anchor targets, resolved against the request URL.
	collector.OnHTML("a[href]", func(el *colly.HTMLElement) {
		target := el.Request.AbsoluteURL(el.Attr("href"))
		if target == "" {
			return
		}
		page.Links = append(page.Links, target)
	})

	collector.OnRequest(func(req *colly.Request) {
		fmt.Println("Visiting", req.URL.String())
	})
	collector.OnError(func(resp *colly.Response, err error) {
		log.Println("Request URL:", resp.Request.URL, "failed with response:", resp, "\nError:", err)
	})

	// Start scraping.
	if err := collector.Visit("https://" + website); err != nil {
		log.Fatal("error Visit ", err)
	}

	return page.Title, strings.Join(page.Text, "\n"), strings.Join(page.Links, ",")
}

In [6]:
// Message is one entry of Payload.Messages. It serializes to a JSON object
// with "content" and "role" keys; CreatePayload fills Role with "system" or
// "user".
type Message struct {
	Content string `json:"content"`
	Role    string `json:"role"`
}

// ResponseFormat is the "response_format" object of Payload. CreatePayload
// sets Type to "text".
type ResponseFormat struct {
	Type string `json:"type"`
}

// Payload is the JSON request body assembled by CreatePayload and posted by
// Chat. No tag uses omitempty, so every field is always emitted: zero ints
// as 0, false bools as false, and nil `any` fields as null.
//
// NOTE(review): FrequencyPenalty, PresencePenalty, Temperature and TopP are
// declared as int, so fractional values cannot be expressed — confirm against
// the API whether these should be floating-point.
type Payload struct {
	Messages         []Message      `json:"messages"`
	Model            string         `json:"model"`
	FrequencyPenalty int            `json:"frequency_penalty"`
	MaxTokens        int            `json:"max_tokens"`
	PresencePenalty  int            `json:"presence_penalty"`
	ResponseFormat   ResponseFormat `json:"response_format"`
	Stop             any            `json:"stop"`
	Stream           bool           `json:"stream"`
	StreamOptions    any            `json:"stream_options"`
	Temperature      int            `json:"temperature"`
	TopP             int            `json:"top_p"`
	Tools            any            `json:"tools"`
	ToolChoice       string         `json:"tool_choice"`
	Logprobs         bool           `json:"logprobs"`
	TopLogprobs      any            `json:"top_logprobs"`
}

func CreatePayload(systemPrompt, userMessage string) *strings.Reader {
	payload := Payload{
		Messages: []Message{
			{
				Content: systemPrompt,
				Role:    "system",
			},
			{
				Content: userMessage, // Now using a variable
				Role:    "user",
			},
		},
		Model:            "deepseek-chat",
		FrequencyPenalty: 0,
		MaxTokens:        2048,
		PresencePenalty:  0,
		ResponseFormat:   ResponseFormat{Type: "text"},
		Stop:             nil,
		Stream:           false,
		StreamOptions:    nil,
		Temperature:      1,
		TopP:             1,
		Tools:            nil,
		ToolChoice:       "none",
		Logprobs:         false,
		TopLogprobs:      nil,
	}

	jsonPayload, _ := json.Marshal(payload)
	return strings.NewReader(string(jsonPayload))
}
// parseLinks decodes a chat-completion response body, extracts the JSON
// document carried in the first choice's message content, and prints the
// completion details, the links it lists, and the token usage.
//
// data is the raw response body. Nothing is returned; on any failure (invalid
// JSON, no choices, unparsable content) a message is printed and the function
// returns without printing the report.
func parseLinks(data []byte) {
	// Response shapes are declared locally. Message comes first so that
	// Choice binds to this local type rather than any package-level one.
	type Message struct {
		Role    string `json:"role"`
		Content string `json:"content"`
	}

	type Choice struct {
		Index   int     `json:"index"`
		Message Message `json:"message"`
		// Kept raw: the value is never read here, and a raw message
		// accepts any JSON shape instead of failing the whole decode.
		Logprobs     json.RawMessage `json:"logprobs"`
		FinishReason string          `json:"finish_reason"`
	}

	type Usage struct {
		PromptTokens     int `json:"prompt_tokens"`
		CompletionTokens int `json:"completion_tokens"`
		TotalTokens      int `json:"total_tokens"`
	}

	type ChatCompletion struct {
		ID      string   `json:"id"`
		Object  string   `json:"object"`
		Created int64    `json:"created"`
		Model   string   `json:"model"`
		Choices []Choice `json:"choices"`
		Usage   Usage    `json:"usage"`
	}

	type Link struct {
		Type string `json:"type"`
		URL  string `json:"url"`
	}

	type LinksResponse struct {
		Links []Link `json:"links"`
	}

	// Parse the main response.
	var completion ChatCompletion
	if err := json.Unmarshal(data, &completion); err != nil {
		fmt.Printf("Error parsing JSON: %v\n", err)
		return
	}

	// A body without choices (for example an error object) decodes without
	// error, so guard before indexing.
	if len(completion.Choices) == 0 {
		fmt.Println("Error parsing JSON: response contains no choices")
		return
	}

	// Strip an optional Markdown code fence (with or without the "json"
	// tag) and surrounding whitespace from the message content.
	content := strings.TrimSpace(completion.Choices[0].Message.Content)
	content = strings.TrimPrefix(content, "```json")
	content = strings.TrimPrefix(content, "```")
	content = strings.TrimSuffix(content, "```")
	content = strings.TrimSpace(content)

	// Parse the nested links JSON.
	var linksResp LinksResponse
	if err := json.Unmarshal([]byte(content), &linksResp); err != nil {
		fmt.Printf("Error parsing links JSON: %v\n", err)
		return
	}

	// Display results in notebook.
	fmt.Println("## Chat Completion Details")
	fmt.Printf("- ID: %s\n", completion.ID)
	fmt.Printf("- Model: %s\n", completion.Model)
	fmt.Printf("- Created: %d\n", completion.Created)

	fmt.Println("\n## Links Found")
	for _, link := range linksResp.Links {
		fmt.Printf("- %s: %s\n", link.Type, link.URL)
	}

	fmt.Println("\n## Usage Statistics")
	fmt.Printf("- Prompt tokens: %d\n", completion.Usage.PromptTokens)
	fmt.Printf("- Completion tokens: %d\n", completion.Usage.CompletionTokens)
	fmt.Printf("- Total tokens: %d\n", completion.Usage.TotalTokens)
}

// Chat posts a chat-completion request built from systemPrompt and userPrompt
// to the DeepSeek endpoint, authenticating with apiKey as a bearer token, and
// hands a successful response body to parseLinks.
//
// Nothing is returned. Request, transport and read errors are printed and end
// the call; a non-2xx response is reported with its status and body instead
// of being parsed as a completion.
func Chat(systemPrompt, userPrompt, apiKey string) {
	const endpoint = "https://api.deepseek.com/chat/completions"

	payload := CreatePayload(systemPrompt, userPrompt)
	req, err := http.NewRequest(http.MethodPost, endpoint, payload)
	if err != nil {
		fmt.Println(err)
		return
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Accept", "application/json")
	req.Header.Set("Authorization", "Bearer "+apiKey)

	// NOTE(review): this client has no timeout, so a stalled connection
	// blocks indefinitely — consider setting http.Client.Timeout.
	client := &http.Client{}
	res, err := client.Do(req)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer res.Body.Close()

	// Read the body before checking the status so a failure can be shown
	// together with whatever the server sent back.
	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	if res.StatusCode < 200 || res.StatusCode > 299 {
		fmt.Printf("Request failed with status %s: %s\n", res.Status, body)
		return
	}

	parseLinks(body)
}

// UserPrompt builds the user message for the link-selection request: an intro
// line naming websiteUrl, fixed instructions, a "Links" heading, and finally
// the links string inserted verbatim after a newline and a space.
func UserPrompt(websiteUrl, links string) string {
	// Fixed instruction text; reproduced exactly, including its line break.
	const instructions = `Please decide which of theese are relevant web links for the brochure about the company, respond with the full https URL,
	Do not include Terms of Service , Privacy email links.`

	intro := fmt.Sprintf("Here is the list of links on the website of  %s\n", websiteUrl)

	return intro + instructions + "\nLinks (some might be relative links):\n" + "\n " + links
}

%%
	_, _, links := ScraperWeb("edwarddonner.com")

	// System prompt for link selection. The embedded example must itself be
	// valid JSON, because the reply is later decoded into a "links" array of
	// objects with "type" and "url" keys.
	systemPrompt := `You are provided with a list of links found on a webpage.
You are able to decide which of the links would be most relevant to include in brochure about the company,
such as links to About page, or Company page, or Career/Jobs pages.
You Should respond in JSON as in this example.
	  {
	    "links": [
	      { "type": "about page", "url": "https://full.url/goes/here/about" },
	      { "type": "careers page", "url": "https://another.full.url/careers" }
	     ]
	  }
	`
	userprmpt := UserPrompt("https://edwarddonner.com", links)

	// Load .env file; report the underlying cause when it cannot be read.
	err := godotenv.Load()
	if err != nil {
		log.Fatalf("Error loading .env file: %v", err)
	}

	// Get API_KEY from environment variables
	apiKey := os.Getenv("API_KEY")

	if apiKey == "" {
		log.Fatal("API_KEY not found in .env file")
	}
	Chat(systemPrompt, userprmpt, apiKey)


Visiting https://edwarddonner.com
/notebooks/notebooks <nil>
## Chat Completion Details
- ID: 966904ae-1298-4744-b43f-bd443594028e
- Model: deepseek-chat
- Created: 1743280484

## Links Found
- home page: https://edwarddonner.com/
- about page: https://edwarddonner.com/about-me-and-about-nebula/
- blog/posts: https://edwarddonner.com/posts/
- linkedin profile: https://www.linkedin.com/in/eddonner/

## Usage Statistics
- Prompt tokens: 657
- Completion tokens: 118
- Total tokens: 775
