Skip to content

Commit

Permalink
supports passing id and bug fix
Browse files Browse the repository at this point in the history
  • Loading branch information
gosom committed Apr 18, 2024
1 parent 358d26f commit 7f896de
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 10 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ pages are scraped.
## Extracted Data Points

```
input_id
link
title
category
Expand Down Expand Up @@ -76,6 +77,13 @@ emails

**Note**: email is empty by default (see Usage)

**Note**: `input_id` is an ID that you can define per query. By default it's a UUID.
To define it, append `#!#` followed by your ID to the query line in the input file, like:

```
Matsuhisa Athens #!#MyIDentifier
```

## Quickstart

### Using docker:
Expand Down
20 changes: 13 additions & 7 deletions gmaps/entry.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ type Review struct {
}

type Entry struct {
ID string `json:"input_id"`
Link string `json:"link"`
Cid string `json:"cid"`
Title string `json:"title"`
Expand Down Expand Up @@ -123,6 +124,7 @@ func (e *Entry) Validate() error {

func (e *Entry) CsvHeaders() []string {
return []string{
"input_id",
"link",
"title",
"category",
Expand Down Expand Up @@ -159,6 +161,7 @@ func (e *Entry) CsvHeaders() []string {

func (e *Entry) CsvRow() []string {
return []string{
e.ID,
e.Link,
e.Title,
e.Category,
Expand Down Expand Up @@ -344,18 +347,21 @@ func EntryFromJSON(raw []byte) (entry Entry, err error) {
reviewsI := getNthElementAndCast[[]any](darray, 175, 9, 0, 0)
for i := range reviewsI {
el := getNthElementAndCast[[]any](reviewsI, i, 0)
price := getNthElementAndCast[any](el, 2, 6, 2, 2, 0, 0, 1)
if price != nil {
entry.PriceRange = price.(string)
}

time := getNthElementAndCast[[]any](el, 2, 2, 0, 1, 21, 6, 7)

review := Review{
Name: getNthElementAndCast[string](el, 1, 4, 0, 4),
ProfilePicture: getNthElementAndCast[string](el, 1, 4, 0, 3),
When: fmt.Sprintf("%v-%v-%v", time[0], time[1], time[2]),
Rating: int(getNthElementAndCast[float64](el, 2, 0, 0)),
Description: getNthElementAndCast[string](el, 2, 15, 0, 0),
When: func() string {
if len(time) < 3 {
return ""
}

return fmt.Sprintf("%v-%v-%v", time[0], time[1], time[2])
}(),
Rating: int(getNthElementAndCast[float64](el, 2, 0, 0)),
Description: getNthElementAndCast[string](el, 2, 15, 0, 0),
}

if review.Name == "" {
Expand Down
8 changes: 6 additions & 2 deletions gmaps/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,21 @@ type GmapJob struct {
ExtractEmail bool
}

func NewGmapJob(langCode, query string, maxDepth int, extractEmail bool) *GmapJob {
func NewGmapJob(id, langCode, query string, maxDepth int, extractEmail bool) *GmapJob {
query = url.QueryEscape(query)

const (
maxRetries = 3
prio = scrapemate.PriorityLow
)

if id == "" {
id = uuid.New().String()
}

job := GmapJob{
Job: scrapemate.Job{
ID: uuid.New().String(),
ID: id,
Method: http.MethodGet,
URL: "https://www.google.com/maps/search/" + query,
URLParams: map[string]string{"hl": langCode},
Expand Down
2 changes: 2 additions & 0 deletions gmaps/place.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ func (j *PlaceJob) Process(_ context.Context, resp *scrapemate.Response) (any, [
return nil, nil, err
}

entry.ID = j.ParentID

if j.ExtractEmail && entry.IsWebsiteValidForEmail() {
emailJob := NewEmailJob(j.ID, &entry)

Expand Down
9 changes: 8 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,14 @@ func createSeedJobs(langCode string, r io.Reader, maxDepth int, email bool) (job
continue
}

jobs = append(jobs, gmaps.NewGmapJob(langCode, query, maxDepth, email))
var id string

if before, after, ok := strings.Cut(query, "#!#"); ok {
query = strings.TrimSpace(before)
id = strings.TrimSpace(after)
}

jobs = append(jobs, gmaps.NewGmapJob(id, langCode, query, maxDepth, email))
}

return jobs, scanner.Err()
Expand Down

0 comments on commit 7f896de

Please sign in to comment.