diff --git a/README.md b/README.md
index ac9dece..c433ed2 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,7 @@ pages are scraped.
 ## Extracted Data Points
 
 ```
+input_id
 link
 title
 category
@@ -76,6 +77,13 @@ emails
 
 **Note**: email is empty by default (see Usage)
 
+**Note**: Input ID is an ID that you can define per query. By default it is a UUID.
+To define it yourself, use an input file like:
+
+```
+Matsuhisa Athens #!#MyIDentifier
+```
+
 ## Quickstart
 
 ### Using docker:
diff --git a/gmaps/entry.go b/gmaps/entry.go
index 8a73461..3ead7cc 100644
--- a/gmaps/entry.go
+++ b/gmaps/entry.go
@@ -53,6 +53,7 @@ type Review struct {
 }
 
 type Entry struct {
+    ID       string `json:"input_id"`
     Link     string `json:"link"`
     Cid      string `json:"cid"`
     Title    string `json:"title"`
@@ -123,6 +124,7 @@ func (e *Entry) Validate() error {
 
 func (e *Entry) CsvHeaders() []string {
     return []string{
+        "input_id",
         "link",
         "title",
         "category",
@@ -159,6 +161,7 @@ func (e *Entry) CsvHeaders() []string {
 
 func (e *Entry) CsvRow() []string {
     return []string{
+        e.ID,
         e.Link,
         e.Title,
         e.Category,
@@ -344,18 +347,21 @@ func EntryFromJSON(raw []byte) (entry Entry, err error) {
     reviewsI := getNthElementAndCast[[]any](darray, 175, 9, 0, 0)
     for i := range reviewsI {
         el := getNthElementAndCast[[]any](reviewsI, i, 0)
-        price := getNthElementAndCast[any](el, 2, 6, 2, 2, 0, 0, 1)
-        if price != nil {
-            entry.PriceRange = price.(string)
-        }
 
         time := getNthElementAndCast[[]any](el, 2, 2, 0, 1, 21, 6, 7)
+
         review := Review{
             Name:           getNthElementAndCast[string](el, 1, 4, 0, 4),
             ProfilePicture: getNthElementAndCast[string](el, 1, 4, 0, 3),
-            When:           fmt.Sprintf("%v-%v-%v", time[0], time[1], time[2]),
-            Rating:         int(getNthElementAndCast[float64](el, 2, 0, 0)),
-            Description:    getNthElementAndCast[string](el, 2, 15, 0, 0),
+            When: func() string {
+                if len(time) < 3 {
+                    return ""
+                }
+
+                return fmt.Sprintf("%v-%v-%v", time[0], time[1], time[2])
+            }(),
+            Rating:      int(getNthElementAndCast[float64](el, 2, 0, 0)),
+            Description: getNthElementAndCast[string](el, 2, 15, 0, 0),
         }
 
         if review.Name == "" {
diff --git a/gmaps/job.go b/gmaps/job.go
index 49ecdba..34294d8 100644
--- a/gmaps/job.go
+++ b/gmaps/job.go
@@ -21,7 +21,7 @@ type GmapJob struct {
     ExtractEmail bool
 }
 
-func NewGmapJob(langCode, query string, maxDepth int, extractEmail bool) *GmapJob {
+func NewGmapJob(id, langCode, query string, maxDepth int, extractEmail bool) *GmapJob {
     query = url.QueryEscape(query)
 
     const (
@@ -29,9 +29,13 @@ func NewGmapJob(langCode, query string, maxDepth int, extractEmail bool) *GmapJo
         prio = scrapemate.PriorityLow
     )
 
+    if id == "" {
+        id = uuid.New().String()
+    }
+
     job := GmapJob{
         Job: scrapemate.Job{
-            ID:        uuid.New().String(),
+            ID:        id,
             Method:    http.MethodGet,
             URL:       "https://www.google.com/maps/search/" + query,
             URLParams: map[string]string{"hl": langCode},
diff --git a/gmaps/place.go b/gmaps/place.go
index a457c0c..c3a60f4 100644
--- a/gmaps/place.go
+++ b/gmaps/place.go
@@ -59,6 +59,8 @@ func (j *PlaceJob) Process(_ context.Context, resp *scrapemate.Response) (any, [
         return nil, nil, err
     }
 
+    entry.ID = j.ParentID
+
     if j.ExtractEmail && entry.IsWebsiteValidForEmail() {
         emailJob := NewEmailJob(j.ID, &entry)
 
diff --git a/main.go b/main.go
index b2fa913..9f97061 100644
--- a/main.go
+++ b/main.go
@@ -220,7 +220,14 @@ func createSeedJobs(langCode string, r io.Reader, maxDepth int, email bool) (job
             continue
         }
 
-        jobs = append(jobs, gmaps.NewGmapJob(langCode, query, maxDepth, email))
+        var id string
+
+        if before, after, ok := strings.Cut(query, "#!#"); ok {
+            query = strings.TrimSpace(before)
+            id = strings.TrimSpace(after)
+        }
+
+        jobs = append(jobs, gmaps.NewGmapJob(id, langCode, query, maxDepth, email))
     }
 
     return jobs, scanner.Err()
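
Below is a minimal standalone sketch (not part of the patch) of the `#!#` parsing behavior the main.go hunk introduces. The sample input lines are invented, and `splitQuery` is a hypothetical helper named here only for illustration; the real code does this splitting inline inside `createSeedJobs` while scanning the input file.

```go
package main

import (
	"fmt"
	"strings"
)

// splitQuery mirrors the strings.Cut logic from the main.go hunk: text before
// "#!#" is the search query, text after it is the caller-supplied input_id.
// When the separator is absent, id stays empty and NewGmapJob falls back to
// generating a UUID.
func splitQuery(line string) (query, id string) {
	if before, after, ok := strings.Cut(line, "#!#"); ok {
		return strings.TrimSpace(before), strings.TrimSpace(after)
	}

	return strings.TrimSpace(line), ""
}

func main() {
	for _, line := range []string{
		"Matsuhisa Athens #!#MyIDentifier", // explicit input_id
		"Matsuhisa Athens",                 // no separator: UUID fallback applies
	} {
		query, id := splitQuery(line)
		fmt.Printf("query=%q id=%q\n", query, id)
	}
}
```

Keeping the UUID fallback inside `NewGmapJob` rather than in the parsing loop means every job ends up with a non-empty ID regardless of how it was constructed, and `PlaceJob` can then propagate that ID to each scraped entry via `entry.ID = j.ParentID`.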