Skip to content

Commit

Permalink
Merge pull request #2237 from sudoAlireza/colly
Browse files Browse the repository at this point in the history
add colly scraping and storing to db example
  • Loading branch information
ReneWerner87 committed Feb 26, 2024
2 parents a36a82c + de7171a commit a706c11
Show file tree
Hide file tree
Showing 17 changed files with 1,007 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Here you can find the most **delicious** recipes to cook delicious meals using o
- [Certificates from Let's Encrypt](/autocert)
- [Clean Architecture](/clean-architecture)
- [Cloud Run](/cloud-run)
- [Colly Scraping using Fiber and PostgreSQL](/fiber-colly-gorm)
- [CSRF-with-Session](/csrf-with-session)
- [CSRF](/csrf)
- [Custom 404 Not Found](/404-handler)
Expand Down
18 changes: 18 additions & 0 deletions aws-sam/app/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY=
github.com/aws/aws-lambda-go v1.46.0/go.mod h1:dpMpZgvWx5vuQJfBt0zqBha60q7Dd7RfgJv23DymV8A=
github.com/awslabs/aws-lambda-go-api-proxy v0.16.1/go.mod h1:31WDgvTzVyra022CWzO6uEZFel9/y7QKaZpUQEqYLr0=
github.com/gofiber/fiber/v2 v2.52.1/go.mod h1:KEOE+cXMhXG0zHc9d8+E38hoX+ZN7bhOtgeF2oT6jrQ=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/klauspost/compress v1.17.6/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.52.0/go.mod h1:hf5C4QnVMkNXMspnsUlfM3WitlgYflyhHYoKol/szxQ=
github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
16 changes: 16 additions & 0 deletions fiber-colly-gorm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
## Simple Web Scraping Colly App with Fiber

This is a basic Go application that uses the Fiber framework to trigger web-scraping tasks built with Colly.

### How to Run

1. Clone the repository.
2. Navigate to the project directory.
3. Run `docker compose up --build`.
4. Visit `http://127.0.0.1:3000/api/healthchecker` in a web browser or use a tool like `curl` to test it.
5. Send a `GET` request to `http://127.0.0.1:3000/scrape/coursera` to start scraping Coursera courses, or to `http://127.0.0.1:3000/scrape/quotes` to scrape `quotes.toscrape.com`.


### What It Does

- Scrapes data from websites and stores it in a PostgreSQL database.
8 changes: 8 additions & 0 deletions fiber-colly-gorm/app/.idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions fiber-colly-gorm/app/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Image that runs the API straight from source with the Go 1.20 toolchain.
FROM golang:1.20

# Use apt-get rather than apt: apt has no stable CLI for scripted use and
# prints a warning when run non-interactively. The package lists are removed
# in the same layer so they do not end up in the image.
RUN apt-get update && apt-get upgrade -y && \
    apt-get install -y git && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /go/src/app

# Copy the whole build context into the working directory.
COPY . ./

# Resolve and verify module dependencies at build time.
RUN go mod tidy && go mod verify

# Compile and start the API package on container start.
ENTRYPOINT [ "go", "run", "./cmd/api" ]
7 changes: 7 additions & 0 deletions fiber-colly-gorm/app/app.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
POSTGRES_HOST=colly_db
POSTGRES_PORT=5432
POSTGRES_USER=postgres
POSTGRES_PASSWORD=postgres
POSTGRES_DB=colly

DATABASE_URL=postgres://postgres:postgres@colly_db:5432/colly?schema=public
7 changes: 7 additions & 0 deletions fiber-colly-gorm/app/app.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
POSTGRES_HOST=colly_db
POSTGRES_PORT=5432
POSTGRES_USER=postgres
POSTGRES_PASSWORD=postgres
POSTGRES_DB=colly

DATABASE_URL=postgres://postgres:postgres@colly_db:5432/colly?schema=public
61 changes: 61 additions & 0 deletions fiber-colly-gorm/app/cmd/api/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package main

import (
"log"

"fiber-colly-gorm/internals/consts"
"fiber-colly-gorm/internals/services/database"
"fiber-colly-gorm/internals/services/scrapers"

"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/cors"
"github.com/gofiber/fiber/v2/middleware/logger"
)

// main loads the configuration from the current directory, passes it to
// database.ConnectDb, and serves the health-check and scrape-trigger
// endpoints on port 3000.
//
// Routes:
//
//	GET /api/healthchecker  - static success payload
//	GET /scrape/quotes      - starts scrapers.Quotes in the background
//	GET /scrape/coursera    - starts scrapers.CourseraCourses in the background
func main() {
	config, err := consts.LoadConfig(".")
	if err != nil {
		log.Fatalln("Failed to load environment variables!\n", err.Error())
	}
	database.ConnectDb(&config)

	app := fiber.New()
	micro := fiber.New()
	scrape := fiber.New()

	// Global middleware must be registered before the sub-apps are mounted.
	// Fiber runs handlers in registration order, and the mounted handlers
	// below respond without calling c.Next(), so middleware added after
	// Mount would never run for /api/* or /scrape/* requests.
	app.Use(logger.New())
	app.Use(cors.New(cors.Config{
		AllowOrigins:     "http://localhost:3000",
		AllowHeaders:     "Origin, Content-Type, Accept",
		AllowMethods:     "GET",
		AllowCredentials: true,
	}))

	app.Mount("/api", micro)
	app.Mount("/scrape", scrape)

	micro.Get("/healthchecker", func(c *fiber.Ctx) error {
		return c.Status(fiber.StatusOK).JSON(fiber.Map{
			"status":  "success",
			"message": "Welcome to Golang, Fiber, and Colly",
		})
	})

	scrape.Get("/quotes", func(c *fiber.Ctx) error {
		// Fire-and-forget: the scraper runs in its own goroutine, so the
		// response only confirms that scraping was started, not that it
		// finished or succeeded.
		go scrapers.Quotes()
		return c.Status(fiber.StatusOK).JSON(fiber.Map{
			"status":  "success",
			"message": "Start scraping quotes.toscrape.com ...",
		})
	})

	scrape.Get("/coursera", func(c *fiber.Ctx) error {
		// Fire-and-forget, same as the quotes endpoint.
		go scrapers.CourseraCourses()
		return c.Status(fiber.StatusOK).JSON(fiber.Map{
			"status":  "success",
			"message": "Start scraping courses details from coursera.org...",
		})
	})

	log.Fatal(app.Listen(":3000"))
}
58 changes: 58 additions & 0 deletions fiber-colly-gorm/app/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
module fiber-colly-gorm

go 1.20

require (
github.com/gocolly/colly v1.2.0
github.com/gofiber/fiber/v2 v2.52.1
github.com/spf13/viper v1.16.0
gorm.io/driver/postgres v1.5.2
gorm.io/gorm v1.25.3
)

require (
github.com/PuerkitoBio/goquery v1.8.1 // indirect
github.com/andybalholm/brotli v1.0.5 // indirect
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/antchfx/htmlquery v1.3.0 // indirect
github.com/antchfx/xmlquery v1.3.17 // indirect
github.com/antchfx/xpath v1.2.4 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/gobwas/glob v0.2.3 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/google/uuid v1.5.0 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect
github.com/jackc/pgx/v5 v5.3.1 // indirect
github.com/jinzhu/inflection v1.0.0 // indirect
github.com/jinzhu/now v1.1.5 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/klauspost/compress v1.17.0 // indirect
github.com/magiconair/properties v1.8.7 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/pelletier/go-toml/v2 v2.0.8 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
github.com/spf13/afero v1.9.5 // indirect
github.com/spf13/cast v1.5.1 // indirect
github.com/spf13/jwalterweatherman v1.1.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/subosito/gotenv v1.4.2 // indirect
github.com/temoto/robotstxt v1.1.2 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasthttp v1.51.0 // indirect
github.com/valyala/tcplisten v1.0.0 // indirect
golang.org/x/crypto v0.14.0 // indirect
golang.org/x/net v0.17.0 // indirect
golang.org/x/sys v0.15.0 // indirect
golang.org/x/text v0.13.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.30.0 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
Loading

0 comments on commit a706c11

Please sign in to comment.