Skip to content

Commit

Permalink
Merge pull request #27 from crawlab-team/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
tikazyq committed Mar 23, 2023
2 parents e75f24b + a713036 commit 1917e84
Show file tree
Hide file tree
Showing 85 changed files with 3,397 additions and 908 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ data/
.ipynb_checkpoints/
.idea/
.models/
dev/
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
WEBSPOT_DATABASE_URL=
4 changes: 3 additions & 1 deletion .github/workflows/dockerpublish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ name: "Docker Image CI: websplot"

on:
push:
branches: [ main ]
branches:
- main
- develop

env:
IMAGE_NAME: crawlabteam/webspot
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/main_webspot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:
push:
branches:
- main
- develop
workflow_dispatch:

jobs:
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,7 @@ node_modules/

*.db
*.db-journal

dev/data/

*.lock
46 changes: 44 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# Build stage: compiles the Go sources from ./webspot_rod. The final stage
# copies the resulting /go/bin/webspot_rod binary out of this stage.
FROM golang:1.19 AS build

WORKDIR /go/src/app
COPY ./webspot_rod .

# key=value form; the space-separated `ENV key value` syntax is legacy.
ENV GO111MODULE=on

# Sync go.mod/go.sum with the sources, then build and install all packages.
RUN go mod tidy \
    && go install -v ./...

FROM python:3.10.9

# Working directory
Expand All @@ -7,6 +17,36 @@ WORKDIR /app
RUN echo `uname -a`
RUN echo `python --version`

# Install supervisor (entrypoint.sh launches supervisord).
# The package lists are removed in the same layer to keep it small.
RUN apt-get update \
    && apt-get install -y supervisor \
    && rm -rf /var/lib/apt/lists/*

# Install the shared libraries and fonts listed below.
# `apt-get update` runs in the same RUN as the install so that a cached layer
# holding stale package lists is never reused for a changed package set.
RUN apt-get update \
    && apt-get -qq install --no-install-recommends -y \
# chromium dependencies
        libnss3 \
        libxss1 \
        libasound2 \
        libxtst6 \
        libgtk-3-0 \
        libgbm1 \
        ca-certificates \
# fonts
        fonts-liberation fonts-noto-color-emoji fonts-noto-cjk \
    && rm -rf /var/lib/apt/lists/*

# Install the OpenSSH server (started at container launch by entrypoint.sh)
# and set the root password.
# NOTE(review): the root password is hard-coded and readable by anyone who can
# pull the image. This looks like the Azure App Service custom-container SSH
# convention (password "Docker!", port 2222) — confirm, and make sure the SSH
# port is never published outside that platform.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        dialog \
        openssh-server \
    && rm -rf /var/lib/apt/lists/* \
    && echo "root:Docker!" | chpasswd
COPY webspot_rod/conf/sshd_config /etc/ssh/

# Document the container's listening ports.
# Presumably 2222 is the SSH port configured in sshd_config and 8000 belongs
# to the webspot_rod service — TODO confirm against webspot_rod/conf.
EXPOSE 8000 2222

# Copy the webspot_rod binary produced by the `build` stage
COPY --from=build /go/bin/webspot_rod /go/bin/webspot_rod
# Supervisor configuration; entrypoint.sh passes this path to supervisord
COPY webspot_rod/conf/supervisord.conf /etc/supervisor/supervisord.conf

# Install requirements
COPY ./requirements.txt /app
RUN pip install --upgrade pip
Expand All @@ -22,5 +62,7 @@ ADD . /app
ENV PORT 80
EXPOSE 80

ENTRYPOINT ["python", "main.py"]
CMD ["web"]
# Change mode of entrypoint.sh
RUN chmod u+x ./entrypoint.sh

CMD ["sh", "entrypoint.sh"]
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,22 @@ Then you can access the web UI at http://localhost:80.
webspot web --help
```

## Architecture

### Overview

The overall process of how Webspot detects meaningful elements from HTML or web pages is shown in the following figure.

```mermaid
graph LR
hr[HtmlRequester]
gl[GraphLoader]
d[Detector]
r[Results]
hr --"html + json"--> gl --"graph"--> d --"output"--> r
```

## Development

Developing with Webspot is easy. Follow the steps below to get started.
Expand Down
12 changes: 12 additions & 0 deletions dev/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Development-only MySQL instance.
version: "3.3"
services:
  mysql:
    image: mysql:8
    container_name: mysql
    ports:
      # host port 3306 -> container port 3306
      - "3306:3306"
    environment:
      # Database named `webspot`; an empty root password is permitted.
      - MYSQL_DATABASE=webspot
      - MYSQL_ALLOW_EMPTY_PASSWORD=yes
    volumes:
      # Database files are kept on the host under ./data/mysql
      - ./data/mysql:/var/lib/mysql
13 changes: 13 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
version: '3.3'
services:
  # Webspot application; reaches MongoDB through the service name `mongo`.
  webspot:
    image: crawlabteam/webspot:develop
    container_name: webspot
    environment:
      WEBSPOT_DATABASE_URL: mongodb://mongo:27017/webspot
    ports:
      # host port 9999 -> container port 80
      - '9999:80'
    # Start the database container before the application. This only orders
    # container start-up; it does not wait until MongoDB accepts connections.
    depends_on:
      - mongo

  # MongoDB instance referenced by WEBSPOT_DATABASE_URL above.
  mongo:
    image: mongo:4.2
    container_name: webspot_mongo
8 changes: 8 additions & 0 deletions entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/sh
# Container entrypoint: starts the SSH service, then hands over to supervisord.
# Abort immediately if any command fails.
set -e

# Start ssh
service ssh start

# Start supervisor.
# `exec` replaces this shell with supervisord, so supervisord receives the
# container's stop signals (e.g. SIGTERM from `docker stop`) directly instead
# of them being delivered to the wrapper shell.
exec supervisord -c /etc/supervisor/supervisord.conf
7 changes: 6 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from argparse import ArgumentParser

from dotenv import load_dotenv

from webspot.cmd.crawl import cmd_crawl
from webspot.cmd.request import cmd_request
from webspot.cmd.web import cmd_web
from webspot.constants.html_request_method import HTML_REQUEST_METHOD_REQUEST

load_dotenv()

parser = ArgumentParser()

Expand All @@ -27,7 +32,7 @@

request_parser = subparsers.add_parser('request')
request_parser.add_argument('--url', '-U', help='url to request', required=True)
request_parser.add_argument('--method', '-M', help='request method', default='rod')
request_parser.add_argument('--method', '-M', help='request method', default=HTML_REQUEST_METHOD_REQUEST)
request_parser.set_defaults(func=cmd_request)

if __name__ == '__main__':
Expand Down
Loading

0 comments on commit 1917e84

Please sign in to comment.