Publish transfermarkt-dataset as a Streamlit app (#104)
dcaribou committed Sep 10, 2022
1 parent cbcbc6a commit e20670e
Showing 37 changed files with 1,497 additions and 1,070 deletions.
1 change: 0 additions & 1 deletion .dockerignore
@@ -6,4 +6,3 @@ infra
data/prep
data/raw
prep/stage
diagram.png
183 changes: 183 additions & 0 deletions .streamlit/config.toml
@@ -0,0 +1,183 @@
# Streamlit version: 1.9.0

[global]

# By default, Streamlit checks if the Python watchdog module is available and, if not, prints a warning asking for you to install it. The watchdog module is not required, but highly recommended. It improves Streamlit's ability to detect changes to files in your filesystem.
# If you'd like to turn off this warning, set this to True.
# Default: false
disableWatchdogWarning = false

# If True, will show a warning when you run a Streamlit-enabled script via "python my_script.py".
# Default: true
showWarningOnDirectExecution = true

# DataFrame serialization.
# Acceptable values: - 'legacy': Serialize DataFrames using Streamlit's custom format. Slow but battle-tested. - 'arrow': Serialize DataFrames using Apache Arrow. Much faster and versatile.
# Default: "arrow"
dataFrameSerialization = "arrow"


[logger]

# Level of logging: 'error', 'warning', 'info', or 'debug'.
# Default: 'info'
level = "debug"

# String format for logging messages. If logger.datetimeFormat is set, logger messages will default to `%(asctime)s.%(msecs)03d %(message)s`. See [Python's documentation](https://docs.python.org/2.6/library/logging.html#formatter-objects) for available attributes.
# Default: "%(asctime)s %(message)s"
messageFormat = "%(asctime)s %(message)s"


[client]

# Whether to enable st.cache.
# Default: true
caching = true

# If false, makes your Streamlit script not draw to a Streamlit app.
# Default: true
displayEnabled = true

# Controls whether uncaught app exceptions are displayed in the browser. By default, this is set to True and Streamlit displays app exceptions and associated tracebacks in the browser.
# If set to False, an exception will result in a generic message being shown in the browser, and exceptions and tracebacks will be printed to the console only.
# Default: true
showErrorDetails = true


[runner]

# Allows you to type a variable or string by itself in a single line of Python code to write it to the app.
# Default: true
magicEnabled = true

# Install a Python tracer to allow you to stop or pause your script at any point and introspect it. As a side-effect, this slows down your script's execution.
# Default: false
installTracer = false

# Sets the MPLBACKEND environment variable to Agg inside Streamlit to prevent Python crashing.
# Default: true
fixMatplotlib = true

# Run the Python Garbage Collector after each script execution. This can help avoid excess memory use in Streamlit apps, but could introduce delay in rerunning the app script for high-memory-use applications.
# Default: true
postScriptGC = true

# Handle script rerun requests immediately, rather than waiting for script execution to reach a yield point. Enabling this will make Streamlit much more responsive to user interaction, but it can lead to race conditions in apps that mutate session_state data outside of explicit session_state assignment statements.
# Default: false
fastReruns = false


[server]

# List of folders that should not be watched for changes. This impacts both "Run on Save" and @st.cache.
# Relative paths will be taken as relative to the current working directory.
# Example: ['/home/user1/env', 'relative/path/to/folder']
# Default: []
folderWatchBlacklist = []

# Change the type of file watcher used by Streamlit, or turn it off completely.
# Allowed values: * "auto" : Streamlit will attempt to use the watchdog module, and falls back to polling if watchdog is not available. * "watchdog" : Force Streamlit to use the watchdog module. * "poll" : Force Streamlit to always use polling. * "none" : Streamlit will not watch files.
# Default: "auto"
fileWatcherType = "auto"

# Symmetric key used to produce signed cookies. If deploying on multiple replicas, this should be set to the same value across all replicas to ensure they all share the same secret.
# Default: randomly generated secret key.
# cookieSecret =

# If false, will attempt to open a browser window on start.
# Default: false unless (1) we are on a Linux box where DISPLAY is unset, or (2) we are running in the Streamlit Atom plugin.
headless = true

# Automatically rerun script when the file is modified on disk.
# Default: false
runOnSave = false

# The address where the server will listen for client and browser connections. Use this if you want to bind the server to a specific address. If set, the server will only be accessible from this address, and not from any aliases (like localhost).
# Default: (unset)
# address =

# The port where the server will listen for browser connections.
# Default: 8501
port = 8080

# The base path for the URL where Streamlit should be served from.
# Default: ""
baseUrlPath = ""

# Enables support for Cross-Origin Request Sharing (CORS) protection, for added security.
# Due to conflicts between CORS and XSRF, if `server.enableXsrfProtection` is on and `server.enableCORS` is off at the same time, we will prioritize `server.enableXsrfProtection`.
# Default: true
enableCORS = true

# Enables support for Cross-Site Request Forgery (XSRF) protection, for added security.
# Due to conflicts between CORS and XSRF, if `server.enableXsrfProtection` is on and `server.enableCORS` is off at the same time, we will prioritize `server.enableXsrfProtection`.
# Default: true
enableXsrfProtection = true

# Max size, in megabytes, for files uploaded with the file_uploader.
# Default: 200
maxUploadSize = 200

# Max size, in megabytes, of messages that can be sent via the WebSocket connection.
# Default: 200
maxMessageSize = 200

# Enables support for websocket compression.
# Default: false
enableWebsocketCompression = true


[browser]

# Internet address where users should point their browsers in order to connect to the app. Can be IP address or DNS name and path.
# This is used to: - Set the correct URL for CORS and XSRF protection purposes. - Show the URL on the terminal - Open the browser
# Default: 'localhost'
serverAddress = "localhost"

# Whether to send usage statistics to Streamlit.
# Default: true
gatherUsageStats = true

# Port where users should point their browsers in order to connect to the app.
# This is used to: - Set the correct URL for CORS and XSRF protection purposes. - Show the URL on the terminal - Open the browser
# Default: whatever value is set in server.port.
serverPort = 8080


[mapbox]

# Configure Streamlit to use a custom Mapbox token for elements like st.pydeck_chart and st.map. To get a token for yourself, create an account at https://mapbox.com. It's free (for moderate usage levels)!
# Default: ""
token = ""


[deprecation]

# Set to false to disable the deprecation warning for the file uploader encoding.
# Default: true
showfileUploaderEncoding = true

# Set to false to disable the deprecation warning for using the global pyplot instance.
# Default: true
showPyplotGlobalUse = true


[theme]

# The preset Streamlit theme that your custom theme inherits from. One of "light" or "dark".
base = "dark"

# Primary accent color for interactive elements.
# primaryColor =

# Background color for the main content area.
# backgroundColor =

# Background color used for the sidebar and most interactive widgets.
# secondaryBackgroundColor =

# Color used for almost all text.
# textColor =

# Font family for all text in the app, except code blocks. One of "sans serif", "serif", or "monospace".
font = "monospace"
2 changes: 1 addition & 1 deletion 1_acquire.py
@@ -22,7 +22,7 @@
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging

from cloud_lib import submit_batch_job_and_wait
from transfermarkt_datasets.core.utils import submit_batch_job_and_wait

class Asset():
"""A wrapper for the asset to be acquired.
2 changes: 1 addition & 1 deletion 2_prepare.py
@@ -45,7 +45,7 @@ def prepare_on_cloud(
job_definition, branch, message, args,
func):

from cloud_lib import submit_batch_job_and_wait
from transfermarkt_datasets.core.utils import submit_batch_job_and_wait

submit_batch_job_and_wait(
job_name=job_name,
18 changes: 17 additions & 1 deletion Dockerfile
@@ -19,6 +19,22 @@ RUN git config --global user.email "transfermarkt-datasets-ci@transfermark-datas
git config --global user.name "CI Job" && \
git config --global core.sshCommand "ssh -o StrictHostKeyChecking=no"

# Creating folders, and files for the project

COPY bootstrap.sh /app/
COPY Makefile /app/

COPY streamlit/ /app/streamlit/
COPY .streamlit/ /app/.streamlit/
COPY resources /app/resources

COPY transfermarkt_datasets/ /app/transfermarkt_datasets/
COPY config.yml /app/config.yml

COPY .dvc/config /app/.dvc/config
COPY data/prep.dvc /app/data/prep.dvc

COPY .git /app/.git

ENTRYPOINT ["/bin/bash", "bootstrap.sh"]
ENTRYPOINT [ "/bin/sh", "-c" ]
CMD make streamlit_cloud
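
With `ENTRYPOINT ["/bin/sh", "-c"]` and `CMD make streamlit_cloud`, the container ultimately runs `streamlit run --server.port ${PORT} …`, so it expects a `PORT` environment variable at runtime (Heroku injects one automatically). A rough sketch of exercising the image locally — the tag mirrors the Makefile's `build` target, and the port value is an arbitrary choice:

```bash
# Build the image and run it roughly the way Heroku would, supplying PORT ourselves
docker build -t dcaribou/transfermarkt-datasets:dev .
docker run --rm -e PORT=8080 -p 8080:8080 dcaribou/transfermarkt-datasets:dev
```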
30 changes: 26 additions & 4 deletions Makefile
@@ -2,21 +2,27 @@ PLATFORM = linux/arm64 # linux/amd64
BRANCH = $(shell git rev-parse --abbrev-ref HEAD)
JOB_NAME = on-cli

build :
docker build --platform=$(PLATFORM) -t dcaribou/transfermarkt-datasets:dev .
build :
docker build \
--platform=$(PLATFORM) \
--tag dcaribou/transfermarkt-datasets:dev \
--tag registry.heroku.com/transfermarkt-datasets/web \
.

push :
docker push dcaribou/transfermarkt-datasets:dev

acquire_local :
python 1_acquire.py local $(ARGS)

acquire_docker :
docker run -ti \
--env-file .env \
-v `pwd`/.:/app/transfermarkt-datasets/ \
--memory=4g \
dcaribou/transfermarkt-datasets:dev \
python 1_acquire.py local $(ARGS)

acquire_cloud : JOB_DEFINITION_NAME = transfermarkt-datasets-batch-job-definition-dev
acquire_cloud : ARGS = --asset all --season 2022
acquire_cloud :
@@ -28,6 +34,7 @@ acquire_cloud :

prepare_local :
python -Wignore 2_prepare.py local $(ARGS)

prepare_docker :
docker run -ti \
--env-file .env \
@@ -43,6 +50,21 @@ prepare_cloud :
--job-definition $(JOB_DEFINITION_NAME) \
""

sync: MESSAGE = Manual sync
sync:
sync : MESSAGE = Manual sync
sync :
python 3_sync.py --message "$(MESSAGE)" --season 2022

streamlit_deploy :
docker push registry.heroku.com/transfermarkt-datasets/web && \
heroku container:release web

streamlit_local :
streamlit run streamlit/01_👋_about.py

streamlit_cloud :
streamlit run \
--server.port ${PORT} \
streamlit/01_👋_about.py

dagit_local :
dagit -f transfermarkt_datasets/dagster/jobs.py
21 changes: 14 additions & 7 deletions README.md
@@ -1,16 +1,21 @@
# transfermarkt-datasets

| ![diagram](resources/diagram.svg) |
|:--:|
| *High level data model for transfermarkt-datasets* |

Checkout this dataset also in: :white_check_mark: [Kaggle](https://www.kaggle.com/davidcariboo/player-scores) | :white_check_mark: [data.world](https://data.world/dcereijo/player-scores) |
:white_check_mark: [streamlit](https://transfermarkt-datasets.herokuapp.com/)

------

In a nutshell, this project aims to do three things:

1. Acquire data from transfermarkt website using the [trasfermarkt-scraper](https://github.com/dcaribou/transfermarkt-scraper).
2. Build a **clean, public football (soccer) dataset** using data in 1.
3. Automate 1 and 2 to **keep these assets up to date** and publicly available on some well-known data catalogs.

Checkout this dataset also in: :white_check_mark: [Kaggle](https://www.kaggle.com/davidcariboo/player-scores) | :white_check_mark: [data.world](https://data.world/dcereijo/player-scores)

| ![diagram](resources/diagram.png) |
|:--:|
| *High level data model for transfermarkt-datasets* |
Continue reading this `README` to learn about the different components of this project and how you can set up your environment to run it locally.

- [setup](#setup)
- [data storage](#data-storage)
@@ -22,6 +27,8 @@ Checkout this dataset also in: :white_check_mark: [Kaggle](https://www.kaggle.co
- [infra](#infra)
- [contributing :pray:](#contributing-pray)

------

## setup
Set up your local environment to run the project with `poetry`.
1. Install [poetry](https://python-poetry.org/docs/)
@@ -38,10 +45,10 @@ poetry install
## data storage
> :information_source: Read access to the S3 [DVC remote storage](https://dvc.org/doc/command-reference/remote#description) for the project is required to successfully run `dvc pull`. Contributors should feel free to grant themselves access by adding their AWS IAM user ARN to [this whitelist](https://github.com/dcaribou/transfermarkt-datasets/blob/6b6dd6572f582b2c40039913a65ba99d10fd1f44/infra/main.tf#L16).
All project data assets are kept inside the `data` folder. This is a [DVC](https://dvc.org/) repository and all files a therefore all files can be pulled from the remote storage with the `dvc pull` command.
All project data assets are kept inside the `data` folder. This is a [DVC](https://dvc.org/) repository and therefore all files can be pulled from the remote storage with the `dvc pull` command.

* `data/raw`: contains raw data per season as acquired with [trasfermarkt-scraper](https://github.com/dcaribou/transfermarkt-scraper) (check [acquire](#acquire))
* `data/prep`: contains the prepared datasets as produced by `transfermarkt_datasets` module (check [prepare](#prepare))
* `data/prep`: contains the prepared datasets as produced by `transfermarkt_datasets` module (check [prepare](#data-preparation))

## data acquisition
In the scope of this project, "acquiring" is the process of collecting "raw data", as it is produced by [trasfermarkt-scraper](https://github.com/dcaribou/transfermarkt-scraper). Acquired data lives in the `data/raw` folder and it can be created or updated for a particular season using the `1_acquire.py` script.
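
Putting the README and Makefile changes together, a local end-to-end pass might look like the sketch below; the `--asset all --season 2022` arguments are borrowed from the `acquire_cloud` defaults and are only illustrative:

```bash
# Hypothetical local workflow assembled from the Makefile targets in this commit
make acquire_local ARGS="--asset all --season 2022"   # scrape raw data into data/raw
make prepare_local                                     # build the prepared datasets in data/prep
make streamlit_local                                   # browse them in the Streamlit app
```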
bootstrap.sh changed file mode 100644 → 100755 (no content changes)
