Publish transfermarkt-dataset as a Streamlit app (#104)
dcaribou committed Sep 10, 2022
1 parent cbcbc6a commit e20670e
Showing 37 changed files with 1,497 additions and 1,070 deletions.
1 change: 0 additions & 1 deletion .dockerignore
@@ -6,4 +6,3 @@ infra
data/prep
data/raw
prep/stage
diagram.png
183 changes: 183 additions & 0 deletions .streamlit/config.toml
@@ -0,0 +1,183 @@
# Streamlit version: 1.9.0

[global]

# By default, Streamlit checks if the Python watchdog module is available and, if not, prints a warning asking for you to install it. The watchdog module is not required, but highly recommended. It improves Streamlit's ability to detect changes to files in your filesystem.
# If you'd like to turn off this warning, set this to True.
# Default: false
disableWatchdogWarning = false

# If True, will show a warning when you run a Streamlit-enabled script via "python my_script.py".
# Default: true
showWarningOnDirectExecution = true

# DataFrame serialization.
# Acceptable values: - 'legacy': Serialize DataFrames using Streamlit's custom format. Slow but battle-tested. - 'arrow': Serialize DataFrames using Apache Arrow. Much faster and versatile.
# Default: "arrow"
dataFrameSerialization = "arrow"


[logger]

# Level of logging: 'error', 'warning', 'info', or 'debug'.
# Default: 'info'
level = "debug"

# String format for logging messages. If logger.datetimeFormat is set, logger messages will default to `%(asctime)s.%(msecs)03d %(message)s`. See [Python's documentation](https://docs.python.org/2.6/library/logging.html#formatter-objects) for available attributes.
# Default: "%(asctime)s %(message)s"
messageFormat = "%(asctime)s %(message)s"


[client]

# Whether to enable st.cache.
# Default: true
caching = true

# If false, makes your Streamlit script not draw to a Streamlit app.
# Default: true
displayEnabled = true

# Controls whether uncaught app exceptions are displayed in the browser. By default, this is set to True and Streamlit displays app exceptions and associated tracebacks in the browser.
# If set to False, an exception will result in a generic message being shown in the browser, and exceptions and tracebacks will be printed to the console only.
# Default: true
showErrorDetails = true


[runner]

# Allows you to type a variable or string by itself in a single line of Python code to write it to the app.
# Default: true
magicEnabled = true

# Install a Python tracer to allow you to stop or pause your script at any point and introspect it. As a side-effect, this slows down your script's execution.
# Default: false
installTracer = false

# Sets the MPLBACKEND environment variable to Agg inside Streamlit to prevent Python crashing.
# Default: true
fixMatplotlib = true

# Run the Python Garbage Collector after each script execution. This can help avoid excess memory use in Streamlit apps, but could introduce delay in rerunning the app script for high-memory-use applications.
# Default: true
postScriptGC = true

# Handle script rerun requests immediately, rather than waiting for script execution to reach a yield point. Enabling this will make Streamlit much more responsive to user interaction, but it can lead to race conditions in apps that mutate session_state data outside of explicit session_state assignment statements.
# Default: false
fastReruns = false


[server]

# List of folders that should not be watched for changes. This impacts both "Run on Save" and @st.cache.
# Relative paths will be taken as relative to the current working directory.
# Example: ['/home/user1/env', 'relative/path/to/folder']
# Default: []
folderWatchBlacklist = []

# Change the type of file watcher used by Streamlit, or turn it off completely.
# Allowed values: * "auto" : Streamlit will attempt to use the watchdog module, and falls back to polling if watchdog is not available. * "watchdog" : Force Streamlit to use the watchdog module. * "poll" : Force Streamlit to always use polling. * "none" : Streamlit will not watch files.
# Default: "auto"
fileWatcherType = "auto"

# Symmetric key used to produce signed cookies. If deploying on multiple replicas, this should be set to the same value across all replicas to ensure they all share the same secret.
# Default: randomly generated secret key.
# cookieSecret =

# If false, will attempt to open a browser window on start.
# Default: false unless (1) we are on a Linux box where DISPLAY is unset, or (2) we are running in the Streamlit Atom plugin.
headless = true

# Automatically rerun script when the file is modified on disk.
# Default: false
runOnSave = false

# The address where the server will listen for client and browser connections. Use this if you want to bind the server to a specific address. If set, the server will only be accessible from this address, and not from any aliases (like localhost).
# Default: (unset)
# address =

# The port where the server will listen for browser connections.
# Default: 8501
port = 8080

# The base path for the URL where Streamlit should be served from.
# Default: ""
baseUrlPath = ""

# Enables support for Cross-Origin Request Sharing (CORS) protection, for added security.
# Due to conflicts between CORS and XSRF, if `server.enableXsrfProtection` is on and `server.enableCORS` is off at the same time, we will prioritize `server.enableXsrfProtection`.
# Default: true
enableCORS = true

# Enables support for Cross-Site Request Forgery (XSRF) protection, for added security.
# Due to conflicts between CORS and XSRF, if `server.enableXsrfProtection` is on and `server.enableCORS` is off at the same time, we will prioritize `server.enableXsrfProtection`.
# Default: true
enableXsrfProtection = true

# Max size, in megabytes, for files uploaded with the file_uploader.
# Default: 200
maxUploadSize = 200

# Max size, in megabytes, of messages that can be sent via the WebSocket connection.
# Default: 200
maxMessageSize = 200

# Enables support for websocket compression.
# Default: false
enableWebsocketCompression = true


[browser]

# Internet address where users should point their browsers in order to connect to the app. Can be IP address or DNS name and path.
# This is used to: - Set the correct URL for CORS and XSRF protection purposes. - Show the URL on the terminal - Open the browser
# Default: 'localhost'
serverAddress = "localhost"

# Whether to send usage statistics to Streamlit.
# Default: true
gatherUsageStats = true

# Port where users should point their browsers in order to connect to the app.
# This is used to: - Set the correct URL for CORS and XSRF protection purposes. - Show the URL on the terminal - Open the browser
# Default: whatever value is set in server.port.
serverPort = 8080


[mapbox]

# Configure Streamlit to use a custom Mapbox token for elements like st.pydeck_chart and st.map. To get a token for yourself, create an account at https://mapbox.com. It's free (for moderate usage levels)!
# Default: ""
token = ""


[deprecation]

# Set to false to disable the deprecation warning for the file uploader encoding.
# Default: true
showfileUploaderEncoding = true

# Set to false to disable the deprecation warning for using the global pyplot instance.
# Default: true
showPyplotGlobalUse = true


[theme]

# The preset Streamlit theme that your custom theme inherits from. One of "light" or "dark".
base = "dark"

# Primary accent color for interactive elements.
# primaryColor =

# Background color for the main content area.
# backgroundColor =

# Background color used for the sidebar and most interactive widgets.
# secondaryBackgroundColor =

# Color used for almost all text.
# textColor =

# Font family for all text in the app, except code blocks. One of "sans serif", "serif", or "monospace".
font = "monospace"
2 changes: 1 addition & 1 deletion 1_acquire.py
@@ -22,7 +22,7 @@
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging

from cloud_lib import submit_batch_job_and_wait
from transfermarkt_datasets.core.utils import submit_batch_job_and_wait

class Asset():
"""A wrapper for the asset to be acquired.
2 changes: 1 addition & 1 deletion 2_prepare.py
@@ -45,7 +45,7 @@ def prepare_on_cloud(
job_definition, branch, message, args,
func):

from cloud_lib import submit_batch_job_and_wait
from transfermarkt_datasets.core.utils import submit_batch_job_and_wait

submit_batch_job_and_wait(
job_name=job_name,
18 changes: 17 additions & 1 deletion Dockerfile
@@ -19,6 +19,22 @@ RUN git config --global user.email "transfermarkt-datasets-ci@transfermark-datas
git config --global user.name "CI Job" && \
git config --global core.sshCommand "ssh -o StrictHostKeyChecking=no"

# Creating folders, and files for the project

COPY bootstrap.sh /app/
COPY Makefile /app/

COPY streamlit/ /app/streamlit/
COPY .streamlit/ /app/.streamlit/
COPY resources /app/resources

COPY transfermarkt_datasets/ /app/transfermarkt_datasets/
COPY config.yml /app/config.yml

COPY .dvc/config /app/.dvc/config
COPY data/prep.dvc /app/data/prep.dvc

COPY .git /app/.git

ENTRYPOINT ["/bin/bash", "bootstrap.sh"]
ENTRYPOINT [ "/bin/sh", "-c" ]
CMD make streamlit_cloud
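
With `ENTRYPOINT ["/bin/sh", "-c"]` and `CMD make streamlit_cloud`, the container ultimately runs `streamlit run --server.port ${PORT} …`, so it expects a `PORT` environment variable at runtime (Heroku injects one automatically). A rough sketch of exercising the image locally — the tag mirrors the Makefile's `build` target, and the port value is an arbitrary choice:

```bash
# Build the image and run it roughly the way Heroku would, supplying PORT ourselves
docker build -t dcaribou/transfermarkt-datasets:dev .
docker run --rm -e PORT=8080 -p 8080:8080 dcaribou/transfermarkt-datasets:dev
```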
30 changes: 26 additions & 4 deletions Makefile
@@ -2,21 +2,27 @@ PLATFORM = linux/arm64 # linux/amd64
BRANCH = $(shell git rev-parse --abbrev-ref HEAD)
JOB_NAME = on-cli

build :
docker build --platform=$(PLATFORM) -t dcaribou/transfermarkt-datasets:dev .
build :
docker build \
--platform=$(PLATFORM) \
--tag dcaribou/transfermarkt-datasets:dev \
--tag registry.heroku.com/transfermarkt-datasets/web \
.

push :
docker push dcaribou/transfermarkt-datasets:dev

acquire_local :
python 1_acquire.py local $(ARGS)

acquire_docker :
docker run -ti \
--env-file .env \
-v `pwd`/.:/app/transfermarkt-datasets/ \
--memory=4g \
dcaribou/transfermarkt-datasets:dev \
python 1_acquire.py local $(ARGS)

acquire_cloud : JOB_DEFINITION_NAME = transfermarkt-datasets-batch-job-definition-dev
acquire_cloud : ARGS = --asset all --season 2022
acquire_cloud :
@@ -28,6 +34,7 @@ acquire_cloud :

prepare_local :
python -Wignore 2_prepare.py local $(ARGS)

prepare_docker :
docker run -ti \
--env-file .env \
@@ -43,6 +50,21 @@ prepare_cloud :
--job-definition $(JOB_DEFINITION_NAME) \
""

sync: MESSAGE = Manual sync
sync:
sync : MESSAGE = Manual sync
sync :
python 3_sync.py --message "$(MESSAGE)" --season 2022

streamlit_deploy :
docker push registry.heroku.com/transfermarkt-datasets/web && \
heroku container:release web

streamlit_local :
streamlit run streamlit/01_👋_about.py

streamlit_cloud :
streamlit run \
--server.port ${PORT} \
streamlit/01_👋_about.py

dagit_local :
dagit -f transfermarkt_datasets/dagster/jobs.py
21 changes: 14 additions & 7 deletions README.md
@@ -1,16 +1,21 @@
# transfermarkt-datasets

| ![diagram](resources/diagram.svg) |
|:--:|
| *High level data model for transfermarkt-datasets* |

Checkout this dataset also in: :white_check_mark: [Kaggle](https://www.kaggle.com/davidcariboo/player-scores) | :white_check_mark: [data.world](https://data.world/dcereijo/player-scores) |
:white_check_mark: [streamlit](https://transfermarkt-datasets.herokuapp.com/)

------

In a nutshell, this project aims to do three things:

1. Acquire data from transfermarkt website using the [trasfermarkt-scraper](https://github.com/dcaribou/transfermarkt-scraper).
2. Build a **clean, public football (soccer) dataset** using data in 1.
3. Automate 1 and 2 to **keep these assets up to date** and publicly available on some well-known data catalogs.

Checkout this dataset also in: :white_check_mark: [Kaggle](https://www.kaggle.com/davidcariboo/player-scores) | :white_check_mark: [data.world](https://data.world/dcereijo/player-scores)

| ![diagram](resources/diagram.png) |
|:--:|
| *High level data model for transfermarkt-datasets* |
Continue reading this `README` to learn about the different components of this project and how you can set up your environment to run it locally.

- [setup](#setup)
- [data storage](#data-storage)
@@ -22,6 +27,8 @@ Checkout this dataset also in: :white_check_mark: [Kaggle](https://www.kaggle.co
- [infra](#infra)
- [contributing :pray:](#contributing-pray)

------

## setup
Set up your local environment to run the project with `poetry`.
1. Install [poetry](https://python-poetry.org/docs/)
@@ -38,10 +45,10 @@ poetry install
## data storage
> :information_source: Read access to the S3 [DVC remote storage](https://dvc.org/doc/command-reference/remote#description) for the project is required to successfully run `dvc pull`. Contributors should feel free to grant themselves access by adding their AWS IAM user ARN to [this whitelist](https://github.com/dcaribou/transfermarkt-datasets/blob/6b6dd6572f582b2c40039913a65ba99d10fd1f44/infra/main.tf#L16).
All project data assets are kept inside the `data` folder. This is a [DVC](https://dvc.org/) repository and all files a therefore all files can be pulled from the remote storage with the `dvc pull` command.
All project data assets are kept inside the `data` folder. This is a [DVC](https://dvc.org/) repository and therefore all files can be pulled from the remote storage with the `dvc pull` command.

* `data/raw`: contains raw data per season as acquired with [trasfermarkt-scraper](https://github.com/dcaribou/transfermarkt-scraper) (check [acquire](#acquire))
* `data/prep`: contains the prepared datasets as produced by `transfermarkt_datasets` module (check [prepare](#prepare))
* `data/prep`: contains the prepared datasets as produced by `transfermarkt_datasets` module (check [prepare](#data-preparation))

## data acquisition
In the scope of this project, "acquiring" is the process of collecting "raw data", as it is produced by [trasfermarkt-scraper](https://github.com/dcaribou/transfermarkt-scraper). Acquired data lives in the `data/raw` folder and it can be created or updated for a particular season using the `1_acquire.py` script.
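
Putting the README and Makefile changes together, a local end-to-end pass might look like the sketch below; the `--asset all --season 2022` arguments are borrowed from the `acquire_cloud` defaults and are only illustrative:

```bash
# Hypothetical local workflow assembled from the Makefile targets in this commit
make acquire_local ARGS="--asset all --season 2022"   # scrape raw data into data/raw
make prepare_local                                     # build the prepared datasets in data/prep
make streamlit_local                                   # browse them in the Streamlit app
```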
bootstrap.sh changed file mode 100644 → 100755 (no content changes)
