In [None]:
# default_exp data.acquisition

# Data Acquisition

> This is a script which invokes `pybaseball`'s [`statcast()`](https://github.com/jldbc/pybaseball#statcast-pull-advanced-metrics-from-major-league-baseballs-statcast-system) function to retrieve pitch-level data from statcast.


In [None]:
#hide

# documentation
from nbdev.showdoc import *

# testing
import pytest
import psycopg2

In [None]:
# exporti
import os
import urllib.parse
from datetime import date, timedelta
from os import path

import pandas as pd
from dotenv import find_dotenv, load_dotenv
from fastscript import *
from pybaseball import statcast
from sqlalchemy import create_engine

In [None]:
# export

def load_postgres_env():
    """
    Reads the Postgres connection settings from the environment.

    The `.env` file located by `find_dotenv()` is loaded first with
    `override=True`, so its values replace variables already set in the process.

    * outputs:
        - `(dbname, user, password, host, port)`: values of `POSTGRES_DB`,
          `POSTGRES_USER`, `POSTGRES_PASSWORD`, `POSTGRES_HOST` and `POSTGRES_PORT`;
          each entry is `None` when the variable is not set
    """
    load_dotenv(find_dotenv(), override=True)

    # order matters: callers unpack the tuple positionally
    variable_names = (
        "POSTGRES_DB",
        "POSTGRES_USER",
        "POSTGRES_PASSWORD",
        "POSTGRES_HOST",
        "POSTGRES_PORT",
    )
    return tuple(os.getenv(name) for name in variable_names)

def load_postgres_engine():
    """
    Builds a SQLAlchemy engine for the Postgres database described by the
    `POSTGRES_*` environment variables (read through `load_postgres_env`).

    * outputs:
        - `engine`: the object returned by `create_engine` for the URL
          `postgresql://user:password@host[:port]/dbname`
    """
    dbname, user, password, host, port = load_postgres_env()

    # percent-encode the credentials so characters such as `@`, `:` or `/` in a
    # password cannot be mistaken for URL delimiters; unset values are left as-is
    if user is not None:
        user = urllib.parse.quote(user, safe="")
    if password is not None:
        password = urllib.parse.quote(password, safe="")

    # include the port only when POSTGRES_PORT is set, so the driver default
    # still applies otherwise
    location = f"{host}:{port}" if port else host

    engine = create_engine(f"postgresql://{user}:{password}@{location}/{dbname}")

    return engine

In [None]:
# export


@call_parse
def query_statcast(
    start_dt: Param(help="Beginning date to pull data from", type=str) = None,
    end_dt: Param(help="End date to pull data from", type=str) = None,
    team: Param(help="Abbreviation for team of interest", type=str) = None,
    verbose: Param(
        help="Whether or not to print verbose updates", type=bool_arg
    ) = True,
    if_exists: Param(
        help="How to behave if the table already exists.",
        type=str,
    ) = 'fail'
):
    """
    Callable from the command-line or in Python. Pulls pitch-level MLB data from [statcast](https://baseballsavant.mlb.com/statcast_search).
    Saves to the Postgres table `statcast_{year}`, where `year` is the first four characters of `start_dt`.
    A date range spanning several years is therefore written entirely to the table of its starting year.
    
    * inputs:
        - `start_dt`: `str`, Beginning date to pull data from (`YYYY-MM-DD`) = None
        - `end_dt`: `str`, End date to pull data from (`YYYY-MM-DD`) = None
        - `team`: `str`, abbreviation for team of interest = None
        - `verbose`: `bool`, Whether or not to print verbose updates
        - `if_exists`: `str`, How to behave if the table already exists.
            * `'fail'`: Raise a ValueError.
            * `'replace'`: Drop the table before inserting new values.
            * `'append'`: Insert new values to the existing table.
        
    * outputs:
        - None

    * raises:
        - `ValueError`: if `if_exists` is not one of the three values above
    """
    # reject a bad `if_exists` before the (potentially long) download rather than after it
    if if_exists not in ("fail", "replace", "append"):
        raise ValueError(f"'{if_exists}' is not valid for if_exists")

    # the table name needs a year even when `start_dt` is omitted.
    # NOTE(review): pybaseball documents that a single date is used for both ends
    # and that no dates means yesterday; the fallback below mirrors that - confirm
    # against the installed pybaseball version.
    reference_dt = start_dt or end_dt or str(date.today() - timedelta(days=1))
    table_name = f"statcast_{reference_dt[:4]}"

    # creating remote db connection
    engine = load_postgres_engine()

    try:
        # pulling data from statcast
        data = statcast(start_dt=start_dt, end_dt=end_dt, team=team, verbose=verbose)
        with engine.connect() as connection:
            data.to_sql(table_name, connection, if_exists=if_exists)
    finally:
        # the engine is created per call, so release its pooled connections here
        engine.dispose()

    return None


In [None]:
# query_statcast tests


# getting red sox data from July 7th, 2019
start_dt = end_dt = "2019-07-07"
query_statcast(
    start_dt=start_dt,
    end_dt=end_dt,
    team="BOS",
    if_exists="replace",
)

# ensuring David Price threw 99 pitches
engine = load_postgres_engine()
with engine.connect() as connection:
    result = connection.execute("""select count(1)
                                   from statcast_2019
                                   where player_name = 'David Price'""")
    # fetch while the connection is still open; reading a result after its
    # connection has been closed is not guaranteed to work
    n_pitches = result.first()[0]
assert n_pitches == 99, f"expected 99 pitches from David Price, found {n_pitches}"

In [None]:
# export


def query_db(
    year: str = "2019",
    columns: str = "*",
    limit: int = None,
    verbose: bool = True,
):
    """
    Queries the Postgres table `statcast_{year}`. Assumes that it's been created by `query_statcast`.
    Only queries for a single year at a time.

    `year` and `columns` are interpolated directly into the SQL text, so they must
    come from trusted code and never from raw user input.
    
    * inputs:
        - `year`: `str`, year of data to query
        - `columns`: `str`, which columns from the [statcast data](https://baseballsavant.mlb.com/csv-docs) to include in table
        - `limit`: `int`, the maximum number of rows to retrieve ([postgresql documentation](https://www.postgresql.org/docs/8.1/queries-limit.html)); `None` means no limit
        - `verbose`: `bool`, Whether or not to print verbose updates
    
    * output:
        - `df`: `pd.DataFrame`, DataFrame populated with data queried from database;
          empty if the query fails for any reason (e.g. the year's table does not exist)
    """
    if verbose:
        print(f"querying year {year} from db now.")
    
    engine = load_postgres_engine()
    
    query = f"""select {columns}
                from statcast_{year}"""
    # compare against None: `limit=0` is a legitimate request for zero rows
    if limit is not None:
        query += f" limit {round(limit)}"
    
    # if the query fails (typically: year is not in db), return empty pd.DataFrame
    try:
        with engine.connect() as connection:
            df = pd.read_sql_query(query, connection)
    except Exception as err:
        # `Exception` rather than a bare except, so KeyboardInterrupt/SystemExit propagate
        print(f"Year {year} not in database. Returning empty dataframe.")
        if verbose:
            # surface the real cause: a connection failure would otherwise look like a missing year
            print(f"Underlying error: {type(err).__name__}: {err}")
        df = pd.DataFrame()
    finally:
        # the engine is created per call, so release its pooled connections here
        engine.dispose()
        
    return df


In [None]:
# BOS @ DET on 7/7/19
df = query_db()
assert df["away_team"].unique().item() == "BOS"

# checking consistent rows and columns (extra column because index is included)
assert df.shape == (339, 91), f"unexpected shape {df.shape}"

# year not present in db gives empty DataFrame
df = query_db(year="2012")
assert df.empty


# clean up: removing the table written by the `query_statcast` test.
# A fresh engine keeps this cell from depending on a variable left over from an
# earlier cell, and `if exists` lets the cell be re-run after the table is gone.
engine = load_postgres_engine()
with engine.connect() as connection:
    connection.execute("""drop table if exists statcast_2019""")

querying year 2019 from db now.
querying year 2012 from db now.
Year 2012 not in database. Returning empty dataframe.


## Usage

### From the command-line

```shell
$ query_statcast --start_dt 2019-05-07 --end_dt 2019-06-09
This is a large query, it may take a moment to complete
Completed sub-query from 2019-05-07 to 2019-05-12
Completed sub-query from 2019-05-13 to 2019-05-18
Completed sub-query from 2019-05-19 to 2019-05-24
Completed sub-query from 2019-05-25 to 2019-05-30
Completed sub-query from 2019-05-31 to 2019-06-05
Completed sub-query from 2019-06-06 to 2019-06-09
$ # the rows are now in the `statcast_2019` table of the Postgres database configured through the `POSTGRES_*` environment variables
```

### Using Python

```python
>>> query_statcast(
        start_dt="2019-06-07", end_dt="2019-06-09", if_exists="append"
    )
```

```python
>>> df = query_db(year="2019")
querying year 2019 from db now.
```