In [1]:
# default_exp data.acquisition

# Data Acquisition

> This is a script which invokes `pybaseball`'s [`statcast()`](https://github.com/jldbc/pybaseball#statcast-pull-advanced-metrics-from-major-league-baseballs-statcast-system) function to retrieve pitch-level data from Statcast and save it to either a SQLite database or a CSV file.


In [2]:
#hide
from nbdev.showdoc import *

In [3]:
# exporti
import sqlite3
from datetime import date, timedelta

import pandas as pd
from pybaseball import statcast
from fastscript import *

In [5]:
# export

@call_parse
def query_statcast(start_dt: Param(help="Beginning date to pull data from", type=str)=None,
                   end_dt: Param(help="End date to pull data from", type=str)=None,
                   team: Param(help="Abbreviation for team of interest", type=str)=None,
                   verbose: Param(help="Whether or not to print verbose updates", type=bool_arg)=True,
                   output_type: Param(help="What format to save data in", type=str)="db",
                   overwrite: Param(help="Whether or not to overwrite the db table if it already exists",
                                   type=bool_arg)=True,
                   output_path: Param(help="path to location that data should be saved", type=str)="."):
    """
    Pull pitch-level data with `pybaseball.statcast` and save it under `output_path`.

    Note, working with sqlite files, as opposed to csv files is the recommended usage.

    The output is keyed on the year taken from the first four characters of
    `start_dt` (falling back to `end_dt`, then to yesterday's date when no
    dates are given):

    * `output_type="db"`  -> table `statcast_<year>` in `<output_path>/statcast_pitches.db`
    * `output_type="csv"` -> file `<output_path>/statcast_<year>.csv`

    With `overwrite=True` an existing table of the same name is replaced; with
    `overwrite=False` pandas raises `ValueError` if the table already exists.
    `overwrite` has no effect on csv output.

    Returns `None`. Raises `ValueError` when `output_type` is not 'db' or 'csv',
    or when the date used for naming does not start with a four-digit year.
    """
    # Validate the cheap arguments first so that a typo does not cost a full
    # (slow, networked) statcast query before failing.
    if output_type not in ('db', 'csv'):
        raise ValueError("output_type must be one of {'db', 'csv'}")

    # Work out the year used to name the table / file *before* querying.
    label_dt = start_dt or end_dt
    if label_dt is None:
        # pybaseball documents that calling statcast() without dates returns
        # yesterday's data, so name the output after yesterday's year instead
        # of crashing on `None[:4]` once the download has finished.
        label_dt = (date.today() - timedelta(days=1)).isoformat()
    year = label_dt[:4]
    # The year ends up inside a table name and a file name, so insist on digits.
    if len(year) != 4 or any(ch not in "0123456789" for ch in year):
        raise ValueError(f"dates must be formatted as YYYY-MM-DD, got {label_dt!r}")

    # pulling data from statcast
    data = statcast(start_dt, end_dt, team, verbose)

    if output_type == "db":
        conn = sqlite3.connect(f"{output_path}/statcast_pitches.db")
        try:
            # `if_exists="replace"` drops and recreates the table, which is what
            # the overwrite flag asks for, without hand-building a DROP statement.
            # NOTE: the DataFrame index is written as an extra column here
            # (pandas' default), unlike the csv branch; kept so that existing
            # databases keep the same schema.
            data.to_sql(f"statcast_{year}", conn,
                        if_exists="replace" if overwrite else "fail")
        finally:
            # Release the database file even when the write fails.
            conn.close()
    else:
        # output type must be csv
        data.to_csv(f"{output_path}/statcast_{year}.csv", index=False)

## Usage

### From the command-line

```shell
$ query_statcast --start_dt 2019-05-07 --end_dt 2019-06-09 --output_type db --output_path /tmp
This is a large query, it may take a moment to complete
Completed sub-query from 2019-05-07 to 2019-05-12
Completed sub-query from 2019-05-13 to 2019-05-18
Completed sub-query from 2019-05-19 to 2019-05-24
Completed sub-query from 2019-05-25 to 2019-05-30
Completed sub-query from 2019-05-31 to 2019-06-05
Completed sub-query from 2019-06-06 to 2019-06-09
```

In [7]:
# Confirm that the command-line run above created the SQLite file in /tmp.
! ls /tmp/ | grep statcast_pitches

statcast_pitches.db


### Using Python

In [12]:
# Same entry point as the CLI, called directly; writes /tmp/statcast_2019.csv.
query_statcast(
    start_dt="2019-06-07",
    end_dt="2019-06-09",
    output_type="csv",
    output_path="/tmp",
)

In [13]:
# List every statcast output in /tmp: the new csv file plus the earlier SQLite database.
! ls /tmp/ | grep statcast

statcast_2019.csv
statcast_pitches.db
