# Generate Validation Data

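For every bird, this notebook generates a list of locations and records, for each location, the counts (number of checklists, number of checklists in which the bird appears, and number of birds) from which one obtains the average number of birds per checklist and the fraction of checklists where the bird appears. 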

This notebook has been included for the sake of reproducibility, but it does not need to be run, because the data we provide already include its output. 
This notebook is the one place where you do need the database created from eBird data; see the [documentation](ebird_data/README.md) for how to generate it.  Generating it takes a few days, between downloading the data and creating the database.  
For convenience, we already include the result of running this notebook, so again, it does not need to be run. 

In [1]:
import os, sys
# Put the "birdmaps" directory found under the current working directory on the
# module search path. Presumably this is where the local modules imported below
# (ebird_db, bird_runs) live -- confirm. Note that this depends on the directory
# the notebook kernel was started from.
sys.path.append(os.path.join(os.getcwd(), "birdmaps"))

In [2]:
import os
import pandas as pd
import random

import ebird_db
import bird_runs

In [None]:
# Locations of the eBird observations database and of the project data directory.
# Either set the environment variables EBIRD_DB_PATH / BIRD_DATA_PATH before
# starting the kernel, or replace the fallback paths below with the ones you need.
# The fallbacks are the original hard-coded paths, so behaviour is unchanged when
# the environment variables are not set.
EBIRD_DB_PATH = os.environ.get(
    "EBIRD_DB_PATH",
    "/Users/luca/data/eBird/bird_data_uswest.db")
FILE_PATH = os.environ.get(
    "BIRD_DATA_PATH",
    "/Users/luca/Library/CloudStorage/GoogleDrive-luca@ucsc.edu/Shared drives/Birds and CS/Data/CA")

# Handle used below to query squares and per-square observations.
ebird_obs = ebird_db.EbirdObservations(EBIRD_DB_PATH)
valid = ebird_db.Validation()
# Handle used below to look up bird runs, create directories and build file names.
bird_run = bird_runs.BirdRun(FILE_PATH)

In [None]:
# Argument pairs handed to bird_run.get_bird_run, one pair per bird to process:
# an identifier string followed by the bird's common name.
bird_specs = [
    ("acowoo", "Acorn Woodpecker"),
    ("oaktit308", "Oak Titmouse"),
    ("stejay", "Steller's Jay"),
]
birds = [bird_run.get_bird_run(identifier, common_name)
         for identifier, common_name in bird_specs]

# Query parameters shared by every bird; all of them are forwarded to the
# ebird_obs queries in the processing loop.
state = "US-CA"
breeding = True
date_range = ("2012-01-01", "2018-12-31")  # (start, end) dates, inclusive bounds not verified here
max_distances = [2]  # one output file is produced per value (units not visible here)

# Upper bound on how many squares are sampled per bird and distance; also used
# in the name of the output file.
num_sample_squares = 20000

In [None]:

# For every bird and every maximum distance: sample up to num_sample_squares
# squares, query the checklist / bird counts of each sampled square, and write
# the resulting table to a CSV file in the bird's observation directory.
#
# NOTE: no random seed is set in this notebook, so each run draws a different
# sample of squares.

# Columns of the per-square table written for each bird.
obs_columns = ["Square", "NumChecklists", "NumBirdChecklists", "NumBirds"]

for bird in birds:
    print("Bird:", bird.name)
    for max_dist in max_distances:
        print("Max distance:", max_dist)
        bird_run.createdir(bird.obs_path)
        squares = ebird_obs.get_all_squares(state=state, breeding=breeding,
                                            date_range=date_range, max_dist=max_dist)
        num_squares = len(squares)
        print("Retrieved", num_squares, "squares")
        # Sample without replacement; never ask for more squares than exist.
        k = min(num_squares, num_sample_squares)
        selected_squares = random.sample(squares, k)
        # Collect the rows in a plain list and build the DataFrame once at the
        # end: DataFrame.append was removed in pandas 2.0, and growing a frame
        # row by row copies it on every iteration (quadratic time).
        records = []
        for i, sq in enumerate(selected_squares):
            num_checklists, num_bird_checklists, num_birds = ebird_obs.get_square_observations(
                sq, bird, breeding=breeding, date_range=date_range, max_dist=max_dist)
            records.append({
                "Square": sq, "NumChecklists": num_checklists,
                "NumBirdChecklists": num_bird_checklists, "NumBirds": num_birds
            })
            if (i + 1) % 1000 == 0:
                print("Processed", i + 1, "squares")
        # Passing the columns explicitly keeps the CSV header stable even when
        # no square was retrieved.
        df = pd.DataFrame(records, columns=obs_columns)
        # The file name carries the requested sample size (num_sample_squares),
        # not k, exactly as before.
        obs_fn = bird_run.get_observations_all_fn(bird.obs_path, max_distance=max_dist,
                                                  date_range="-".join(date_range),
                                                  num_squares=num_sample_squares)
        df.to_csv(obs_fn)