### 1. Imports, constants, etc.

In [3]:
import os
import pandas as pd

# Base directories (relative to the notebook's working directory).
INPUT_DIR = 'input'
OUTPUT_DIR = 'output'

# Folder that is scanned for the per-file EVVET CSV inputs.
INPUT_DIR_EVVET = os.path.join(INPUT_DIR, 'evvet')

# Merged master CSV, and the JSON file that records the archived CSVs.
OUTPUT_EVVET_FILE = os.path.join(OUTPUT_DIR, 'evvet_master.csv')
OUTPUT_EVVET_META = os.path.join(OUTPUT_DIR, 'evvet_meta.json')

### 2. Fetch, merge, clean, output.

In [10]:
import datetime
import json
import os
import pandas as pd
import pytz

def _reorder_evvet_columns(df):
    """Return ``df`` with 'Received date' first and 'AER form', 'Drug' last.

    Columns that are not present are skipped; all other columns keep their
    relative order.
    """
    cols = df.columns.tolist()
    # Order matters: 'AER form' is moved first so 'Drug' ends up after it.
    for trailing in ('AER form', 'Drug'):
        if trailing in cols:
            cols.append(cols.pop(cols.index(trailing)))
    if 'Received date' in cols:
        cols.insert(0, cols.pop(cols.index('Received date')))
    return df[cols]


def load_and_merge_evvet_data():
    """Merge every CSV in ``INPUT_DIR_EVVET`` into one dated master CSV.

    Steps:
      1. Read each ``.csv`` file in ``INPUT_DIR_EVVET`` (in sorted filename
         order), dropping fully empty rows.
      2. Concatenate, drop exact duplicate rows, reorder columns, parse
         'Received date' to datetimes, sort by it and use it as the index.
      3. Compare against the most recent archive listed in
         ``OUTPUT_EVVET_META``; if identical, print a notice and return
         without writing anything.
      4. Otherwise write ``evvet_master_<YYYYMMDD>.csv`` and
         ``OUTPUT_EVVET_FILE`` into ``OUTPUT_DIR``, print summary totals, and
         prepend a new entry to the meta file.

    Returns:
        None.

    Raises:
        ValueError: if no ``.csv`` files are found in ``INPUT_DIR_EVVET``.
        KeyError: if the merged data has no 'Received date' column (or lacks
            one of the 'Animals ...' columns used for the summary).
    """
    # Timestamp of this run, expressed in US Pacific time.
    pacific = pytz.timezone('America/Los_Angeles')
    current_datetime = datetime.datetime.now(pytz.utc).astimezone(pacific)
    last_updated = current_datetime.strftime("%B %d, %Y %H:%M:%S %Z")
    print(f"Last updated:\n{last_updated}")

    file_pattern = '.csv'

    # os.listdir() order is arbitrary; sort so that repeated runs over the
    # same inputs build the merged frame in the same row order.
    csv_files = sorted(
        f for f in os.listdir(INPUT_DIR_EVVET) if f.endswith(file_pattern)
    )
    if not csv_files:
        # pd.concat([]) would raise an opaque ValueError; fail with context.
        raise ValueError(f"No '{file_pattern}' files found in {INPUT_DIR_EVVET}")

    df_list = []
    for file in csv_files:
        df = pd.read_csv(os.path.join(INPUT_DIR_EVVET, file)).dropna(how='all')
        print(f"\nProcessing file: {file}...")
        print(f"Number of rows: {len(df)}")
        df_list.append(df)

    # Concatenate all dataframes into one and drop exact duplicate rows.
    master_df = (
        pd.concat(df_list, ignore_index=True)
        .drop_duplicates()
        .dropna(how='all')
    )

    if 'Received date' not in master_df.columns:
        raise KeyError("'Received date' column not found in the input CSVs")

    master_df = _reorder_evvet_columns(master_df)

    # Parse dates BEFORE sorting so rows are ordered chronologically rather
    # than lexicographically. errors='coerce' turns unparseable values into
    # NaT, which is what the warning below reports.
    parsed_dates = pd.to_datetime(master_df['Received date'], errors='coerce')
    if parsed_dates.isna().any():
        print("Some dates couldn't be parsed and were set to NaT")
    master_df = master_df.assign(**{'Received date': parsed_dates})

    # Stable sort keeps the (deterministic) input order among equal dates.
    master_df = master_df.sort_values(by='Received date', kind='mergesort')

    # Always index by 'Received date' so every archive has the same layout,
    # including the very first one written when the meta file lists no CSVs.
    master_df = master_df.set_index('Received date')

    # Load the meta file; a missing file or missing 'csvs' key is treated as
    # "no previous archives".
    if os.path.exists(OUTPUT_EVVET_META):
        with open(OUTPUT_EVVET_META, "r") as f:
            meta = json.load(f)
    else:
        meta = {}
    meta.setdefault('csvs', [])
    last_csv_info = meta['csvs'][0] if meta['csvs'] else None

    # Compare with the most recent archive (first entry of the meta list).
    if last_csv_info is not None:
        last_csv_path = os.path.join(OUTPUT_DIR, last_csv_info['name'])
        if os.path.exists(last_csv_path):
            last_master_df = pd.read_csv(
                last_csv_path, index_col='Received date', parse_dates=True
            )
            if master_df.equals(last_master_df):
                # Nothing has been deleted or written at this point, so the
                # existing master file is left untouched.
                print()
                print("Duplicate fetch. Aborted!")
                return

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Write the updated master dataframe to a new, dated archive CSV...
    new_csv_name = f"evvet_master_{current_datetime.strftime('%Y%m%d')}.csv"
    master_df.to_csv(os.path.join(OUTPUT_DIR, new_csv_name))
    # ...and overwrite the master file with the same content.
    master_df.to_csv(OUTPUT_EVVET_FILE)

    print("\nMaster file compiled and written to archive and source.")

    print()
    print(f"Total cases: {len(master_df)}")
    print(f"Animals affected: {master_df['Animals affected'].sum()}")
    print(f"Animals treated: {master_df['Animals treated'].sum()}")
    print(f"Animals died: {master_df['Animals died'].sum()}")

    print()

    # Update the meta file: newest archive goes first.
    new_csv_info = {
        "id": last_csv_info['id'] + 1 if last_csv_info is not None else 1,
        "name": new_csv_name,
        "timestamp": current_datetime.isoformat()
    }
    meta['csvs'].insert(0, new_csv_info)

    with open(OUTPUT_EVVET_META, "w") as f:
        json.dump(meta, f, indent=4)
    print(f'{OUTPUT_EVVET_META} updated!')

# Run the merge: reads the CSVs under INPUT_DIR_EVVET and writes the results to OUTPUT_DIR.
load_and_merge_evvet_data()

Last updated:
July 15, 2024 22:03:35 PDT

Processing file: 2021.csv...
Number of rows: 383

Processing file: 2023.csv...
Number of rows: 5126

Processing file: 2022.csv...
Number of rows: 3170

Processing file: 2024.csv...
Number of rows: 8911

Master file compiled!
Total cases: 17590
Animals affected: 18429
Animals treated: 21381.0
Animals died: 2021.0
output/evvet_meta.json updated!
