# In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run regenerates identical data.
# NOTE: the generated values depend on the order of the np.random calls
# below; reordering them changes the contents of the output files.
np.random.seed(42)

# Create frog_baseline.csv
# One row per frog, with IDs 'Frog_1' .. 'Frog_121'.
num_rows = 121
frog_ids = [f'Frog_{i+1}' for i in range(num_rows)]
sex = np.random.choice(['Male', 'Female'], num_rows)
age = np.random.randint(1, 10, size=num_rows)  # upper bound is exclusive: 1..9
weight = np.random.uniform(0.5, 5.0, size=num_rows)
location = np.random.choice(['Pond_A', 'Pond_B', 'Pond_C'], num_rows)

frog_baseline = pd.DataFrame({
    'Frog_ID': frog_ids,
    'Sex': sex,
    'Age': age,
    'Weight': weight,
    'Location': location
})
frog_baseline.to_csv('frog_baseline.csv', index=False)

# Create frog_baseline_update.csv
# Extra per-frog columns keyed by the same Frog_ID values as the baseline.
health_status = np.random.choice(['Healthy', 'Sick'], num_rows)
diet_type = np.random.choice(['Carnivore', 'Herbivore'], num_rows)

frog_baseline_update = pd.DataFrame({
    'Frog_ID': frog_ids,
    'Health_Status': health_status,
    'Diet_Type': diet_type,
    # date_range defaults to daily frequency: one consecutive day per row.
    'Last_Checkup': pd.date_range(start='2024-01-01', periods=num_rows),
    'Notes': [''] * num_rows
})

# Introduce some common errors
# Row 0 already holds 'Frog_1', so assigning 'Frog_1' there would change
# nothing. Overwrite row 1 (which held 'Frog_2') so that 'Frog_1' really
# appears twice in the update file.
frog_baseline_update.loc[1, 'Frog_ID'] = 'Frog_1'  # Duplicate Frog_ID
frog_baseline_update.loc[5, 'Health_Status'] = np.nan  # Missing value
# Assigning to a column that does not exist yet creates it; every row other
# than row 10 is left as NaN in the new column.
frog_baseline_update.loc[10, 'Extra_Column'] = 'Error'  # Mismatched column

frog_baseline_update.to_csv('frog_baseline_update.csv', index=False)

# Build frog_new_arrivals.csv: a second batch of frogs whose IDs continue
# from 'Frog_122', carrying both the baseline and the update columns.
num_new_frogs = 80
new_frog_ids = [f'Frog_{i+122}' for i in range(num_new_frogs)]

# Random attributes for the new batch. The order of these draws fixes the
# values produced from the global NumPy RNG, so keep it as is.
sex = np.random.choice(['Male', 'Female'], size=num_new_frogs)
age = np.random.randint(1, 10, size=num_new_frogs)
weight = np.random.uniform(0.5, 5.0, size=num_new_frogs)
location = np.random.choice(['Pond_A', 'Pond_B', 'Pond_C'], size=num_new_frogs)
health_status = np.random.choice(['Healthy', 'Sick'], size=num_new_frogs)
diet_type = np.random.choice(['Carnivore', 'Herbivore'], size=num_new_frogs)

frog_new_arrivals = pd.DataFrame(
    {
        'Frog_ID': new_frog_ids,
        'Sex': sex,
        'Age': age,
        'Weight': weight,
        'Location': location,
        'Health_Status': health_status,
        'Diet_Type': diet_type,
        'Last_Checkup': pd.date_range(start='2024-05-01', periods=num_new_frogs),
        'Notes': ['New arrival' for _ in range(num_new_frogs)],
    }
)

# Deliberately seed the file with typical data-quality problems:
#   * row 0 reuses 'Frog_1', an ID outside this batch's 'Frog_122'+ range
#   * row 3 has no weight recorded
#   * the 'Diet_Type' column is absent from the written file
frog_new_arrivals.at[0, 'Frog_ID'] = 'Frog_1'
frog_new_arrivals.at[3, 'Weight'] = np.nan
frog_new_arrivals = frog_new_arrivals.drop(columns=['Diet_Type'])

frog_new_arrivals.to_csv('frog_new_arrivals.csv', index=False)
