In [45]:
import pandas as pd

def _load_and_preview(csv_path, label, trailer="\n"):
    """Read one CSV, print its shape and first three rows, and return the frame.

    `label` is printed in front of the shape; `trailer` is the text written
    after the preview (pass two newlines to leave a blank separator line).
    """
    frame = pd.read_csv(csv_path)
    print(label, frame.shape)
    print(frame.head(3), end=trailer)
    return frame


# Baseline records
baseline_df = _load_and_preview('frog_baseline.csv', "Baseline data shape:", trailer="\n\n")

# Additional per-frog columns for the baseline records
baseline_update_df = _load_and_preview(
    'frog_baseline_update.csv', "Baseline update data shape:", trailer="\n\n"
)

# Newly arrived frogs (last preview, so no extra blank line afterwards)
new_arrivals_df = _load_and_preview('frog_new_arrivals.csv', "New arrivals data shape:")


Baseline data shape: (79, 6)
   frog_id       species     sex  weight  age  size
0     3001     Dart Frog    Male   340.8    5  3.52
1     3002     Tree Frog  Female   432.4    7  5.00
2     3003  Leopard Frog  Female   434.0   10  5.49

Baseline update data shape: (79, 6)
   frog_id habitat   health  max_hop  thermal_limit  call_freq
0     3001    Pond  Healthy     1.66          31.15     455.50
1     3002    Pond  Healthy     2.55          27.86     369.87
2     3003   Swamp  Healthy     2.86          27.59     413.24

New arrivals data shape: (23, 12)
   frog_id       species   sex habitat   health  weight  size  age  max_hop  \
0     3078  Leopard Frog  Male  Forest  Healthy   811.1  6.44    5     2.89   
1     3079  Leopard Frog  Male    Pond     Sick   584.5  7.90    6     2.53   
2     3080     Dart Frog  Male  Forest  Healthy   413.4  5.80    7     2.31   

   thermal_limit  call_freq arrival_date  
0          27.45     479.92   2023-01-01  
1          29.99     396.86   2023-0

In [46]:
# Outer-join the two baseline tables on frog_id so that ids present in only
# one of them are kept (their missing columns become NaN).
merged_df = baseline_df.merge(baseline_update_df, how='outer', on='frog_id')
print(merged_df.shape)
print()
print(merged_df.head())

(81, 11)

   frog_id       species     sex  weight   age  size habitat   health  \
0        0           NaN     NaN     NaN   NaN   NaN     NaN      NaN   
1     3001     Dart Frog    Male   340.8   5.0  3.52    Pond  Healthy   
2     3002     Tree Frog  Female   432.4   7.0  5.00    Pond  Healthy   
3     3003  Leopard Frog  Female   434.0  10.0  5.49   Swamp  Healthy   
4     3004     Tree Frog  Female   124.9   7.0  2.19    Pond  Healthy   

   max_hop  thermal_limit  call_freq  
0     0.00           0.00       0.00  
1     1.66          31.15     455.50  
2     2.55          27.86     369.87  
3     2.86          27.59     413.24  
4     1.48          31.67     779.69  


In [47]:
# Stack the new arrivals underneath the merged baseline rows and renumber
# the index from 0; columns missing on one side are filled with NaN.
final_df = pd.concat(
    [merged_df, new_arrivals_df],
    ignore_index=True,
)
print(final_df.shape)
print()
print(final_df.head())

(104, 12)

   frog_id       species     sex  weight   age  size habitat   health  \
0        0           NaN     NaN     NaN   NaN   NaN     NaN      NaN   
1     3001     Dart Frog    Male   340.8   5.0  3.52    Pond  Healthy   
2     3002     Tree Frog  Female   432.4   7.0  5.00    Pond  Healthy   
3     3003  Leopard Frog  Female   434.0  10.0  5.49   Swamp  Healthy   
4     3004     Tree Frog  Female   124.9   7.0  2.19    Pond  Healthy   

   max_hop  thermal_limit  call_freq arrival_date  
0     0.00           0.00       0.00          NaN  
1     1.66          31.15     455.50          NaN  
2     2.55          27.86     369.87          NaN  
3     2.86          27.59     413.24          NaN  
4     1.48          31.67     779.69          NaN  


In [48]:
# Show every row that has an exact duplicate (keep=False marks all copies,
# not just the later ones), then keep only the first occurrence of each.
print(final_df[final_df.duplicated(keep=False)], end="\n\n")
# Reassign rather than mutate with inplace=True so the step reads as a plain
# transformation; the original row labels are kept (no index reset).
final_df = final_df.drop_duplicates(keep='first')
print(final_df.shape)

    frog_id   species     sex  weight  age  size habitat   health  max_hop  \
77     3077  Bullfrog  Female   897.9  5.0  7.34   Swamp  Healthy     3.14   
78     3077  Bullfrog  Female   897.9  5.0  7.34   Swamp  Healthy     3.14   

    thermal_limit  call_freq arrival_date  
77          28.31      356.9          NaN  
78          28.31      356.9          NaN  

(103, 12)


In [52]:
# List the rows that are missing a measurement.
# arrival_date is left out of the check: it comes only from the new-arrivals
# file, so after the concat it is NaN for every baseline row and would flag
# all of them. errors='ignore' keeps this cell working if the column has
# already been removed.
core_df = final_df.drop(columns=['arrival_date'], errors='ignore')
print(core_df[core_df.isna().any(axis=1)])

    frog_id species   sex  weight  age  size habitat   health  max_hop  \
0         0     NaN   NaN     NaN  NaN   NaN     NaN      NaN      0.0   
79     3200     NaN  Male     NaN  5.0  10.0     NaN      NaN      NaN   
80     9999     NaN   NaN     NaN  NaN   NaN    Pond  Healthy      1.5   

    thermal_limit  call_freq  
0             0.0        0.0  
79            NaN        NaN  
80           40.0       25.0  


In [53]:
# Drop rows with NaN values.
# arrival_date is removed first: it is NaN for every baseline row, so leaving
# it in would make dropna() discard the whole baseline. errors='ignore' makes
# the cell safe to re-run once the column is already gone.
final_df = (
    final_df
    .drop(columns=['arrival_date'], errors='ignore')
    .dropna()
)
print(final_df.shape)

(100, 11)


In [55]:
# Sanity check after cleaning: dimensions first, then the first three rows.
print(final_df.shape, final_df.head(3), sep="\n")

(100, 11)
   frog_id       species     sex  weight   age  size habitat   health  \
1     3001     Dart Frog    Male   340.8   5.0  3.52    Pond  Healthy   
2     3002     Tree Frog  Female   432.4   7.0  5.00    Pond  Healthy   
3     3003  Leopard Frog  Female   434.0  10.0  5.49   Swamp  Healthy   

   max_hop  thermal_limit  call_freq  
1     1.66          31.15     455.50  
2     2.55          27.86     369.87  
3     2.86          27.59     413.24  


In [57]:
# Put the columns of final_df into the presentation order below.
desired_order = [
    'frog_id', 'species', 'sex', 'habitat', 'health',
    'weight', 'size', 'age',
    'max_hop', 'thermal_limit', 'call_freq',
]

# Selecting by an explicit label list returns the columns in that order
# and raises KeyError if any listed column is absent.
final_df = final_df.loc[:, desired_order]

In [60]:
# Final summary: dimensions followed by the first three rows.
final_shape = final_df.shape
print("Final data shape:", final_shape)
print(final_df.iloc[:3])

Final data shape: (100, 11)
   frog_id       species     sex habitat   health  weight  size   age  \
1     3001     Dart Frog    Male    Pond  Healthy   340.8  3.52   5.0   
2     3002     Tree Frog  Female    Pond  Healthy   432.4  5.00   7.0   
3     3003  Leopard Frog  Female   Swamp  Healthy   434.0  5.49  10.0   

   max_hop  thermal_limit  call_freq  
1     1.66          31.15     455.50  
2     2.55          27.86     369.87  
3     2.86          27.59     413.24  
