In [None]:
import pandas as pd

In [None]:
# Base name of the observations export; the same stem is reused further down
# to name the cleaned output file.
fn = 'observations-nv-09-07-25'
df = pd.read_csv(f"data/{fn}.csv")

# Keep only rows whose scientific name starts with one of these prefixes.
# NOTE(review): this is a plain prefix match, so 'Salmo' also matches any longer
# name that begins with it (e.g. 'Salmonidae') — confirm that is intended.
genus_to_keep = ['Oncorhynchus', 'Salmo', 'Salvelinus']
# na=False makes rows with a missing scientific_name count as "no match".
# Without it the mask contains NaN and boolean indexing raises ValueError.
df = df[df['scientific_name'].str.startswith(tuple(genus_to_keep), na=False)]

# Remove Oncorhynchus tshawytscha, Oncorhynchus kisutch and Oncorhynchus nerka.
species_to_remove = ['Oncorhynchus tshawytscha', 'Oncorhynchus kisutch', 'Oncorhynchus nerka']
df = df[~df['scientific_name'].isin(species_to_remove)]

# Columns removed from the frame before the cleaned CSV is written.
# Entries are laid out in rows of related names for readability; the order is
# the same as the original one-per-line list.
columns_to_drop = [
    # record id and observation time
    'uuid', 'observed_on_string', 'time_observed_at', 'time_zone',
    # user fields
    'user_id', 'user_login', 'user_name',
    # record timestamps
    'created_at', 'updated_at',
    # record metadata
    'quality_grade', 'license', 'sound_url', 'tag_list',
    # identification counts
    'num_identification_agreements', 'num_identification_disagreements',
    'captive_cultivated', 'oauth_application_id',
    # place and private coordinates
    'place_guess', 'private_place_guess', 'private_latitude', 'private_longitude',
    # positioning details
    'positioning_method', 'positioning_device',
    'public_positional_accuracy', 'positional_accuracy',
    'taxon_geoprivacy',
]


# Generic label -> every common name that collapses into that label.
# Labels with a single entry of the same name are simply kept distinct.
_variants_by_generic = {
    'Rainbow Trout': [
        'Rainbow Trout',
        'Coastal Rainbow Trout',
        'Kern River Rainbow Trout',
        'Eagle Lake Rainbow Trout',
    ],
    'Redband Trout': [
        'Sacramento Redband Trout',
        'McCloud River Redband Trout',
        'Columbia River Redband Trout',
    ],
    'Golden Trout': [
        'Golden Trout',
        'California Golden Trout',
        'Little Kern Golden Trout',
    ],
    'Cutthroat Trout': [
        'Coastal Cutthroat Trout',
        'Lahontan Cutthroat Trout',
        'Paiute Cutthroat Trout',
        'Rocky Mountain Cutthroat Trout',
        'Rio Grande Cutthroat Trout',
        'Colorado River Cutthroat Trout',
        'Snake River Fine-spotted Cutthroat Trout',
        'Greenback Cutthroat Trout',
        'Bonneville Cutthroat Trout',
        'Humboldt Cutthroat Trout',
        'Yellowstone Cutthroat Trout',
        'Westslope Cutthroat Trout',
    ],
    # Kept distinct
    'Brook Trout': ['Brook Trout'],
    'Brown Trout': ['Brown Trout'],
    'Lake Trout': ['Lake Trout'],
    'Bull Trout': ['Bull Trout'],
    # Hybrids
    'Golden × Rainbow Hybrid': ['Golden × Rainbow Trout'],
    'Tiger Trout': ['Tiger Trout'],
    'Cutbow Hybrid': ['Lahontan Cutbow', 'Rocky Mountain Cutbow'],
    # Broad salmonid categories
    'Unspecified': [
        'Pacific Salmons and Trouts',
        'Salmons and Trouts',
        'Salmons, Trouts, and Whitefishes',
        'Atlantic Salmons and Trouts',
    ],
    'Salmon': ['Chinook Salmon', 'Coho Salmon', 'Sockeye Salmon'],
}

# Invert the table into the lookup applied to the common_name column:
# common name -> generic label.
name_mapping = {
    variant: generic
    for generic, variants in _variants_by_generic.items()
    for variant in variants
}

# Translate each common name to its generic label; names absent from the
# mapping come back as NaN.
mapped_names = df['common_name'].map(name_mapping)

# Report any common names the mapping does not cover.
unmapped = df.loc[mapped_names.isna(), 'common_name'].unique()
if len(unmapped):
    print(f"Warning: Unmapped common names found: {unmapped}")

# Rows without a mapping fall back to their original common name.
df['generic_name'] = mapped_names.fillna(df['common_name'])

# Show how many rows landed under each generic name.
generic_counts = df['generic_name'].value_counts()
print(generic_counts)

# Remove the unwanted columns, then write the cleaned table next to the input.
df = df.drop(columns=columns_to_drop)
df.to_csv(f'data/{fn}-clean.csv', index=False)