# Horse Names Analysis: Missing Horses from Herds File

This notebook analyzes which horse names appear in the merged manifest but are missing from the horse_herds file.
We'll implement thorough matching logic to catch variations, abbreviations, and numbering differences.

In [3]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import os
from difflib import SequenceMatcher

# Lift every pandas display limit (rows, columns, total width, cell width) so the
# name listings shown further down are rendered in full rather than truncated.
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

## 1. Load Data Files

In [16]:
# Define file paths.
# The data directory defaults to the project's Google Drive folder under the current
# user's home directory; set the HORSEID_DATA_ROOT environment variable to point the
# notebook at a different location without editing this cell.
data_root = os.path.expanduser(
    os.environ.get("HORSEID_DATA_ROOT", "~/google-drive/horseID Project/data")
)
manifest_file = os.path.join(data_root, "horse_photos_manifest_normalized.csv")
herds_file = os.path.join(data_root, "horse_herds.csv")

print(f"Loading manifest from: {manifest_file}")
print(f"Loading herds from: {herds_file}")

# Load the data
manifest_df = pd.read_csv(manifest_file)
herds_df = pd.read_csv(herds_file)

# Fail fast, with a clear message, if a column that the comparison cells index by
# name is absent, instead of surfacing a bare KeyError several cells later.
for source_label, frame, required_column in (
    ("manifest", manifest_df, "normalized_horse_name"),
    ("herds", herds_df, "basename"),
):
    if required_column not in frame.columns:
        raise KeyError(
            f"{source_label} file is missing required column {required_column!r}; "
            f"found columns: {list(frame.columns)}"
        )

print(f"\nManifest shape: {manifest_df.shape}")
print(f"Herds shape: {herds_df.shape}")

# Check the columns
print(f"\nManifest columns: {list(manifest_df.columns)}")
print(f"Herds columns: {list(herds_df.columns)}")

Loading manifest from: /Users/dleigh/google-drive/horseID Project/data/horse_photos_manifest_normalized.csv
Loading herds from: /Users/dleigh/google-drive/horseID Project/data/horse_herds.csv

Manifest shape: (5963, 16)
Herds shape: (206, 3)

Manifest columns: ['canonical_id', 'original_canonical_id', 'horse_name', 'email_date', 'message_id', 'original_filename', 'filename', 'date_added', 'num_horses_detected', 'last_merged_timestamp', 'status', 'size_ratio', 'normalized_horse_name', 'normalization_confidence', 'normalization_method', 'normalization_timestamp']
Herds columns: ['horse_name', 'herd', 'basename']


In [24]:
# Build the two single-column frames of distinct names that get compared.
# Rows whose name is missing are dropped first: a NaN is not a horse name, and a
# pandas merge matches null keys on both sides to each other, so keeping them would
# either fabricate a match or add a blank entry to the "missing" counts and exports.
herds_horses = herds_df[['basename']].dropna().drop_duplicates()
manifest_horses = manifest_df[['normalized_horse_name']].dropna().drop_duplicates()

In [21]:
# Distinct herd basenames in alphabetical order (original row index kept for reference)
herds_horses.sort_values('basename')

Unnamed: 0,basename
125,Absinthe
6,Ace
81,Agave
40,Al
178,Angie
201,Anthem
195,Aries
171,Arvid
176,Atlantis
124,Audi


In [25]:
# Distinct normalized manifest names in alphabetical order (original row index kept)
manifest_horses.sort_values('normalized_horse_name')

Unnamed: 0,normalized_horse_name
2735,Absinthe
500,Ace
521,Agave
1108,Al
1623,Angie
327,Anthem
371,Aries
5740,Artie
232,Arvid
139,Atlantis


## 2. Extract and Examine Horse Names

In [26]:
# Outer-join the two name lists on exact string equality. The `_merge` indicator
# column records whether each name came from the manifest only ('left_only'),
# the herds file only ('right_only'), or was found in both.
merged = manifest_horses.merge(
    herds_horses,
    how='outer',
    left_on='normalized_horse_name',
    right_on='basename',
    indicator=True,
)

Names which are not in the herds list

In [38]:
# Manifest names with no exact counterpart in the herds file: report the count,
# save the list next to the notebook, and display it.
not_in_herds_list = merged.loc[merged['_merge'] == 'left_only', ['normalized_horse_name']]
print(f"{len(not_in_herds_list)} not in herds list")
not_in_herds_list.to_csv('not_in_herds_list.txt', index=False)
not_in_herds_list

62 not in herds list


Unnamed: 0,normalized_horse_name
7,Artie
11,Azar
14,Baby and Scotty
18,Barney
19,Bear
24,Berrypatriot
30,Booger
31,Boost
33,Bourbon
46,Cedric


Names which are in the herds list but are not found in emails

In [37]:
# Herd basenames with no exact counterpart among the manifest names: report the
# count, save the list next to the notebook, and display it.
not_in_emails = merged.loc[merged['_merge'] == 'right_only', ['basename']]
print(f"{len(not_in_emails)} not in emails")
not_in_emails.to_csv('not_in_emails.txt', index=False)
not_in_emails

35 not in emails


Unnamed: 0,basename
13,BB
21,Belle
26,Big Prince
28,Blizzard
29,Blue
35,Bu
36,Buster
45,Casper
49,Charlie
54,Corna
