In [15]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import time

In [20]:
input_path = "../data/ortho_programs_full.csv"
output_path = "../data/ortho_programs_cleaned.csv"

with open(input_path, "r") as infile, open(output_path, "w") as outfile:
    for i, line in enumerate(infile):
        # Keep the header as-is
        if i == 0:
            outfile.write(line.strip() + "\n")
            continue

        # Split only on the first comma
        parts = line.strip().split(",", 1)

        if len(parts) == 2:
            name = parts[0].strip()
            address = parts[1].strip()
            # Add quotes around the name if not already quoted
            if not name.startswith('"'):
                name = f'"{name}"'
            outfile.write(f"{name},{address}\n")
        else:
            print(f"Skipping malformed line {i + 1}: {line}")

In [26]:
# ========== STEP 1: Load your data ==========
law_df = pd.read_csv("../data/law_schools.csv")  # columns: Name, Full Address
#ortho_df_short = pd.read_csv("../data/ortho_programs.csv")  # columns: Name, Full Address
ortho_df= pd.read_csv("../data/ortho_programs_cleaned.csv")  # columns: Name, Full Address

In [27]:
#law_df
ortho_df

Unnamed: 0,Name,Address
0,1. Hospital for Special Surgery/Cornell Medica...,535 East 70th Street New York NY 10021
1,2. Mayo Clinic College of Medicine and Science...,200 First Street SW Rochester MN 55905
2,3. NYU Grossman School of Medicine/NYU Langone...,"301 East 17th Street, New York, NY 10003"
3,4. Washington University,"4455 Duncan Avenue, St. Louis, MO 63110"
4,5. Mass General Brigham/Massachusetts General ...,"55 Fruit Street, Boston, MA 02114"
...,...,...
202,203. Carilion Clinic Virginia Tech Carilion Sc...,"1 Riverside Circle, Roanoke, VA 24016"
203,204. Garden City Hospital Orthopaedic Surgery ...,"6245 Inkster Rd, Garden City, MI 48135"
204,205. Larkin Community Hospital Orthopaedic Sur...,"7000 SW 62nd Ave, Suite 600, South Miami, FL 3..."
205,206. St Joseph's Medical Center Orthopaedic Su...,"1800 N California St, Stockton, CA 95204"


In [29]:
# ========== STEP 2: Geocode each address ==========
geolocator = Nominatim(user_agent="law_ortho_matcher")

def geocode_address(address):
    for _ in range(3):  # retry in case of rate limit
        try:
            location = geolocator.geocode(address)
            if location:
                return pd.Series([location.latitude, location.longitude])
        except Exception:
            time.sleep(1)
    return pd.Series([None, None])

print("Geocoding law schools...")
law_df[["Lat", "Lon"]] = law_df["Full Address"].apply(geocode_address)

print("Geocoding ortho programs...")
ortho_df[["Lat", "Lon"]] = ortho_df["Full Address"].apply(geocode_address)

Geocoding law schools...
Geocoding ortho programs...


KeyError: 'Full Address'

## How Important the Rank of Law School is vs Ortho Rank 
higher number penalizes distance more as this is a minimization problem

In [5]:
law_rank_importance = 2
ortho_rank_importance= 1
dist_rank_importance= 2 

In [6]:
# ========== STEP 3: Match and score all pairwise combos ==========
matches = []
for _, law in law_df.iterrows():
    for _, ortho in ortho_df.iterrows():
        if pd.notnull(law["Lat"]) and pd.notnull(ortho["Lat"]):
            distance = geodesic((law["Lat"], law["Lon"]), (ortho["Lat"], ortho["Lon"])).miles
            law_rank = int(law["Name"].split(".")[0])
            ortho_rank = int(ortho["Name"].split(".")[0])
            
            # Example scoring weights (adjust as needed)
            score = law_rank_importance * law_rank + ortho_rank_importance * ortho_rank + dist_rank_importance * distance

            matches.append({
                "Law School": law["Name"],
                "Ortho Program": ortho["Name"],
                "Distance (miles)": round(distance, 2),
                "Score": round(score, 2)
            })


In [7]:
# ========== STEP 4: Sort and get top matches ==========
results_df = pd.DataFrame(matches).sort_values("Score").reset_index(drop=True)

# Save or display
results_df.to_csv("../results/top_law_ortho_matches.csv", index=False)
print("Top 30 matches:")
print(results_df.head(30))

Top 30 matches:
                                        Law School  \
0             8. New York University School of Law   
1                 6. Duke University School of Law   
2   5. University of Pennsylvania Carey Law School   
3             8. New York University School of Law   
4                           1. Stanford Law School   
5                            7. Harvard Law School   
6              3. University of Chicago Law School   
7   5. University of Pennsylvania Carey Law School   
8                          10. Columbia Law School   
9                          10. Columbia Law School   
10         11. Northwestern Pritzker School of Law   
11            9. University of Michigan Law School   
12         4. University of Virginia School of Law   
13            16. Vanderbilt University Law School   
14                         10. Columbia Law School   
15             3. University of Chicago Law School   
16         11. Northwestern Pritzker School of Law   
17          

In [10]:
ortho_signals= 30
law_signals = 25

# Get top 30 unique orthopedic surgery programs
top_ortho_programs = []
seen_ortho = set()
for _, row in results_df.iterrows():
    ortho = row['Ortho Program']
    if ortho not in seen_ortho:
        top_ortho_programs.append(ortho)
        seen_ortho.add(ortho)
    if len(top_ortho_programs) == ortho_signals:
        break

# Get top 25 unique law schools
top_law_schools = []
seen_law = set()
for _, row in results_df.iterrows():
    law = row['Law School']
    if law not in seen_law:
        top_law_schools.append(law)
        seen_law.add(law)
    if len(top_law_schools) == law_signals:
        break

# Now top_ortho_programs and top_law_schools are your filtered lists
print("Top 30 unique ortho programs:")
for o in top_ortho_programs:
    print(o)

print("\nTop 25 unique law schools:")
for l in top_law_schools:
    print(l)

Top 30 unique ortho programs:
3. NYU Grossman / NYU Langone Orthopedic Hospital
7. Duke Orthopaedic Surgery Residency
9. Thomas Jefferson Orthopaedic Surgery Residency (Sidney Kimmel MC)
1. Hospital for Special Surgery (HSS)
21. Stanford Orthopaedic Surgery Residency
5. Mass General Brigham (Harvard Combined Orthopaedic Residency Program)
6. Rush Medical Center Orthopaedic Surgery Residency
16. University of Pennsylvania Orthopaedic Surgery Residency
19. Univeristy of Michigan Medical Center
29. UVA Health Orthopaedic Residency
8. Vanderbilt Orthopaedic Surgery Residency
18. NYP Columbia Campus (Milstein Hospital) Orthopaedic Surgery
25. McGaw Medical Center at Northwestern Orthopaedic Residency
57. Yale New Haven Hospital Orthopaedic Surgery Residency
38. UCLA Medical Center Orthopaedic Residency
17. USC / LA General Medical Center Orthopaedic Surgery Residency
46. Boston University Medical Center Orthopaedic Residency
12. UCSF Orthopaedic Surgery Residency Institute
44. Cedars-Sinai 