In [78]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import time

In [None]:
# input_path = "../data/ortho_programs_full.csv"
# output_path = "../data/ortho_programs_cleaned.csv"

# with open(input_path, "r") as infile, open(output_path, "w") as outfile:
#     for i, line in enumerate(infile):
#         # Keep the header as-is
#         if i == 0:
#             outfile.write(line.strip() + "\n")
#             continue

#         # Split only on the first comma
#         parts = line.strip().split(",", 1)

#         if len(parts) == 2:
#             name = parts[0].strip()
#             address = parts[1].strip()
#             # Add quotes around the name if not already quoted
#             if not name.startswith('"'):
#                 name = f'"{name}"'
#             outfile.write(f"{name},{address}\n")
#         else:
#             print(f"Skipping malformed line {i + 1}: {line}")

In [None]:
# ========== STEP 1: Load your data ==========
# Each CSV has two columns: Name, Full Address.
law_df = pd.read_csv(filepath_or_buffer="../data/law_schools.csv")
# Alternative ortho input, currently disabled:
#ortho_df_short = pd.read_csv("../data/ortho_programs.csv")
ortho_df = pd.read_csv(filepath_or_buffer="../data/ortho_programs_cleaned.csv")

In [89]:
#law_df
# Show the loaded ortho programs table; swap in the commented name to inspect law_df instead.
ortho_df

Unnamed: 0,Name,Full Address
0,1. Hospital for Special Surgery/Cornell Medica...,535 East 70th Street New York NY 10021
1,2. Mayo Clinic College of Medicine and Science...,"200 First Street, Rochester MN 55905"
2,3. NYU Grossman School of Medicine/NYU Langone...,"301 East 17th Street, New York, NY 10003"
3,4. Washington University,"4455 Duncan Avenue, St. Louis, MO 63110"
4,5. Mass General Brigham/Massachusetts General ...,"55 Fruit Street, Boston, MA 02114"
...,...,...
202,203. Carilion Clinic Virginia Tech Carilion Sc...,"1 Riverside Circle, Roanoke, VA 24016"
203,204. Garden City Hospital Orthopaedic Surgery ...,"6245 Inkster Rd, Garden City, MI 48135"
204,205. Larkin Community Hospital Orthopaedic Sur...,"7000 SW 62nd Ave, South Miami, FL 33143"
205,206. St Joseph's Medical Center Orthopaedic Su...,"1800 N California St, Stockton, CA 95204"


In [98]:
# ========== STEP 2: Geocode each address ==========
# Nominatim needs an identifying user agent. geopy's default request timeout is
# 1 second, which slow responses routinely exceed; the resulting timeout errors
# would be retried and then silently turned into missing coordinates, so allow
# the service more time to answer.
geolocator = Nominatim(user_agent="law_ortho_matcher", timeout=10)

def geocode_address(address, geocoder=None, max_attempts=3, pause_seconds=1.0):
    """Look up one address and return its coordinates as a two-element Series.

    Parameters
    ----------
    address : str
        Free-form address text to geocode.
    geocoder : object, optional
        Anything with a ``geocode(address)`` method that returns ``None`` or an
        object with ``latitude`` / ``longitude`` attributes. Defaults to the
        notebook-level ``geolocator``.
    max_attempts : int, optional
        How many times to call the service when a call raises.
    pause_seconds : float, optional
        Seconds to wait after every request, successful or not.

    Returns
    -------
    pandas.Series
        ``[latitude, longitude]``, or ``[None, None]`` when the address is
        missing/blank, is not found, or every attempt raised.
    """
    not_found = pd.Series([None, None])

    # Missing (None / NaN) or blank cells cannot be geocoded; skip the request.
    is_missing = address is None or (isinstance(address, float) and pd.isna(address))
    is_blank = isinstance(address, str) and not address.strip()
    if is_missing or is_blank:
        return not_found

    if geocoder is None:
        geocoder = geolocator

    for _ in range(max_attempts):
        try:
            location = geocoder.geocode(address)
        except Exception:
            # Best effort: a failed lookup (rate limit, timeout, network error)
            # is retried after a pause instead of aborting the whole batch.
            time.sleep(pause_seconds)
            continue

        # Pace successful calls too, so a batch run stays within the public
        # Nominatim limit of one request per second.
        time.sleep(pause_seconds)

        # "Not found" is a definitive answer, so asking again would only spend
        # extra requests on the same result.
        if location is None:
            return not_found
        return pd.Series([location.latitude, location.longitude])

    return not_found

print("Geocoding law schools...")
# One lookup per row; each lookup's two-element result fills the Lat and Lon columns.
law_coords = law_df["Full Address"].apply(geocode_address)
law_df[["Lat", "Lon"]] = law_coords
print("Done!")

Geocoding law schools...
Done!


In [90]:

print("Geocoding ortho programs...")
# One lookup per row; each lookup's two-element result fills the Lat and Lon columns.
ortho_coords = ortho_df["Full Address"].apply(geocode_address)
ortho_df[["Lat", "Lon"]] = ortho_coords
print("Done!")

Geocoding ortho programs...
Done!


In [99]:
# Persist both geocoded tables (row index included) so the lookups need not be repeated.
for frame_to_save, csv_target in (
    (ortho_df, "../data/geo/ortho_lat_long.csv"),
    (law_df, "../data/geo/law_lat_long.csv"),
):
    frame_to_save.to_csv(csv_target)

## How Important the Rank of Law School is vs Ortho Rank 
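Each weight below scales one term of the match score. Because the best match is the one with the lowest score (this is a minimization problem), a higher weight penalizes that factor (law school rank, ortho rank, or distance) more heavily.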

In [59]:
# Weights applied to law rank, ortho rank and distance (miles) in the match score.
law_rank_importance, ortho_rank_importance, dist_rank_importance = 2, 1, 2

In [62]:
# ========== STEP 3: Match and score all pairwise combos ==========
# Every geocoded law school is paired with every geocoded ortho program.
# Score = weighted law rank + weighted ortho rank + weighted distance in miles.

def _leading_rank(name):
    """Return the integer that precedes the first "." in `name` (e.g. "8. NYU ..." -> 8)."""
    return int(name.split(".")[0])


def _geocoded_entries(frame):
    """List (name, rank, (lat, lon)) for each row of `frame` that has both coordinates."""
    # Rows whose lookup failed carry no coordinates and cannot be measured.
    located = frame.dropna(subset=["Lat", "Lon"])
    return [
        (name, _leading_rank(name), (lat, lon))
        for name, lat, lon in zip(located["Name"], located["Lat"], located["Lon"])
    ]


# Parse names and filter missing coordinates once per table instead of once per pair.
law_entries = _geocoded_entries(law_df)
ortho_entries = _geocoded_entries(ortho_df)

matches = []
for law_name, law_rank, law_point in law_entries:
    # This term is the same for every ortho program paired with this school.
    law_term = law_rank_importance * law_rank
    for ortho_name, ortho_rank, ortho_point in ortho_entries:
        distance = geodesic(law_point, ortho_point).miles

        # Example scoring weights (adjust as needed)
        score = law_term + ortho_rank_importance * ortho_rank + dist_rank_importance * distance

        matches.append({
            "Law School": law_name,
            "Ortho Program": ortho_name,
            "Distance (miles)": round(distance, 2),
            "Score": round(score, 2)
        })


KeyError: 'Lat'

In [None]:
# ========== STEP 4: Sort and get top matches ==========
# Order by ascending Score, then rebuild the index so row 0 is the first-ranked pair.
results_df = pd.DataFrame(matches)
results_df = results_df.sort_values(by="Score")
results_df = results_df.reset_index(drop=True)

# Write the full ranking to disk, then print its leading rows.
results_df.to_csv(path_or_buf="../results/top_law_ortho_matches.csv", index=False)
print("Top 30 matches:")
print(results_df.head(n=30))



Top 30 matches:
                                        Law School  \
0             8. New York University School of Law   
1                 6. Duke University School of Law   
2             8. New York University School of Law   
3                            7. Harvard Law School   
4   5. University of Pennsylvania Carey Law School   
5              3. University of Chicago Law School   
6                          10. Columbia Law School   
7   5. University of Pennsylvania Carey Law School   
8                          10. Columbia Law School   
9          11. Northwestern Pritzker School of Law   
10            16. Vanderbilt University Law School   
11         4. University of Virginia School of Law   
12         17. Washington University School of Law   
13                         10. Columbia Law School   
14            8. New York University School of Law   
15             22. Boston University School of Law   
16                           19. UNC School of Law   
17          

In [37]:
# How many distinct programs / schools to keep from the ranked matches.
ortho_signals = 30
law_signals = 25


def _first_unique(values, limit):
    """Return the first `limit` distinct items of `values`, in order of first appearance."""
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(values))[:max(limit, 0)]


# results_df is ordered by ascending Score, so the first distinct names seen are
# the ones that appear in the lowest-scoring pairs.
top_ortho_programs = _first_unique(results_df["Ortho Program"], ortho_signals)
top_law_schools = _first_unique(results_df["Law School"], law_signals)

# Now top_ortho_programs and top_law_schools are your filtered lists.
# The headings use the configured counts so they stay correct when the counts change.
print(f"Top {ortho_signals} unique ortho programs:")
for program_name in top_ortho_programs:
    print(program_name)

print(f"\nTop {law_signals} unique law schools:")
for school_name in top_law_schools:
    print(school_name)

Top 30 unique ortho programs:
3. NYU Grossman School of Medicine/NYU Langone Orthopedic Hospital
7. Duke Orthopaedic Surgery Residency
1. Hospital for Special Surgery/Cornell Medical Center
5. Mass General Brigham/Massachusetts General Hospital/Brigham and Women's Hospital/Harvard Medical School (Boston)
11. Sidney Kimmel Medical College at Thomas Jefferson University/TJUH
6. Rush Medical Center Orthopaedic Surgery Residency
18. University of Pennsylvania Health System
8. Vanderbilt Orthopaedic Surgery Residency
31. University of Virginia Orthopaedic Surgery Residency
4. Washington University
20. New York Presbyterian Hospital (Columbia Campus)
40. UCLA David Geffen School of Medicine/UCLA Medical Center Orthopaedic Surgery Residency
60. Yale-New Haven Medical Center Orthopaedic Surgery Residency
59. University of Chicago Orthopaedic Surgery Residency
19. University of Southern California/Los Angeles General Medical Center (USC/LA General)
14. University of California (San Francisco)
2

In [40]:
# Wrap the selected program names in a single-column frame and write it out without the index.
top_ortho_df = pd.DataFrame(data=top_ortho_programs)
top_ortho_df.to_csv(path_or_buf="../results/ortho_signals.csv", index=False)