# Observatories geolocation

In [3]:
import pandas as pd

## Scrape list of astronomical observatories from Wikipedia

In [None]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Download the Wikipedia list of astronomical observatories, flatten its main
# table into one row per observatory (name, article link, year, location) and
# save the result as a CSV for the later cells.
url = "https://en.wikipedia.org/wiki/List_of_astronomical_observatories"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

# A timeout keeps the cell from hanging indefinitely if the server stalls.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# Find the correct table by matching the headers
table = None
for tbl in soup.find_all('table', {'class': 'wikitable'}):
    # Use a dedicated name so the request `headers` dict is not overwritten.
    column_titles = [th.get_text(strip=True) for th in tbl.find_all('th')]
    if column_titles[:3] == ['Name', 'Established', 'Location']:
        table = tbl
        break

# Fail with a clear message instead of an AttributeError on None below.
if table is None:
    raise RuntimeError(
        "No 'wikitable' with columns Name / Established / Location found at " + url
    )

# Parse rows (the first <tr> is the header row)
rows = []
for tr in table.find_all('tr')[1:]:
    tds = tr.find_all('td')
    if len(tds) < 3:
        continue
    name_td, established_td, location_td = tds[:3]
    a_tag = name_td.find('a', href=True)
    rows.append({
        'Name': name_td.get_text(strip=True),
        # urljoin resolves site-relative hrefs and leaves absolute ones intact.
        'Link': urljoin(url, a_tag['href']) if a_tag else None,
        'Established': established_td.get_text(strip=True),
        'Location': location_td.get_text(strip=True),
    })

# Explicit columns keep the schema stable even if no row was parsed.
observatories_df = pd.DataFrame(rows, columns=['Name', 'Link', 'Established', 'Location'])

observatories_df.to_csv("../data/astronomical_observatories_with_links.csv", index=False)


## Geolocation

### Add coordinates using geopy Nominatim and OpenStreetMap

In [None]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import logging

# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)

# Settings for the Nominatim client.
# NOTE(review): a personal contact address is hardcoded in the user agent;
# consider loading it from configuration before sharing the notebook.
NOMINATIM_USER_AGENT = "your-app-name/1.0 (chiara.torri@yahoo.com)"  # required by policy
NOMINATIM_TIMEOUT_SECONDS = 5
MIN_SECONDS_BETWEEN_REQUESTS = 1

geolocator = Nominatim(
    user_agent=NOMINATIM_USER_AGENT,
    timeout=NOMINATIM_TIMEOUT_SECONDS,
)

# Respect rate limits: wrap the lookup so calls are spaced out in time.
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=MIN_SECONDS_BETWEEN_REQUESTS,
)

def get_location(city_name: str, geocoder=None):
    """Resolve a free-text location to a ``(latitude, longitude)`` tuple.

    Queries are tried from most to least specific, stopping at the first hit:

    1. the full string as given,
    2. the first two comma-separated parts,
    3. the first comma-separated part alone.

    A query identical to one already tried is skipped, so a location with one
    or two parts does not spend extra (rate-limited) requests on repeats.

    Args:
        city_name: Location text, e.g. ``"Tucson, Arizona, US"``. Anything that
            is not a non-blank string (``None``, NaN, ``""``) is treated as
            "not found" without calling the geocoder.
        geocoder: Optional callable taking a query string and returning an
            object with ``latitude`` / ``longitude`` attributes, or a falsy
            value when nothing matches. Defaults to the module-level
            ``geocode`` wrapper.

    Returns:
        ``(latitude, longitude)`` of the first successful query, or
        ``(None, None)`` when every attempt fails.
    """
    if geocoder is None:
        geocoder = geocode

    if not isinstance(city_name, str) or not city_name.strip():
        logging.warning("⚠️ no coordinates found for: %r", city_name)
        return (None, None)

    parts = [p.strip() for p in city_name.split(",") if p.strip()]

    # (label used in log messages, query string), most specific first.
    candidates = [("full", city_name)]
    if len(parts) >= 2:
        candidates.append(("first two parts", ", ".join(parts[:2])))
    if parts:
        candidates.append(("first part", parts[0]))

    tried = set()
    for label, query in candidates:
        if query in tried:
            continue
        tried.add(query)
        try:
            location = geocoder(query)
        except Exception as e:
            # Best effort: log and fall through to the next, broader query.
            logging.warning("Error geocoding %r (%s): %s", city_name, label, e)
            continue
        if location:
            return (location.latitude, location.longitude)

    logging.warning("⚠️ no coordinates found for: %r", city_name)
    return (None, None)



In [None]:
# One row per distinct Location value, so each one is passed to get_location once.
unique_locations = observatories_df['Location'].unique().tolist()
observatories_location = pd.DataFrame({'Location': unique_locations})
observatories_location['geolocation'] = observatories_location['Location'].map(get_location)



In [None]:
# Label geocoded locations as 'earth' and leave the rest missing.
# get_location returns the tuple (None, None) on failure; the string
# '(None, None)' only appears once the frame has been through a CSV round-trip.
# Comparing against the string alone never matched and labelled every row
# 'earth', so both spellings are accepted here.
_NOT_FOUND = ((None, None), '(None, None)')
observatories_location['type'] = [
    pd.NA if coords in _NOT_FOUND else 'earth'
    for coords in observatories_location['geolocation']
]
observatories_location.to_csv("../data/observatories_location_geocoded.csv", index=False)

### Add missing locations

In [4]:
# Reload the two intermediate CSVs and attach the geocoding columns to every
# observatory through the shared 'Location' column (left join keeps all rows).
observatories_df = pd.read_csv("../data/astronomical_observatories_with_links.csv")
observatories_location = pd.read_csv("../data/observatories_location_geocoded.csv")
observatories_df = pd.merge(
    observatories_df,
    observatories_location,
    how='left',
    on='Location',
)

In [None]:
# Mark as 'space' every observatory that could not be geocoded and whose
# Location text mentions 'orbit' or 'Sun' (case-sensitive substring match).
# na=False makes a missing Location count as "no match" instead of raising,
# which the per-row `'orbit' in value` test did for NaN.
not_geocoded = observatories_df['geolocation'].eq("(None, None)")
mentions_space = observatories_df['Location'].str.contains('orbit|Sun', na=False)
observatories_df['type'] = (
    observatories_df['type']
    .astype(object)  # keep string labels assignable even if the column is all-NaN
    .mask(not_geocoded & mentions_space, 'space')
)

In [None]:
# Write the rows that have neither coordinates nor a type to a spreadsheet.
# NOTE(review): this file is read back further down; presumably it is
# completed by hand in between — confirm.
needs_review = observatories_df['geolocation'].eq("(None, None)") & observatories_df['type'].isna()
observatories_df[needs_review].to_excel('../data/observatories_no_location_type.xlsx', index=False)

In [None]:
# Drop the rows that have neither coordinates nor a type, then add an empty
# 'note' column to what remains.
filter_missing = (
    (observatories_df.geolocation == "(None, None)")
    & (observatories_df.type.isna())
)
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of the original (avoids SettingWithCopyWarning).
observatories_df = observatories_df[~filter_missing].copy()
observatories_df['note'] = pd.NA

In [None]:
# Read the spreadsheet of unresolved observatories back in, stack it under the
# resolved ones and save the combined table.
observatories_no_location_type = pd.read_excel('../data/observatories_no_location_type.xlsx')

frames_to_stack = [observatories_df, observatories_no_location_type]
observatories_df_final = pd.concat(frames_to_stack, ignore_index=True)
observatories_df_final.to_csv("../data/astronomical_observatories_final.csv", index=False)

## Merge with exoplanet data

In [30]:
exoplanets = pd.read_csv("../data/exoplanets_data.csv")

# One row per planet: order by name, then by releasedate descending, and keep
# the first row of each name.
exoplanets_sorted = exoplanets.sort_values(
    by=['pl_name', 'releasedate'],
    ascending=[True, False],
)
unique_exo = exoplanets_sorted.drop_duplicates(subset='pl_name', keep='first')

In [31]:
# Number of non-null planet names per discovery facility, as a two-column frame.
planets_per_facility = unique_exo.groupby('disc_facility')['pl_name'].count()
count_by_observatory = planets_per_facility.reset_index()

In [32]:
count_by_observatory

Unnamed: 0,disc_facility,pl_name
0,Acton Sky Portal Observatory,2
1,Anglo-Australian Telescope,35
2,Apache Point Observatory,2
3,Arecibo Observatory,3
4,Atacama Large Millimeter Array (ALMA),1
...,...,...
67,Winer Observatory,1
68,XO,6
69,Xinglong Station,2
70,Yunnan Astronomical Observatory,3


In [68]:
observatories_df_final=pd.read_csv("../data/astronomical_observatories_final.csv")

In [70]:
observatories_df_final.fillna('Unknown', inplace=True)

  observatories_df_final.fillna('Unknown', inplace=True)


In [35]:
# Left join: every discovery facility is kept, with observatory details
# attached where 'disc_facility' equals an observatory 'Name'.
count_by_observatory_with_location = count_by_observatory.merge(
    observatories_df_final,
    how='left',
    left_on='disc_facility',
    right_on='Name',
)

In [36]:
# Facilities whose geolocation is missing after the merge.
has_no_geolocation = count_by_observatory_with_location['geolocation'].isna()
missing_location = count_by_observatory_with_location[has_no_geolocation]

In [None]:
# missing_location.to_excel('../data/observatories_missing_location_in_exoplanet_data.xlsx', index=False)

In [14]:
len(missing_location)

46

In [15]:
len(count_by_observatory_with_location)

72

In [16]:
len(count_by_observatory_with_location[~count_by_observatory_with_location.geolocation.isna()])

26

In [37]:
# Facilities that do have a geolocation after the merge.
has_geolocation = count_by_observatory_with_location['geolocation'].notna()
count_by_observatory_with_location_full = count_by_observatory_with_location[has_geolocation]

In [55]:
missing_location=pd.read_excel('../data/observatories_missing_location_in_exoplanet_data.xlsx')

In [56]:
# Stack the located facilities and the reloaded missing_location rows into one
# table with a fresh 0..n-1 index.
observatories_exo_geolocated = pd.concat(
    [count_by_observatory_with_location_full, missing_location],
    ignore_index=True,
)

In [57]:
observatories_exo_geolocated.to_csv("../data/observatories_exoplanets_geolocated.csv", index=False)

In [72]:
df=pd.read_csv('../data/observatories_exoplanets_geolocated.csv')

In [73]:
df['geolocation']

0       (32.7892103, -105.81857)
1      (18.4047499, -66.6810353)
2       (36.8414197, -2.4628135)
3     (-24.5563029, -69.4226479)
4      (31.6890556, -110.884703)
                 ...            
71           (-32.3783, 20.8105)
72                           NaN
73          (19.8256, -155.4761)
74     (40.4659616, 117.7921122)
75        (33.35731, -116.85981)
Name: geolocation, Length: 76, dtype: object

In [74]:
df.fillna('Unknown', inplace=True)

In [76]:
df

Unnamed: 0,disc_facility,pl_name,Name,Link,Established,Location,geolocation,type,note
0,Apache Point Observatory,2,Apache Point Observatory,https://en.wikipedia.org/wiki/Apache_Point_Obs...,1985,"Sunspot, New Mexico, US","(32.7892103, -105.81857)",earth,Unknown
1,Arecibo Observatory,3,Arecibo Observatory,https://en.wikipedia.org/wiki/Arecibo_Observatory,1963,"Arecibo, Puerto Rico, US","(18.4047499, -66.6810353)",earth,Unknown
2,Calar Alto Observatory,29,Calar Alto Observatory,https://en.wikipedia.org/wiki/Calar_Alto_Obser...,1975,"Almería, Spain","(36.8414197, -2.4628135)",earth,Unknown
3,Cerro Tololo Inter-American Observatory,5,Cerro Tololo Inter-American Observatory,https://en.wikipedia.org/wiki/Cerro_Tololo_Int...,1962,"Atacama Desert, Chile","(-24.5563029, -69.4226479)",earth,Unknown
4,Fred Lawrence Whipple Observatory,6,Fred Lawrence Whipple Observatory,https://en.wikipedia.org/wiki/Fred_Lawrence_Wh...,1968,"Mount Hopkins, Arizona, US","(31.6890556, -110.884703)",earth,Unknown
6,Haleakala Observatory,2,Haleakala Observatory,https://en.wikipedia.org/wiki/Haleakala_Observ...,Unknown,"Maui, Hawaii, US","(20.8029568, -156.3106833)",earth,Unknown
7,Haute-Provence Observatory,69,Haute-Provence Observatory,https://en.wikipedia.org/wiki/Haute-Provence_O...,1937,"Alpes-de-Haute-Provence, France","(44.1640832, 6.1878515)",earth,Unknown
9,Kitt Peak National Observatory,1,Kitt Peak National Observatory,https://en.wikipedia.org/wiki/Kitt_Peak_Nation...,1958,"Tucson, Arizona, US","(32.2228765, -110.974847)",earth,Unknown
10,La Silla Observatory,303,La Silla Observatory,https://en.wikipedia.org/wiki/La_Silla_Observa...,1969,"Atacama Desert, Chile","(-24.5563029, -69.4226479)",earth,Unknown
11,Large Binocular Telescope Observatory,3,Large Binocular Telescope Observatory,https://en.wikipedia.org/wiki/Large_Binocular_...,2005,"Mount Graham, Arizona, US","(32.7016419, -109.8713876)",earth,Unknown


In [75]:
# Keep only the rows whose type is 'earth'.
is_earth_based = df['type'].eq('earth')
df = df[is_earth_based]

In [63]:
# Split the "(lat, lon)" text of `geolocation` into two string columns in a
# single pass. Values that do not look like a pair (e.g. 'Unknown') yield NaN
# in both columns instead of raising IndexError.
coord_parts = df['geolocation'].astype(str).str.extract(
    r'^\(?\s*([^,()]+?)\s*,\s*([^,()]+?)\s*\)?$'
)
# assign() returns a new frame, so nothing is written into a filtered slice
# (avoids SettingWithCopyWarning). The values stay strings, as before.
df = df.assign(Latitude=coord_parts[0], Longitude=coord_parts[1])

In [67]:
df.drop('Name', axis=1).rename(columns={'disc_facility':'Name', 
                                                                                                                    'pl_name':'N discovered planets',
                                                                                                                    'Established':'Year of Establishment'})


Unnamed: 0,Name,N discovered planets,Link,Year of Establishment,Location,geolocation,type,note,Latitude,Longitude
0,Apache Point Observatory,2,https://en.wikipedia.org/wiki/Apache_Point_Obs...,1985,"Sunspot, New Mexico, US","(32.7892103, -105.81857)",earth,,32.7892103,-105.81857
1,Arecibo Observatory,3,https://en.wikipedia.org/wiki/Arecibo_Observatory,1963,"Arecibo, Puerto Rico, US","(18.4047499, -66.6810353)",earth,,18.4047499,-66.6810353
2,Calar Alto Observatory,29,https://en.wikipedia.org/wiki/Calar_Alto_Obser...,1975,"Almería, Spain","(36.8414197, -2.4628135)",earth,,36.8414197,-2.4628135
3,Cerro Tololo Inter-American Observatory,5,https://en.wikipedia.org/wiki/Cerro_Tololo_Int...,1962,"Atacama Desert, Chile","(-24.5563029, -69.4226479)",earth,,-24.5563029,-69.4226479
4,Fred Lawrence Whipple Observatory,6,https://en.wikipedia.org/wiki/Fred_Lawrence_Wh...,1968,"Mount Hopkins, Arizona, US","(31.6890556, -110.884703)",earth,,31.6890556,-110.884703
6,Haleakala Observatory,2,https://en.wikipedia.org/wiki/Haleakala_Observ...,,"Maui, Hawaii, US","(20.8029568, -156.3106833)",earth,,20.8029568,-156.3106833
7,Haute-Provence Observatory,69,https://en.wikipedia.org/wiki/Haute-Provence_O...,1937,"Alpes-de-Haute-Provence, France","(44.1640832, 6.1878515)",earth,,44.1640832,6.1878515
9,Kitt Peak National Observatory,1,https://en.wikipedia.org/wiki/Kitt_Peak_Nation...,1958,"Tucson, Arizona, US","(32.2228765, -110.974847)",earth,,32.2228765,-110.974847
10,La Silla Observatory,303,https://en.wikipedia.org/wiki/La_Silla_Observa...,1969,"Atacama Desert, Chile","(-24.5563029, -69.4226479)",earth,,-24.5563029,-69.4226479
11,Large Binocular Telescope Observatory,3,https://en.wikipedia.org/wiki/Large_Binocular_...,2005,"Mount Graham, Arizona, US","(32.7016419, -109.8713876)",earth,,32.7016419,-109.8713876


In [4]:
import pandas as pd
# Load the geolocated facility table, expose the facility name as 'Name', and
# replace missing values with 'Unknown' (except 'note', which becomes empty).
all_observatories = (
    pd.read_csv('../data/observatories_exoplanets_geolocated.csv')
    .drop('Name', axis=1)
    .rename(columns={
        'disc_facility': 'Name',
        'pl_name': 'N of discovered planets',
        'Established': 'Year of Establishment',
    })
    .fillna('Unknown')
)
all_observatories['note'] = all_observatories['note'].replace('Unknown', '')

# Space observatories, most productive first, with a rocket prefix on the name.
obs_space = all_observatories[all_observatories['type'] == 'space'].copy()
# Strip only a trailing fractional part ('2009.0' -> '2009'). astype(str) makes
# this safe for non-string values, and text such as 'c. 1000' is left intact
# rather than being cut at its first dot.
obs_space['Year of Establishment'] = (
    obs_space['Year of Establishment']
    .astype(str)
    .str.replace(r'\.\d+$', '', regex=True)
)
obs_space = obs_space.sort_values(by='N of discovered planets', ascending=False)
obs_space['Name'] = '🚀 ' + obs_space['Name']


In [6]:
len(obs_space)

10