<img src="https://www.lubavitch.com/wp-content/uploads/2019/05/text-1.svg" alt="Lubavitch.com logo" width="400">
<div style="height: 30px; background: white;"></div>

- website - https://www.lubavitch.com/
- Example - https://www.lubavitch.com/centers/

<br>
<div style="height: 20px; background: linear-gradient(to right, #20a4de, #1b3b6f);"></div>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import requests
import os
import re
import time
import json
from bs4 import BeautifulSoup
from IPython.display import display, clear_output
from urllib.parse import urlparse, urljoin
from tqdm import tqdm
from IPython.display import clear_output

# Widen pandas' display limits so scraped tables are readable in the notebook
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 200)

# global variables
# os.get_terminal_size() raises OSError when stdout is not attached to a
# terminal (the usual situation inside a Jupyter kernel), so fall back to a
# conventional 80-column width instead of failing the whole setup cell.
try:
    terminal_size = os.get_terminal_size().columns
except OSError:
    terminal_size = 80
processing_list = range(100)  # for the tqdm process bar #for item in tqdm(processing_list, desc="Processing"):

<br>
<div style="height: 20px; background: linear-gradient(to right, #20a4de, #1b3b6f);"></div>

## Crawler for cities with Chabad centers

In [None]:
# Seed pages for the crawl: one "centers" index page per continent/region
starting_urls = [
    "https://www.lubavitch.com/centers/" + region_path
    for region_path in (
        "north-america",  # the only seed listed without a trailing slash
        "europe/",
        "asia/",
        "middle-east/",
        "africa/",
        "south-america/",
        "australia/",
    )
]

# Bookkeeping shared with crawl(): URLs already recorded, and how many so far
crawled_urls = set()
url_counter = 0

# Single-column frame that accumulates every URL the crawler records
url_df = pd.DataFrame(columns=['URL'])

def _fetch_page_links(page_url, timeout):
    """Download ``page_url`` and return the absolute URL of every ``<a href>`` on it.

    Returns an empty list when the request fails or the response status is
    not 200, so the caller can treat an unreachable page as a dead end.
    """
    try:
        response = requests.get(page_url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while processing {page_url}: {e}")
        return []
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    links = []
    for link in soup.find_all('a', href=True):
        try:
            links.append(urljoin(page_url, link['href']))
        except ValueError:
            # urljoin can reject a malformed href; skip that one link rather
            # than abandoning every remaining link on the page.
            continue
    return links


def crawl(url, timeout=30):
    """Depth-first crawl from ``url``, recording every in-scope link it discovers.

    A link is recorded when it contains one of ``starting_urls`` as a
    substring, has not been recorded before, and does not end with
    ``/the-rebbe``. Each recorded link is added to ``crawled_urls``, appended
    to ``url_df``, counted in the global ``url_counter`` and then crawled in
    turn. Every 10th recorded link triggers ``write_to_csv()`` as a checkpoint.

    The traversal uses an explicit stack instead of recursion: a deep chain of
    pages can otherwise exceed Python's recursion limit. Links are visited in
    the same order a recursive walk would visit them.

    Parameters
    ----------
    url : str
        Page to start crawling from.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up on that page
        (without it a stalled connection blocks the crawl indefinitely).
    """
    global url_counter

    # One iterator of outgoing links per page currently being walked; the top
    # of the stack is the page whose links are being consumed right now.
    pending = [iter(_fetch_page_links(url, timeout))]
    while pending:
        for absolute_url in pending[-1]:
            in_scope = any(starting_url in absolute_url for starting_url in starting_urls)
            if not in_scope or absolute_url in crawled_urls or absolute_url.endswith('/the-rebbe'):
                continue

            crawled_urls.add(absolute_url)
            url_counter += 1
            clear_output(wait=True)
            url_df.loc[url_counter] = [absolute_url]  # Add URL to DataFrame
            print(f"Processing link {url_counter}: {absolute_url}")
            if url_counter % 10 == 0:
                try:
                    write_to_csv()
                except OSError as e:
                    # A failed checkpoint should not stop the crawl; the next
                    # checkpoint (or the final write) rewrites the whole file.
                    print(f"Could not write checkpoint CSV: {e}")

            # Descend into the new page first; the current page's iterator
            # stays on the stack and resumes once this subtree is exhausted.
            pending.append(iter(_fetch_page_links(absolute_url, timeout)))
            break
        else:
            # Every link on the current page has been handled.
            pending.pop()

def write_to_csv():
    """Write the module-level ``url_df`` to the crawl CSV (without the index) and announce it."""
    csv_filename = 'lubavitch_centers_crawled_urls.csv'
    # url_df is only read here, so no ``global`` declaration is required
    url_df.to_csv(path_or_buf=csv_filename, index=False)
    print(f"Crawled URLs have been written to {csv_filename}.")

# Run one crawl per seed page
for starting_url in starting_urls:
    crawl(starting_url)

# crawl() only checkpoints on multiples of 10, so flush whatever is left over
if url_counter % 10:
    write_to_csv()

# Report every URL collected during the crawl
print("Crawled URLs:")
for url in crawled_urls:
    print(url)


## Scrape the URLs

In [32]:

# Load the DF with URLs from the csv file
df_centers_urls = pd.read_csv('lubavitch_centers_crawled_urls.csv')

# Remove trailing slashes so ".../city" and ".../city/" count as the same page
df_centers_urls['URL'] = df_centers_urls['URL'].str.rstrip('/')

# Remove duplicates based on the 'URL' column
df_centers_urls_clean = df_centers_urls.drop_duplicates(subset=['URL'])

count = 0
count_max = len(df_centers_urls_clean)

OUTPUT_CSV = 'dataset_chabad_centers_cities.csv'
REQUEST_TIMEOUT = 30    # seconds; without it one stalled request hangs the run
CHECKPOINT_EVERY = 10   # write the partial dataset after this many URLs

# Output column -> key read from each entry of the page's 'center_list'.
# The dict order is the column order of the resulting CSV.
CENTER_FIELDS = {
    'URL_chabad_center_name': 'Institution',
    'Address': 'Address1',
    'Region': 'Region',
    'State': 'State',
    'Country': 'Country',
    'City': 'City',
    'PostCode': 'PostCode',
    'Website': 'WebAddress',
    'Phone': 'Phone',
    'Lat': 'lat',
    'Long': 'lng',
}
CENTER_COLUMNS = ['URL_chabad_center', *CENTER_FIELDS]

# Regular expression pattern to extract JavaScript object from <script> tag
pattern = re.compile(r"let locations = (\{.*?\});", re.DOTALL)


def save_centers(records):
    """Build the centers DataFrame from ``records``, write it to OUTPUT_CSV and return it."""
    centers = pd.DataFrame(records, columns=CENTER_COLUMNS)
    centers.to_csv(OUTPUT_CSV, index=False)
    return centers


center_records = []  # one dict per center found
failed_urls = []     # (url, reason) pairs, reported once the loop is done

# Iterate through each URL in the DataFrame
for count, url in enumerate(df_centers_urls_clean['URL'], start=1):
    clear_output(wait=True)
    print(f'{count}/{count_max} | {url}')

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.exceptions.RequestException as e:
        print("Error fetching data from", url, ":", e)
        failed_urls.append((url, f"request failed: {e}"))
    else:
        # Find the JavaScript object within the HTML content
        match = pattern.search(response.text)
        if match is None:
            print("No JavaScript object found in", url)
            failed_urls.append((url, "no JavaScript object found"))
        else:
            try:
                locations = json.loads(match.group(1))
            except json.JSONDecodeError as e:
                # The captured text is JavaScript and may not be strict JSON;
                # skip this page instead of aborting the whole run.
                print("Could not parse locations object in", url, ":", e)
                failed_urls.append((url, f"invalid JSON: {e}"))
            else:
                for center in locations.get('center_list') or []:
                    record = {'URL_chabad_center': url}
                    for column, key in CENTER_FIELDS.items():
                        record[column] = center.get(key)
                    center_records.append(record)

    # Checkpoint once per URL (not once per center) so an interrupted run
    # keeps what has been scraped so far.
    if count % CHECKPOINT_EVERY == 0:
        save_centers(center_records)

# Create a df from the extracted data and save the final DF to CSV
df = save_centers(center_records)

# clear_output() erases per-URL messages as the loop advances, so list the
# failures again here where they stay visible.
if failed_urls:
    print(f"{len(failed_urls)} URL(s) could not be scraped:")
    for failed_url, reason in failed_urls:
        print(f"  {failed_url} -> {reason}")

display(df)


1850/1850 | https://www.lubavitch.com/centers/australia/australia/zetland


Unnamed: 0,URL_chabad_center,URL_chabad_center_name,Address,Region,State,Country,City,PostCode,Website,Phone,Lat,Long
0,https://www.lubavitch.com/centers/north-america/caribbean,Chabad Aruba,Salina Cerca 31C,Caribbean,,Aruba,Noord,,JewishAruba.com,297-592-7613,12.5813,-70.0393
1,https://www.lubavitch.com/centers/north-america/caribbean,Chabad Cayman Jewish Community Center,215B West Bay Road,Caribbean,,Cayman Islands,Grand Cayman,KY1-1010,ChabadCaymanIslands.com,345-516-4474,19.3124,-81.385
2,https://www.lubavitch.com/centers/north-america/caribbean,Chabad Grenada,Lighthouse Drive,Caribbean,,Grenada,St. George,,ChabadGrenada.com,473-410-1315,12.0019,-61.7662
3,https://www.lubavitch.com/centers/north-america/caribbean,Chabad House,35 Springfarm Drive,Caribbean,,Jamaica,Montego Bay,,jewishjamaica.com,876-452-3223,18.5182,-77.8518
4,https://www.lubavitch.com/centers/north-america/caribbean,Chabad Jewish Center of Puerto Rico,18 Calle Rosa,Caribbean,PR,USA,Carolina,00979,JewishPuertoRico.com,787-253-0894,18.444,-66.0158
...,...,...,...,...,...,...,...,...,...,...,...,...
3046,https://www.lubavitch.com/centers/australia/australia/sunshine-coast,Chabad of Sunshine Coast,7 Whitecap Court,Australia,QLD,Australia,Sunshine Coast,4575,ChabadSunshineCoast.net,61-497-301-321,-26.7415,153.133
3047,https://www.lubavitch.com/centers/australia/australia/surfers-paradise,Chabad Gold Coast,48 The Corso,Australia,QLD,Australia,Surfers Paradise,4217,,61-4-1939-2818,-28.0113,153.421
3048,https://www.lubavitch.com/centers/australia/australia/sydney,CBDChabad Sydney,16 Gosbell Street,Australia,NSW,Australia,Sydney,2000,www.cbdchabad.org.au,61-4-2515-2404,-33.8795,151.226
3049,https://www.lubavitch.com/centers/australia/australia/sydney,Chabad Sydney 4 Israeli Tourist,427 Old South Head Road,Australia,NSW,Australia,Sydney,2029,www.chabadsydney.info,61-4-2439-9777,-33.8788,151.271
