# Vipassana Course Scraper

## 1. Import Dependencies

In [37]:
from splinter import Browser
from bs4 import BeautifulSoup
from pymongo import MongoClient
from datetime import datetime

### 1.1 Set Up Chrome Browser for Splinter

In [2]:
# Launch a visible (non-headless) Chrome window driven by the chromedriver
# binary named below; the path is relative, so it is resolved by the driver
# lookup rather than pinned to an absolute location.
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

### 1.2 Set Up MongoDB

In [3]:
# Connect to the local MongoDB server and select the working database.
# NOTE(review): the database name is spelled 'vispassana' (not 'vipassana');
# it is kept exactly as-is because any other spelling selects a different database.
client = MongoClient('mongodb://localhost:27017/')
db = client['vispassana']

## 2. Navigate to Site and Expand Location List

In [131]:
# Open the location directory, click the 'Expand All' link, then snapshot the
# resulting page source into BeautifulSoup for the extraction step below.
url = 'https://www.dhamma.org/en-US/locations/directory#directory-list'
browser.visit(url)
browser.click_link_by_text('Expand All')
directory_html = browser.html
soup = BeautifulSoup(directory_html, 'lxml')

## 3. Extract Info for Each Location & Post to MongoDB

In [132]:
# Walk every location card in the directory, pull out its identifying fields
# and links, and upsert one MongoDB document per location keyed on 'dataID'.
regions = soup.find('div', class_='regions')
locations = regions.find_all('div', class_='location')
for location in locations:
    data_id = int(location['data-id'])
    display_name = location.find('span', class_='display-name').text

    # The dhamma-name span is optional; fall back to an empty string.
    dhamma_span = location.find('span', class_='dhamma-name')
    dhamma_name = dhamma_span.text if dhamma_span else ''

    # A '#' href means the courses link is a dropdown of per-language links;
    # in that case take the first entry whose text mentions English.
    courses_link = location.find('a', class_='location-courses-link')
    if courses_link['href'] == '#':
        links = location.find('ul', class_='dropdown-menu').find_all('a')
        english_links = [anchor for anchor in links if 'English' in anchor.text]
        courses_url = english_links[0]['href']
    else:
        courses_url = courses_link['href']
    courses_url = 'https://www.dhamma.org' + courses_url

    # The map link is the element two siblings after the map-marker icon.
    marker_icon = location.find('i', class_='glyphicon-map-marker')
    map_url = marker_icon.next_sibling.next_sibling['href']

    post = {
        'dataID': data_id,
        'displayName': display_name,
        'dhammaName': dhamma_name,
        'coursesURL': courses_url,
        'mapURL': map_url
    }
    db.locations.find_one_and_update(
        {'dataID': data_id},
        {'$set': post},
        upsert=True,
    )
    print('-----------------------------------')
    print(data_id, display_name, dhamma_name, courses_url, map_url)

-----------------------------------
1304 Ethiopia Vipassana Meditation in Ethiopia https://www.dhamma.org/en/schedules/noncenter/et https://www.dhamma.org/en-US/maps#et
-----------------------------------
1306 Nairobi, Nairobi 00500 Kenya Vipassana Association https://www.dhamma.org/en/schedules/noncenter/ke https://www.dhamma.org/en-US/maps#ke
-----------------------------------
1674 Antsirabé Vipassana Madagascar https://www.dhamma.org/en/schedules/noncenter/mg https://www.dhamma.org/en-US/maps#mg
-----------------------------------
1584 Mauritius Vipassana Meditation in Mauritius https://www.dhamma.org/en/schedules/noncenter/mu https://www.dhamma.org/en-US/maps#mu
-----------------------------------
1632 Mozambique Vipassana Meditiation in Mozambique https://www.dhamma.org/en/schedules/noncenter/mz https://www.dhamma.org/en-US/maps#mz
-----------------------------------
1661 Ile de la Réunion Reunion Island Vipassana Meditation https://www.dhamma.org/en/schedules/noncenter/re https:

-----------------------------------
5 Alberta, Youngstown Dhamma Karuṇā https://www.dhamma.org/en/schedules/schkaruna https://www.dhamma.org/en-US/maps#karuna
-----------------------------------
1355 British Columbia, Duncan Dhamma Modana https://www.dhamma.org/en/schedules/schmodana https://www.dhamma.org/en-US/maps#modana
-----------------------------------
1354 British Columbia, Merritt Dhamma Surabhi https://www.dhamma.org/en/schedules/schsurabhi https://www.dhamma.org/en-US/maps#surabhi
-----------------------------------
1363 Manitoba, Winnipeg Vipassana Meditation in Manitoba https://www.dhamma.org/en/schedules/noncenter/mb.ca https://www.dhamma.org/en-US/maps#mb.ca
-----------------------------------
1359 Ontario, Egbert Dhamma Toraṇa https://www.dhamma.org/en/schedules/schtorana https://www.dhamma.org/en-US/maps#torana
-----------------------------------
1360 Quebec, Montebello Dhamma Suttama https://www.dhamma.org/en/schedules/schsuttama https://www.dhamma.org/en-US/maps#sutt

-----------------------------------
1477 Bago, Indagaw Dhamma Nidhi https://www.dhamma.org/en/schedules/schnidhi https://www.dhamma.org/en-US/maps#nidhi
-----------------------------------
1681 Kachin, Nan Kway village Dhamma Sangam https://www.dhamma.org/en/schedules/schsangam https://www.dhamma.org/en-US/maps#sangam
-----------------------------------
1474 Kaytho, Myanmar Dhamma Cetiya Paṭṭhāra https://www.dhamma.org/en/schedules/schcetiyapatthara https://www.dhamma.org/en-US/maps#cetiyapatthara
-----------------------------------
1471 Mandalay, Mahã Aung Mye Township Dhamma Maṇḍapa https://www.dhamma.org/en/schedules/schmandapa https://www.dhamma.org/en-US/maps#mandapa
-----------------------------------
1470 Mandalay, Mandalay Division Dhamma Maṇḍala https://www.dhamma.org/en/schedules/schmandala https://www.dhamma.org/en-US/maps#mandala
-----------------------------------
1473 Mandalay, Mogok Dhamma Ratana https://www.dhamma.org/en/schedules/schratana https://www.dhamma.org/en-US/

-----------------------------------
1640 Kushinagar Kushinagar NonCenter Courses https://www.dhamma.org/en/schedules/noncenter/kushinagar.in https://www.dhamma.org/en-US/maps#kushinagar.in
-----------------------------------
1551 Lucknow Dhamma Lakkhaṇa https://www.dhamma.org/en/schedules/schlakkhana https://www.dhamma.org/en-US/maps#lakkhana
-----------------------------------
1552 Sarnath Dhamma Cakka https://www.dhamma.org/en/schedules/schcakka https://www.dhamma.org/en-US/maps#cakka
-----------------------------------
1553 Sravasti Dhamma Suvatthi https://www.dhamma.org/en/schedules/schsuvatthi https://www.dhamma.org/en-US/maps#suvatthi
-----------------------------------
1558 Dehradun Dhamma Salila https://www.dhamma.org/en/schedules/schsalila https://www.dhamma.org/en-US/maps#salila
-----------------------------------
1416 Bangalore Dhamma Paphulla https://www.dhamma.org/en/schedules/schpaphulla https://www.dhamma.org/en-US/maps#paphulla
-----------------------------------
1672 N

1386 Dziadowice Dhamma Pallava https://www.dhamma.org/en/schedules/schpallava https://www.dhamma.org/en-US/maps#pallava
-----------------------------------
1619 Krutyń Vipassana Meditation in Poland https://www.dhamma.org/en/schedules/schpallava#northeast.pl https://www.dhamma.org/en-US/maps#northeast.pl
-----------------------------------
1388 Cluj-Napoca Fundatia Vipassana Romania https://www.dhamma.org/en/schedules/noncenter/ro https://www.dhamma.org/en-US/maps#ro
-----------------------------------
1390 Dhamma Dullabha  https://www.dhamma.org/en/schedules/schdullabha https://www.dhamma.org/en-US/maps#dullabha
-----------------------------------
1645 St. Petersburg St. Petersburg Non Centre Vipassana https://www.dhamma.org/en/schedules/schdullabha#st-petersburg.ru https://www.dhamma.org/en-US/maps#st-petersburg.ru
-----------------------------------
1646 Yekaterinburg Yekaterinburg NonCenter Vipassana Courses https://www.dhamma.org/en/schedules/schdullabha#yekaterinburg.ru https://w

-----------------------------------
1496 Dhamma Medini Dhamma Medinī https://www.dhamma.org/en/schedules/schmedini https://www.dhamma.org/en-US/maps#medini
-----------------------------------
1325 Fiji Vipassana Meditation in Fiji https://www.dhamma.org/en/schedules/noncenter/fj https://www.dhamma.org/en-US/maps#fj
-----------------------------------
1497 New Caledonia  https://www.dhamma.org/en/schedules/noncenter/nc https://www.dhamma.org/en-US/maps#nc
-----------------------------------
1326 Tahiti Vipassana Meditation in French Polynesia https://www.dhamma.org/en/schedules/noncenter/pf https://www.dhamma.org/en-US/maps#pf


## 4. Retrieve Geolocation for Each Location

In [134]:
# Ensure every stored location has a 'geolocation' sub-document. Documents that
# already carry one are only reported; the rest are resolved by visiting the
# location's map page and reading the coordinates off its directions link.
for location in db.locations.find():
    if 'geolocation' in location:
        # Already resolved: reuse the stored coordinates.
        stored_geo = location['geolocation']
        lat = stored_geo['lat']
        lon = stored_geo['lon']
    else:
        url = location['mapURL']
        browser.visit(url)
        soup = BeautifulSoup(browser.html, 'lxml')
        directions = soup.find('a', class_='directions-link')
        lat = float(directions['data-latitude'])
        lon = float(directions['data-longitude'])
        post = {'geolocation': {'lat': lat, 'lon': lon}}
        db.locations.find_one_and_update(
            {'_id': location['_id']},
            {'$set': post},
        )
    print('-----------------------------------')
    print(location['dataID'], location['displayName'], lat, lon)

-----------------------------------
1304 Ethiopia 9.1473 40.493046
-----------------------------------
1306 Nairobi, Nairobi 00500 -1.2833 36.8167
-----------------------------------
1674 Antsirabé -19.877999 47.029389
-----------------------------------
1584 Mauritius -20.19889 57.444562
-----------------------------------
1632 Mozambique -25.95 32.5833
-----------------------------------
1661 Ile de la Réunion -21.115141 55.536384
-----------------------------------
1642 Kigali -1.9439 30.0594
-----------------------------------
1311 Tanzania -6.36821 34.885189
-----------------------------------
1312 Uganda 1.37777 32.2874
-----------------------------------
1313 Zimbabwe -19.01328 29.146661
-----------------------------------
1301 Benguela -12.5808 13.4076
-----------------------------------
1462 Egypt 30.046506 31.235627
-----------------------------------
1307 Marrakech 31.364767 -7.79112
-----------------------------------
1588 Khartoum 12.862807 30.217636
----------------------

-----------------------------------
1483 Ayeyarwady, Irrawadi Division 17.034213 96.682999
-----------------------------------
1478 Ayeyarwady, Ma-U-Bin 16.73 95.65
-----------------------------------
1591 Ayeyarwady, Myaungmya 19.8761 96.0452
-----------------------------------
1477 Bago, Indagaw 17.32774 96.498451
-----------------------------------
1681 Kachin, Nan Kway village 25.398039 97.382402
-----------------------------------
1474 Kaytho, Myanmar 19.0718 96.682999
-----------------------------------
1471 Mandalay, Mahã Aung Mye Township 21.975 96.083333
-----------------------------------
1470 Mandalay, Mandalay Division 21.980709 96.090103
-----------------------------------
1473 Mandalay, Mogok 22.921391 96.505409
-----------------------------------
1472 Mandalay, Mogok 22.921391 96.505409
-----------------------------------
1476 Mandalay, Yechan Oo village Pyin Oo Lwin 22.034189 96.469963
-----------------------------------
1479 Mon, Than-Phyu-Za-Yet 17.062604 97.351656
--

-----------------------------------
1443 Igatpuri 19.701885 73.55337
-----------------------------------
1618 Jalgaon 20.992993 75.554254
-----------------------------------
1444 Kalyan 19.25 73.13
-----------------------------------
1445 Khadavli 19.356829 73.21894
-----------------------------------
1446 Kolhapur 16.70163 74.24636
-----------------------------------
1603 Latur 18.392848 76.615729
-----------------------------------
1677 Mahad, 402301 18.0821 73.4215
-----------------------------------
1453 Manmad 20.249901 74.43586
-----------------------------------
1449 Nagpur 21.14156 79.08629
-----------------------------------
1448 Nagpur 21.14156 79.08629
-----------------------------------
1602 Nanded 19.148 77.297
-----------------------------------
1450 Nashik 19.98463 73.79349
-----------------------------------
1454 New Mumbai 19.026785 73.032432
-----------------------------------
1601 Oza Tola, Gondia 21.45985 80.198181
-----------------------------------
1624 Palghar 19

## 5. Retrieve Courses for Each Location

In [28]:
# Load the courses page of a single stored location to explore its layout.
# Note that `location` is a cursor here, not a document as in earlier cells.
# NOTE(review): index 70 is a hard-coded pick and the query has no sort, so
# which location it selects depends on the collection's returned order.
location = db.locations.find()
sample_location = location[70]
url = sample_location['coursesURL']
browser.visit(url)
courses_html = browser.html
soup = BeautifulSoup(courses_html, 'lxml')

In [29]:
# Print each year heading followed by the dates, course type and status of
# every non-title row in the table found two siblings after that heading.
def _is_course_row(css_class):
    """Return True for any row class other than the table's title row."""
    return css_class != 'sch-header-title'


for year in soup.find_all('div', class_="sch-header-year"):
    schedule_table = year.next_sibling.next_sibling
    if not schedule_table:
        # Nothing follows this year heading; skip it.
        continue
    print()
    print(year.text.strip()[:4])  # first four characters of the heading text
    for row in schedule_table.find_all('tr', class_=_is_course_row):
        cells = row.find_all('td')
        dates, course_type, status = (
            cells[1].text.strip(),
            cells[2].text.strip(),
            cells[3].text.strip(),
        )
        print('-----------------------------------')
        print(dates, course_type, status)


2018
-----------------------------------
Nov 14
- Nov 25 10-Day In Progress
-----------------------------------
Nov 28
- Dec 09 10-Day Women - Closed
Men - Wait List
Servers - Open
-----------------------------------
Dec 09 1-Day Open
-----------------------------------
Dec 12
- Dec 23 10-Day Women - Closed
Men - Wait List
Female Server - Open
Male Server - Wait List
-----------------------------------
December 26, 2018
- January 06, 2019 10-Day Women - Closed
Men - Closed
Servers - Wait List

2019
-----------------------------------
Jan 16
- Jan 27 10-Day Women - Wait List
Men - Wait List
Servers - Open
-----------------------------------
Jan 30
- Feb 10 10-Day Women - Wait List
Men - Open
Servers - Open
-----------------------------------
Feb 14
- Feb 17 3-Day New Women - Closed
Old Women - Open
New Men - Closed
Old Men - Open
Servers - Open
-----------------------------------
Feb 22
- Feb 24 Old Student Program Applications accepted starting Nov 22
---------------------------------

### 5.1 Determine Date String Format & Convert to Date

Date string formats seen on the schedule pages:
* Nov 04 - Nov 18
* 31 December, 2017 - 03 January, 2018
* December 26, 2018 - January 06, 2019
* 20 Dec - 31 Dec
* 03 Feb

In [44]:
def date_parser(date_str, year=''):
    """Convert one scraped date string into a ``datetime.date``.

    Tokens are classified by what they look like rather than by position, so
    ``'Nov 04'``, ``'04 Nov'``, ``'31 December, 2017'`` and
    ``'December 26, 2018'`` are all accepted. Commas and repeated whitespace
    are ignored.

    Args:
        date_str: A single date (not a range). Must contain a one- or
            two-digit day and a month name or abbreviation; may contain a
            four-digit year.
        year: Year to use when ``date_str`` does not contain one. A year found
            inside ``date_str`` takes precedence over this argument.

    Returns:
        The parsed ``datetime.date``.

    Raises:
        ValueError: If ``date_str`` contains an unrecognised token (for
            example the ``'-'`` of a date range), lacks a day or month, has no
            year available, or describes an impossible date.
    """
    months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    day = None
    month = None
    # split() with no argument drops empty tokens, so doubled spaces or
    # newlines can no longer clobber the caller-supplied year with ''.
    for date_elem in date_str.replace(",", " ").split():
        month_abbr = date_elem[:3].title()
        if date_elem.isdigit() and len(date_elem) <= 2:
            # Accept '4' as well as '04'.
            day = date_elem
        elif month_abbr in months:
            month = month_abbr
        elif date_elem.isdigit() and len(date_elem) == 4:
            year = date_elem
        else:
            raise ValueError(
                "unrecognised token {!r} in date string {!r}".format(date_elem, date_str)
            )
    if day is None or month is None or not year:
        raise ValueError(
            "date string {!r} needs a day, a month and a year".format(date_str)
        )
    date = datetime.strptime(" ".join([month, day, str(year)]), '%b %d %Y').date()
    return date
date_parser('Nov 04', '2018')

datetime.date(2018, 11, 4)