# Vipassana Course Scraper

## 1. Import Dependencies

In [56]:
from splinter import Browser
from bs4 import BeautifulSoup
from pymongo import MongoClient
from bson.objectid import ObjectId
from datetime import datetime
from dateutil.parser import parse

### 1.1 Set Up Chrome Browser for Splinter

In [2]:
# Keyword arguments handed to splinter: the ChromeDriver binary is looked up
# under the relative name 'chromedriver.exe'.
executable_path = dict(executable_path='chromedriver.exe')
# headless=False is passed explicitly together with the driver path.
browser = Browser('chrome', headless=False, **executable_path)

### 1.2 Set Up MongoDB

In [3]:
# Connect to the MongoDB instance on localhost (port 27017).
client = MongoClient('mongodb://localhost:27017/')
# NOTE(review): the database name is spelled 'vispassana' (not 'vipassana').
# It is kept as-is because renaming it would point at a different database.
db = client['vispassana']

## 2. Navigate to Site and Expand Location List

In [71]:
# Open the location directory, click the "Expand All" link, then parse the
# browser's current HTML with the lxml parser.
url = 'https://www.dhamma.org/en-US/locations/directory#directory-list'
browser.visit(url)
browser.click_link_by_text('Expand All')
soup = BeautifulSoup(markup=browser.html, features='lxml')

## 3. Extract Info for Each Location & Post to MongoDB

In [72]:
# Walk every location entry of the parsed directory and upsert one document
# per location into db.locations, keyed on the entry's numeric data-id.
locations = soup.find('div', class_='regions').find_all('div', class_='location')
for location in locations:
    data_id = int(location['data-id'])
    display_name = location.find('span', class_='display-name').text

    # The dhamma-name span is optional; fall back to an empty string.
    dhamma_span = location.find('span', class_='dhamma-name')
    dhamma_name = dhamma_span.text if dhamma_span else ''

    # An href of '#' means the courses link is a dropdown; in that case take
    # the first dropdown entry whose text contains 'English'.
    courses_link = location.find('a', class_='location-courses-link')
    courses_url = courses_link['href']
    if courses_url == '#':
        links = location.find('ul', class_='dropdown-menu').find_all('a')
        courses_url = [link for link in links if 'English' in link.text][0]['href']
    courses_url = 'https://www.dhamma.org' + courses_url

    # The map link sits two siblings after the map-marker icon; the fragment
    # after '#' is stored as the map id.
    marker_icon = location.find('i', class_='glyphicon-map-marker')
    map_url = marker_icon.next_sibling.next_sibling['href']
    map_id = map_url.split("#")[1]

    post = dict(
        dataID=data_id,
        displayName=display_name,
        dhammaName=dhamma_name,
        coursesURL=courses_url,
        mapURL=map_url,
        mapID=map_id,
    )
    # Upsert so that re-running the cell updates existing documents in place.
    db.locations.find_one_and_update({'dataID': data_id}, {'$set': post}, upsert=True)
    print('-----------------------------------')
    print(str(data_id), display_name, dhamma_name, courses_url, map_url, map_id)

-----------------------------------
1304 Ethiopia Vipassana Meditation in Ethiopia https://www.dhamma.org/en/schedules/noncenter/et https://www.dhamma.org/en-US/maps#et et
-----------------------------------
1306 Nairobi, Nairobi 00500 Kenya Vipassana Association https://www.dhamma.org/en/schedules/noncenter/ke https://www.dhamma.org/en-US/maps#ke ke
-----------------------------------
1674 Antsirabé Vipassana Madagascar https://www.dhamma.org/en/schedules/noncenter/mg https://www.dhamma.org/en-US/maps#mg mg
-----------------------------------
1584 Mauritius Vipassana Meditation in Mauritius https://www.dhamma.org/en/schedules/noncenter/mu https://www.dhamma.org/en-US/maps#mu mu
-----------------------------------
1632 Mozambique Vipassana Meditiation in Mozambique https://www.dhamma.org/en/schedules/noncenter/mz https://www.dhamma.org/en-US/maps#mz mz
-----------------------------------
1661 Ile de la Réunion Reunion Island Vipassana Meditation https://www.dhamma.org/en/schedules/nonc

-----------------------------------
1554 Aragua, Cerca de Victoria Dhamma Veṇuvana https://www.dhamma.org/en/schedules/schvenuvana https://www.dhamma.org/en-US/maps#venuvana venuvana
-----------------------------------
1555 Occidente Vipassana Venezuela - Occidente https://www.dhamma.org/en/schedules/noncenter/merida.ve https://www.dhamma.org/en-US/maps#merida.ve merida.ve
-----------------------------------
5 Alberta, Youngstown Dhamma Karuṇā https://www.dhamma.org/en/schedules/schkaruna https://www.dhamma.org/en-US/maps#karuna karuna
-----------------------------------
1355 British Columbia, Duncan Dhamma Modana https://www.dhamma.org/en/schedules/schmodana https://www.dhamma.org/en-US/maps#modana modana
-----------------------------------
1354 British Columbia, Merritt Dhamma Surabhi https://www.dhamma.org/en/schedules/schsurabhi https://www.dhamma.org/en-US/maps#surabhi surabhi
-----------------------------------
1363 Manitoba, Winnipeg Vipassana Meditation in Manitoba https://www.

-----------------------------------
1576 Kamalanagar Dhamma Pubbottara https://www.dhamma.org/en/schedules/schpubbottara https://www.dhamma.org/en-US/maps#pubbottara pubbottara
-----------------------------------
1597 Bhubaneshwar Dhamma Bhubaneshwar https://www.dhamma.org/en/schedules/schbhubaneshwar https://www.dhamma.org/en-US/maps#bhubaneshwar bhubaneshwar
-----------------------------------
1598 Khariar Road Dhamma Utkal https://www.dhamma.org/en/schedules/schutkal https://www.dhamma.org/en-US/maps#utkal utkal
-----------------------------------
1575 Gangtok Dhamma Sikkim https://www.dhamma.org/en/schedules/schsikkim https://www.dhamma.org/en-US/maps#sikkim sikkim
-----------------------------------
1656 Gyanalakha, Upper Samdong Dhamma Sineru https://www.dhamma.org/en/schedules/schsineru https://www.dhamma.org/en-US/maps#sineru sineru
-----------------------------------
1533 Machmara Dhamma Puri https://www.dhamma.org/en/schedules/schpuri https://www.dhamma.org/en-US/maps#puri pu

-----------------------------------
1438 Akola Dhamma Anākula https://www.dhamma.org/en/schedules/schanakula https://www.dhamma.org/en-US/maps#anakula anakula
-----------------------------------
1607 Amravati Dhamma Amravati https://www.dhamma.org/en/schedules/schamravati https://www.dhamma.org/en-US/maps#amravati amravati
-----------------------------------
1439 Aurangabad Dhamma Ajantā https://www.dhamma.org/en/schedules/schajanta https://www.dhamma.org/en-US/maps#ajanta ajanta
-----------------------------------
1628 Bhandara Dhamma Bhanḍāra https://www.dhamma.org/en/schedules/schbhandara https://www.dhamma.org/en-US/maps#bhandara bhandara
-----------------------------------
1440 Bhusaval Dhamma Bhūsana https://www.dhamma.org/en/schedules/schbhusana https://www.dhamma.org/en-US/maps#bhusana bhusana
-----------------------------------
1441 Dhule Dhamma Sarovara https://www.dhamma.org/en/schedules/schsarovara https://www.dhamma.org/en-US/maps#sarovara sarovara
------------------------

-----------------------------------
1514 Dhamma Sacca  https://www.dhamma.org/en/schedules/noncenter/es#sacca https://www.dhamma.org/en-US/maps#sacca sacca
-----------------------------------
1512 North and East Area  https://www.dhamma.org/en/schedules/noncenter/es#este.es https://www.dhamma.org/en-US/maps#este.es este.es
-----------------------------------
1586 Spain Vipassana in Spain https://www.dhamma.org/en/schedules/noncenter/es https://www.dhamma.org/en-US/maps#es es
-----------------------------------
1563 West, Center and South Area  https://www.dhamma.org/en/schedules/noncenter/es#sur.es https://www.dhamma.org/en-US/maps#sur.es sur.es
-----------------------------------
1663 Oberösterreich, A-4324 Rechberg Dhamma Muditā https://www.dhamma.org/en/schedules/schmudita https://www.dhamma.org/en-US/maps#mudita mudita
-----------------------------------
1369 Wien, Vienna Austria https://www.dhamma.org/en/schedules/noncenter/austria https://www.dhamma.org/en-US/maps#austria austria

## 4. Retrieve Geolocation for Each Location

In [134]:
# Make sure every stored location has a 'geolocation' sub-document, scraping
# the map page only for locations that do not have one yet.
for location in db.locations.find():
    if 'geolocation' in location:
        # Coordinates were stored by an earlier run: reuse them for the log line.
        lat = location['geolocation']['lat']
        lon = location['geolocation']['lon']
    else:
        url = location['mapURL']
        browser.visit(url)
        soup = BeautifulSoup(browser.html, 'lxml')
        # Coordinates are read from the data-* attributes of the directions link.
        directions = soup.find('a', class_='directions-link')
        lat = float(directions['data-latitude'])
        lon = float(directions['data-longitude'])
        post = {'geolocation': {'lat': lat, 'lon': lon}}
        db.locations.find_one_and_update({'_id': location['_id']}, {'$set': post})
    print('-----------------------------------')
    print(location['dataID'], location['displayName'], str(lat), str(lon))

-----------------------------------
1304 Ethiopia 9.1473 40.493046
-----------------------------------
1306 Nairobi, Nairobi 00500 -1.2833 36.8167
-----------------------------------
1674 Antsirabé -19.877999 47.029389
-----------------------------------
1584 Mauritius -20.19889 57.444562
-----------------------------------
1632 Mozambique -25.95 32.5833
-----------------------------------
1661 Ile de la Réunion -21.115141 55.536384
-----------------------------------
1642 Kigali -1.9439 30.0594
-----------------------------------
1311 Tanzania -6.36821 34.885189
-----------------------------------
1312 Uganda 1.37777 32.2874
-----------------------------------
1313 Zimbabwe -19.01328 29.146661
-----------------------------------
1301 Benguela -12.5808 13.4076
-----------------------------------
1462 Egypt 30.046506 31.235627
-----------------------------------
1307 Marrakech 31.364767 -7.79112
-----------------------------------
1588 Khartoum 12.862807 30.217636
----------------------

-----------------------------------
1483 Ayeyarwady, Irrawadi Division 17.034213 96.682999
-----------------------------------
1478 Ayeyarwady, Ma-U-Bin 16.73 95.65
-----------------------------------
1591 Ayeyarwady, Myaungmya 19.8761 96.0452
-----------------------------------
1477 Bago, Indagaw 17.32774 96.498451
-----------------------------------
1681 Kachin, Nan Kway village 25.398039 97.382402
-----------------------------------
1474 Kaytho, Myanmar 19.0718 96.682999
-----------------------------------
1471 Mandalay, Mahã Aung Mye Township 21.975 96.083333
-----------------------------------
1470 Mandalay, Mandalay Division 21.980709 96.090103
-----------------------------------
1473 Mandalay, Mogok 22.921391 96.505409
-----------------------------------
1472 Mandalay, Mogok 22.921391 96.505409
-----------------------------------
1476 Mandalay, Yechan Oo village Pyin Oo Lwin 22.034189 96.469963
-----------------------------------
1479 Mon, Than-Phyu-Za-Yet 17.062604 97.351656
--

-----------------------------------
1443 Igatpuri 19.701885 73.55337
-----------------------------------
1618 Jalgaon 20.992993 75.554254
-----------------------------------
1444 Kalyan 19.25 73.13
-----------------------------------
1445 Khadavli 19.356829 73.21894
-----------------------------------
1446 Kolhapur 16.70163 74.24636
-----------------------------------
1603 Latur 18.392848 76.615729
-----------------------------------
1677 Mahad, 402301 18.0821 73.4215
-----------------------------------
1453 Manmad 20.249901 74.43586
-----------------------------------
1449 Nagpur 21.14156 79.08629
-----------------------------------
1448 Nagpur 21.14156 79.08629
-----------------------------------
1602 Nanded 19.148 77.297
-----------------------------------
1450 Nashik 19.98463 73.79349
-----------------------------------
1454 New Mumbai 19.026785 73.032432
-----------------------------------
1601 Oza Tola, Gondia 21.45985 80.198181
-----------------------------------
1624 Palghar 19

## 5. Retrieve Courses for Each Location

In [92]:
# Load all location documents into a list up front. A raw find() cursor can be
# iterated only once (re-running the scraping cell would silently do nothing)
# and may hit the server-side idle timeout while each page is being scraped.
locations = list(db.locations.find())

In [None]:
# Scrape each location's schedule page and sync its courses into db.courses.
#
# Fail fast when date_parser() (defined in section 5.1, further down in this
# notebook) has not been run yet; otherwise every row would raise a NameError
# that the per-row handler below would report as an unparseable row.
if not callable(globals().get('date_parser')):
    raise NameError("date_parser is not defined - run the cell in section 5.1 first")

for location in locations:
    url = location['coursesURL']
    browser.visit(url)
    soup = BeautifulSoup(browser.html, 'lxml')
    print('-----------------------------------')
    print(f"retrieving courses for {location['displayName']}")
    for year_header in soup.find_all('div', class_="sch-header-year"):
        # The course table is taken from two siblings after the year header.
        # Guard the first hop so a header without siblings is skipped instead
        # of aborting the whole run.
        first_sibling = year_header.next_sibling
        course_table = first_sibling.next_sibling if first_sibling else None
        if course_table:
            # The first four characters of the header text are used as the year.
            year = year_header.text.strip()[:4]
            # Rows whose class is 'sch-header-title' are excluded.
            for row in course_table.find_all('tr', class_=lambda x: x != 'sch-header-title'):
                try:
                    cells = row.find_all('td')
                    # The date cell holds "start" or "start - end".
                    dates = [date_parser(elem.strip(), year) for elem in cells[1].text.split('-')]
                    course_type = cells[2].text.strip()
                    status = cells[3].text.strip()
                    map_url = cells[4].find('a')['href']
                    # The map id is the URL fragment when present, otherwise
                    # the last path segment.
                    if '#' in map_url:
                        map_id = map_url.split("#")[1]
                    else:
                        map_id = map_url.split("/")[-1]
                    # Attach the course to the location owning this map id,
                    # falling back to the location whose page is being scraped.
                    matched_location = db.locations.find_one({'mapID': map_id})
                    location_id = matched_location['_id'] if matched_location else location['_id']
                    post = {
                        'location_id': location_id,
                        'type': course_type,
                        'dates': {
                            'start': dates[0]
                        }
                    }
                    if len(dates) > 1:
                        post['dates']['end'] = dates[1]
                    if 'cancel' in status.lower():
                        # Remove a stored course whose status now says cancelled.
                        db.courses.delete_one(post)
                    else:
                        db.courses.find_one_and_update(post, {'$set': post}, upsert=True)
                except Exception as exc:
                    # Best effort per row: report the row and the reason, then
                    # continue. Catching Exception (not a bare except) keeps
                    # Ctrl-C / kernel interrupt working during the long scrape.
                    print(f"Not able to enter course for year {year}, data {row}: {exc!r}")

-----------------------------------
retrieving courses for Ethiopia
-----------------------------------
retrieving courses for Nairobi, Nairobi 00500
-----------------------------------
retrieving courses for Antsirabé
-----------------------------------
retrieving courses for Mauritius
-----------------------------------
retrieving courses for Mozambique
-----------------------------------
retrieving courses for Ile de la Réunion
-----------------------------------
retrieving courses for Kigali
-----------------------------------
retrieving courses for Tanzania
-----------------------------------
retrieving courses for Uganda
-----------------------------------
retrieving courses for Zimbabwe
-----------------------------------
retrieving courses for Benguela
-----------------------------------
retrieving courses for Egypt
-----------------------------------
retrieving courses for Marrakech
-----------------------------------
retrieving courses for Khartoum
---------------------------

-----------------------------------
retrieving courses for Ayeyarwady, Myaungmya
-----------------------------------
retrieving courses for Bago, Indagaw
-----------------------------------
retrieving courses for Kachin, Nan Kway village
-----------------------------------
retrieving courses for Kaytho, Myanmar
-----------------------------------
retrieving courses for Mandalay, Mahã Aung Mye Township
-----------------------------------
retrieving courses for Mandalay, Mandalay Division
-----------------------------------
retrieving courses for Mandalay, Mogok
-----------------------------------
retrieving courses for Mandalay, Mogok
-----------------------------------
retrieving courses for Mandalay, Yechan Oo village Pyin Oo Lwin
-----------------------------------
retrieving courses for Mon, Than-Phyu-Za-Yet
-----------------------------------
retrieving courses for Sagaing, Monywa
-----------------------------------
retrieving courses for Sagaing, Yin-Ma-Bin Township
--------------

-----------------------------------
retrieving courses for Jaipur
-----------------------------------
retrieving courses for Jodhpur
Not able to enter course for year 2018, data <tr class="record even-record " id="as_courses-list-43407-row" style="padding: 0px; margin: 0px;">
<td class="comments_html-column " style="padding: 5px 4px; margin: 0px; font-family: Verdana, sans-serif; font-size: 11px; border-style: solid solid solid none; border-top-color: rgb(197, 219, 247); border-right-color: rgb(221, 221, 221); border-bottom-color: rgb(197, 219, 247); border-left-color: initial; border-top-width: 0px; border-right-width: 1px; border-bottom-width: 1px; color: rgb(51, 51, 51);">COURSE WILL END ON 3RD SEPTEMBER EVENING<br/> </td>
</tr>
-----------------------------------
retrieving courses for Mt. Abu
-----------------------------------
retrieving courses for Pushkar, Ajmer
-----------------------------------
retrieving courses for Hastinapur
-----------------------------------
retrieving 

-----------------------------------
retrieving courses for Herefordshire
-----------------------------------
retrieving courses for Herefordshire
-----------------------------------
retrieving courses for Herefordshire & Suffolk
-----------------------------------
retrieving courses for Northern England
-----------------------------------
retrieving courses for Scotland, Applecross
-----------------------------------
retrieving courses for Suffolk, Stowmarket, Suffolk
-----------------------------------
retrieving courses for Zagreb
-----------------------------------
retrieving courses for Crete
-----------------------------------
retrieving courses for Firenze, Lutirano
-----------------------------------
retrieving courses for Macedonia
-----------------------------------
retrieving courses for Rogil
-----------------------------------
retrieving courses for Belgrade
-----------------------------------
retrieving courses for Ljubljana
-----------------------------------
retrieving c

### 5.1 Determine Date String Format & Convert to Date

Date String Formats
* Nov 04 - Nov 18
* 31 December, 2017 - 03 January, 2018
* 20 Dec - 31 Dec
* 03 Feb

In [58]:
def date_parser(date_str, year):
    """Parse one schedule date string, adding ``year`` when it has none.

    Handles the formats listed above this cell, e.g. ``"Nov 04"``,
    ``"20 Dec"`` and ``"31 December, 2017"``.

    Args:
        date_str: Date text for a single date (no ``" - "`` range).
        year: Year to append when ``date_str`` carries no year of its own;
            a string or an int.

    Returns:
        Whatever ``parse`` returns for the (possibly year-suffixed) string.
    """
    tokens = date_str.replace(",", "").split()
    # Only a four-DIGIT token counts as a year. Checking length alone would
    # mistake four-letter month names ("June", "July", "Sept") for a year and
    # drop the supplied year.
    has_year = any(len(token) == 4 and token.isdigit() for token in tokens)
    if has_year:
        return parse(date_str)
    return parse(f"{date_str} {year}")