# 01. Scraper: Directory
> Author: [Dawn Graham](https://dawngraham.github.io/)

Scrape every page of the TimeBanks.org directory to build an initial listing of all timebanks.

## Import libraries

In [1]:
import pandas as pd
import requests
import time
import unicodedata
import regex as re
from bs4 import BeautifulSoup

## Get slugs

In [2]:
# Collect one {'name': ..., 'slug': ...} record per timebank listed in the
# community directory. `tb_slugs` is consumed by the cells further down.
tb_slugs = []

# Number of directory pages to request (pages are 0-indexed in the URL).
DIRECTORY_PAGES = 16
# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 30

print('Getting timebank slugs... Directory page # ')

# Cycle through all pages in directory
for page in range(DIRECTORY_PAGES):
    url = f'http://community.timebanks.org/directory?page={page}&js=1&order=title_1&sort=asc'
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'lxml')

    table = soup.find('table')
    body = table.find('tbody') if table is not None else None
    if body is None:
        # Without a listing table there is nothing to read on this page.
        print(f'\nNo listing table on page {page + 1}; stopping.')
        break

    for row in body.find_all('tr'):
        # Look the link up once; it carries both the display name and the slug.
        link = row.find('a')
        if link is None or not link.get('href'):
            continue
        tb_slugs.append({
            'name': link.text.strip(),
            'slug': link['href'].strip('/'),
        })

    # Progress indicator: 1-based page number.
    print(page + 1, end=' ')

    # Pause between requests to go easy on the server.
    time.sleep(1)

Getting timebank slugs... Directory page # 
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 

In [3]:
# Preview the first five name/slug records collected above
pd.DataFrame(data=tb_slugs).head(n=5)

Unnamed: 0,name,slug
0,Addington TimeBank,timebanks/addington-timebank
1,AHA Time Bank (Alaskans Helping Alaskans),timebanks/aha-time-bank-alaskans-helping-alaskans
2,Alticultura,timebanks/alticultura
3,Anderson Community Timebank,timebanks/anderson-community-timebank
4,Appalachian Time Exchange,timebanks/appalachian-time-exchange


## Get timebank details

In [7]:
# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 30

# Values of the derived 'timebank' column that are demo/test sites and must
# not appear in the saved directory.
EXCLUDED_TIMEBANKS = ['democw3', 'test-launch-2018-07-28-2058']


def _optional_text(page, css_class, child):
    """Return the stripped text of the first `child` tag inside the <div>
    with class `css_class`, or None when the div or the child tag is missing.
    """
    div = page.find('div', {'class': css_class})
    node = div.find(child) if div is not None else None
    return node.text.strip() if node is not None else None


# One dict per timebank; turned into a DataFrame once the loop has finished.
timebank_records = []
total_timebanks = len(tb_slugs)

print(f'Getting {total_timebanks} timebank details... ')
for count, entry in enumerate(tb_slugs, start=1):
    url = f"http://community.timebanks.org/{entry['slug']}"
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'lxml')

    timebank = {}
    timebank['name'] = entry['name']
    timebank['slug'] = entry['slug']
    timebank['focus'] = soup.find('div', {'class': 'views-field-name'}).span.text.strip()

    # Optional fields are only stored when the page actually has them, so
    # the key stays absent (NaN in the DataFrame) otherwise.
    sponsor = _optional_text(soup, 'views-field-field-tb-sponsor-value', 'span')
    if sponsor is not None:
        timebank['sponsor'] = sponsor

    timebank['url'] = soup.find('div', {'class': 'views-field-markup-1'}).a['href'].strip()

    url_2 = _optional_text(soup, 'views-field-field-custom-url-url', 'a')
    if url_2 is not None:
        timebank['url_2'] = url_2

    timebank['address'] = soup.find('div', {'class': 'views-field-street'}).span.get_text(', ').strip()
    timebank['postal'] = soup.find('div', {'class': 'views-field-postal-code'}).span.text.strip()
    timebank['country'] = soup.find('div', {'class': 'views-field-country'}).span.text.strip()

    phone = _optional_text(soup, 'views-field-phone', 'span')
    if phone is not None:
        timebank['phone'] = phone

    timebank['notes'] = soup.find('div', {'class': 'views-field-body'}).div.text.replace('\xa0',' ').replace('\n', ' ')
    # Short identifier: the URL without its scheme and the shared domain suffix.
    timebank['timebank'] = re.sub(r'^https?://', '', timebank['url']).replace('.timebanks.org', '')

    # Get mission statement (best effort: skipped when the page cannot be
    # fetched, returns an HTTP error, or has no 'page-content' div).
    try:
        mission_res = requests.get(f"{timebank['url']}/mission", timeout=REQUEST_TIMEOUT)
        mission_res.raise_for_status()
        mission_soup = BeautifulSoup(mission_res.content, 'lxml')

        timebank['mission'] = mission_soup.find('div', {'class': 'page-content'}).text.strip().replace('\xa0',' ').replace('\n', ' ').replace('\r', '')
    except (requests.RequestException, AttributeError):
        pass

    timebank_records.append(timebank)

    # Progress indicator: print every fifth timebank processed.
    if count % 5 == 0:
        print(count, end=' ')

    # Pause between timebanks to go easy on the server.
    time.sleep(1)

print('Done.')

# Save to dataframe
timebanks = pd.DataFrame(timebank_records)

# Do not include demo or test-launch in directory
timebanks = timebanks[~timebanks['timebank'].isin(EXCLUDED_TIMEBANKS)]

# Save to .csv
timebanks.to_csv('../data/directory.csv', index=False)

# Preview
timebanks.head()

Getting 158 timebank details... 
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.


Unnamed: 0,address,country,focus,mission,name,notes,phone,postal,slug,sponsor,timebank,url,url_2
0,"7 Dickens St, Addington",New Zealand,,Addington TimeBank: Serving Southern Christchu...,Addington TimeBank,This TimeBank became live on 1 February 2012. ...,64 3 3381613,"Christchurch, Canterbury 8024",timebanks/addington-timebank,Addington Community House (Inc),addington,http://addington.timebanks.org,
1,"331 Sterling Highway, PO BOX 3493 (mailing add...",United States,,Our mission is to strengthen our community b...,AHA Time Bank (Alaskans Helping Alaskans),,907-299-2060,"Homer, Alaska 99603",timebanks/aha-time-bank-alaskans-helping-alaskans,,aha,http://aha.timebanks.org,
2,"Entrada al Refugio Tierra Colorada Baja , PO B...",Guatemala,,Build adaptive capacity for integrated sustain...,Alticultura,,1150230873516,"Quetzaltenango, Quetzaltenango 9001",timebanks/alticultura,Alticultura,alticultura,http://alticultura.timebanks.org,
3,2889 E. Center St,United States,,Neighbors helping neighbors by utilizing their...,Anderson Community Timebank,,530-365-6183,"Anderson, California 96007",timebanks/anderson-community-timebank,Shasta Thrive,andersoncommunity,http://andersoncommunity.timebanks.org,
4,298 School Circle,United States,,To empower our community in sharing its gifts ...,Appalachian Time Exchange,,(706) 530-0114,"Blairsville, Georgia 30512",timebanks/appalachian-time-exchange,Unity of Blairsville,ate,http://ate.timebanks.org,http://www.unityofblairsville.org/ate
