# 02. Timebanks: Daily Scraper
> Author: [Dawn Graham](https://dawngraham.github.io/)

Get updates from TimeBanks.org at the beginning of each day:
- Numbers for exchanges, hours, members, offers, requests, last exchange
- Offers, requests, and talents by category
- All offer and request listings

On first initial run, set `header=True` to establish header in all .csv files. After, set `header=False` to append to existing files.

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import unicodedata
import regex as re
from bs4 import BeautifulSoup

## Read in timebank slugs and url's

In [2]:
timebanks = pd.read_csv('../data/directory.csv', usecols=['slug', 'url'])
total_timebanks = timebanks.shape[0]

## Get updated details

After initial collection, the code below can be run to get updated numbers for exchanges, hours, last_exchange, members, offers, and requests. This information can be used to get a sense of activity over time.

In [3]:
updates = []
counter = 0

print(f'Getting {total_timebanks} timebank details... ')
for timebank in range(len(timebanks)):
    url = f"http://community.timebanks.org/{timebanks['slug'][timebank]}"
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')

    update = {}
    update['members'] = soup.find('div', {'class': 'views-field-field-num-users-value'}).span.text.strip().replace(',', '')
    update['last_exchange'] = soup.find('div', {'class': 'views-field-field-last-exchange-value'}).span.text.strip()
    update['offers'] = soup.find('div', {'class': 'views-field-field-active-offers-value'}).span.text.strip().replace(',', '')
    update['requests'] = soup.find('div', {'class': 'views-field-field-active-requests-value'}).span.text.strip().replace(',', '')
    update['exchanges'] = soup.find('div', {'class': 'views-field-field-num-exchanges-value'}).span.text.strip().replace(',', '')
    update['hours'] = soup.find('div', {'class': 'views-field-field-hours-exchanged-value'}).span.text.strip().replace(',', '')
    update['timebank'] = re.sub('http://','', timebanks['url'][timebank]).replace('.timebanks.org', '')
    
    updates.append(update)
        
    if (counter+1) % 5 == 0:
        print(counter+1, end=' ')
    
    time.sleep(1)
    counter += 1
    if counter == total_timebanks:
        print('Done.')

# Convert to dataframe
updates = pd.DataFrame(updates)

# Add timestamp
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
updates['timestamp'] = pd.to_datetime(filetime, format='%y%m%d_%H%M%S')

# Append results to .csv file
with open('../data/updates.csv', 'a') as file:
    updates.to_csv(file, header=False, index=False)

# Preview
print(updates.shape)
updates.head(10)

Getting 156 timebank details... 
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.
(156, 8)


Unnamed: 0,exchanges,hours,last_exchange,members,offers,requests,timebank,timestamp
0,1139,6029,2 days 10 hours ago,179,37,34,addington,2019-02-10 09:46:44
1,439,1375,2 years 44 weeks ago,207,0,0,aha,2019-02-10 09:46:44
2,none (start up),none (start up),none (start up),2,0,0,alticultura,2019-02-10 09:46:44
3,2,3,3 years 15 weeks ago,8,0,0,andersoncommunity,2019-02-10 09:46:44
4,1,5,1 year 33 weeks ago,8,4,4,ate,2019-02-10 09:46:44
5,none (start up),none (start up),none (start up),2,0,0,avl,2019-02-10 09:46:44
6,2179,4889,8 weeks 18 hours ago,57,90,81,atx,2019-02-10 09:46:44
7,7,14,1 year 49 weeks ago,11,0,0,labodunautilus,2019-02-10 09:46:44
8,74,384,7 weeks 5 days ago,37,2,2,blackhills,2019-02-10 09:46:44
9,210,1176,4 days 27 min ago,181,61,50,ujimaboston,2019-02-10 09:46:44


## Get offers by category

In [4]:
offers = []

counter = 0

print(f'Getting offers from {total_timebanks} timebanks... ')

# Iterate through all timebank directories
for timebank in range(len(timebanks)):
    url = f"{timebanks['url'][timebank]}/offers"
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')

    timebank_name = re.sub('http://','', timebanks['url'][timebank]).replace('.timebanks.org', '')
    
    try:
        # Get all parent categories
        for panel in soup.findAll('div', {'class': 'cw-panel'}):
            category = {}
            cat = panel.a
            category['count_offers'] = int(cat.span.text)
            cat.span.clear()
            parent_id = int(cat.get('href').strip('/ads?type=1&amp;cat='))
            category['cat_id'] = parent_id
            category['cat_parent'] = 'is_parent'
            category['category'] = cat.get_text()
            category['timebank'] = timebank_name
            offers.append(category)

            # Get all child categories
            for li in panel.findAll('li'):
                try:
                    category = {}
                    category['cat_id'] = int(li.a.get('href').strip('/ads?type=1&amp;cat='))
                    category['cat_parent'] = parent_id
                    category['category'] = li.a.get_text()
                    category['count_offers'] = int(li.span.text)
                    category['timebank'] = timebank_name
                    offers.append(category)
                except:
                    pass
    except:
        pass

    if (counter+1) % 5 == 0:
        print(counter+1, end=' ')
    
    time.sleep(1)
    counter += 1
    if counter == total_timebanks:
        print('Done.')

# Save to dateframe and drop duplicates
offers = pd.DataFrame(offers)
offers.drop_duplicates(inplace=True)
    
# Add timestamp
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
offers['timestamp'] = pd.to_datetime(filetime, format='%y%m%d_%H%M%S')

# Append results to .csv file
with open('../data/offers.csv', 'a') as file:
    offers.to_csv(file, header=False, index=False)

# Preview
print(offers.shape)
offers.head(10)

Getting offers from 156 timebanks... 
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.
(2574, 6)


Unnamed: 0,cat_id,cat_parent,category,count_offers,timebank,timestamp
0,8,is_parent,"Arts, Crafts & Music",3,addington,2019-02-10 09:54:30
1,62,8,Crafts,2,addington,2019-02-10 09:54:30
2,67,8,Miscellaneous,1,addington,2019-02-10 09:54:30
3,65,8,Photo & Video,1,addington,2019-02-10 09:54:30
4,0,is_parent,Business Services,6,addington,2019-02-10 09:54:30
5,75,0,Clerical,2,addington,2019-02-10 09:54:30
6,76,0,Computer Support,5,addington,2019-02-10 09:54:30
7,74,0,Proof Reading,1,addington,2019-02-10 09:54:30
8,4,is_parent,Community Activities,5,addington,2019-02-10 09:54:30
9,33,4,Clean-up / Recycling,1,addington,2019-02-10 09:54:30


## Get requests by category

In [5]:
reqs = []

counter = 0

print(f'Getting requests from {total_timebanks} timebanks... ')

# Iterate through all timebank directories
for timebank in range(len(timebanks)):
    url = f"{timebanks['url'][timebank]}/requests"
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')

    timebank_name = re.sub('http://','', timebanks['url'][timebank]).replace('.timebanks.org', '')
    
    try:
        # Get all parent categories
        for panel in soup.findAll('div', {'class': 'cw-panel'}):
            category = {}
            cat = panel.a
            category['count_requests'] = int(cat.span.text)
            cat.span.clear()
            parent_id = int(cat.get('href').strip('/ads?type=2&amp;cat='))
            category['cat_id'] = parent_id
            category['cat_parent'] = 'is_parent'
            category['category'] = cat.get_text()
            category['timebank'] = timebank_name
            reqs.append(category)

            # Get all child categories
            for li in panel.findAll('li'):
                try:
                    category = {}
                    category['cat_id'] = int(li.a.get('href').strip('/ads?type=2&amp;cat='))
                    category['cat_parent'] = parent_id
                    category['category'] = li.a.get_text()
                    category['count_requests'] = int(li.span.text)
                    category['timebank'] = timebank_name
                    reqs.append(category)
                except:
                    pass
    except:
        pass

    if (counter+1) % 5 == 0:
        print(counter+1, end=' ')
    
    time.sleep(1)
    counter += 1
    if counter == total_timebanks:
        print('Done.')

# Save to dateframe and drop duplicates
reqs = pd.DataFrame(reqs)
reqs.drop_duplicates(inplace=True)
    
# Add timestamp
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
reqs['timestamp'] = pd.to_datetime(filetime, format='%y%m%d_%H%M%S')

# Append results to .csv file
with open('../data/requests.csv', 'a') as file:
    reqs.to_csv(file, header=False, index=False)

# Preview
print(reqs.shape)
reqs.head(10)

Getting requests from 156 timebanks... 
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.
(1424, 6)


Unnamed: 0,cat_id,cat_parent,category,count_requests,timebank,timestamp
0,8,is_parent,"Arts, Crafts & Music",1,addington,2019-02-10 10:01:23
1,6,8,Crafts,1,addington,2019-02-10 10:01:23
2,64,8,Lessons,1,addington,2019-02-10 10:01:23
3,10,is_parent,Business Services,1,addington,2019-02-10 10:01:23
4,75,10,Clerical,1,addington,2019-02-10 10:01:23
5,81,10,Miscellaneous,1,addington,2019-02-10 10:01:23
6,4,is_parent,Community Activities,4,addington,2019-02-10 10:01:23
7,34,4,Community Service,1,addington,2019-02-10 10:01:23
8,36,4,Help Our TimeBank!,1,addington,2019-02-10 10:01:23
9,39,4,Miscellaneous,1,addington,2019-02-10 10:01:23


## Get talents by category
Note: Number of Timebankers' talents on timebank homepage does not correspond to number when you click on category link.

In [6]:
talents = []

counter = 0

print(f'Getting talents from {total_timebanks} timebanks... ')

# Iterate through all timebank directories
for timebank in range(len(timebanks)):
    url = f"{timebanks['url'][timebank]}/directory"
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')

    timebank_name = re.sub('http://','', timebanks['url'][timebank]).replace('.timebanks.org', '')
    
    try:
        # Get all parent categories
        for panel in soup.findAll('div', {'class': 'cw-panel'}):
            category = {}
            cat = panel.a
            category['count_talent'] = int(cat.span.text)
            cat.span.clear()
            parent_id = int(cat.get('href').strip('/directory?category='))
            category['cat_id'] = parent_id
            category['cat_parent'] = 'is_parent'
            category['category'] = cat.get_text()
            category['timebank'] = timebank_name
            talents.append(category)

            # Get all child categories
            for li in panel.findAll('li'):
                try:
                    category = {}
                    category['cat_id'] = int(li.a.get('href').strip('/directory?category='))
                    category['cat_parent'] = parent_id
                    category['category'] = li.a.get_text()
                    category['count_talent'] = int(li.span.text)
                    category['timebank'] = timebank_name
                    talents.append(category)
                except:
                    pass
    except:
        pass

    if (counter+1) % 5 == 0:
        print(counter+1, end=' ')
    
    time.sleep(1)
    counter += 1
    if counter == total_timebanks:
        print('Done.')

# Save to dateframe and drop duplicates
talents = pd.DataFrame(talents)
talents.drop_duplicates(inplace=True)
    
# Add timestamp
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
talents['timestamp'] = pd.to_datetime(filetime, format='%y%m%d_%H%M%S')

# Append results to .csv file
with open('../data/talents.csv', 'a') as file:
    talents.to_csv(file, header=False, index=False)

# Preview
print(talents.shape)
talents.head(10)

Getting talents from 156 timebanks... 
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.
(8460, 6)


Unnamed: 0,cat_id,cat_parent,category,count_talent,timebank,timestamp
0,8,is_parent,"Arts, Crafts & Music",46,addington,2019-02-10 10:07:25
1,61,8,Classes,8,addington,2019-02-10 10:07:25
2,62,8,Crafts,19,addington,2019-02-10 10:07:25
3,63,8,Entertainment,5,addington,2019-02-10 10:07:25
4,64,8,Lessons,4,addington,2019-02-10 10:07:25
5,67,8,Miscellaneous,2,addington,2019-02-10 10:07:25
6,65,8,Photo & Video,7,addington,2019-02-10 10:07:25
7,66,8,Theater,1,addington,2019-02-10 10:07:25
8,10,is_parent,Business Services,79,addington,2019-02-10 10:07:25
9,75,10,Clerical,14,addington,2019-02-10 10:07:25


## Get all offer & request listings
**Order to capture listings:**  
- For both listings types (1 = offers, 2 = requests)
    - get all timebanks
        - get all pages for each timebank
            - get all listings on each page

In [7]:
listings = []

print(f'Getting listings from {total_timebanks} timebanks... ')

# Get type 1 (offers) and type 2 (requests)
for i in range(1, 3):
    
    print(f'Getting type {i}... ', end=' ')
    counter = 0
    
    # Iterate through all timebank directories
    for timebank in range(len(timebanks)):
        url = f"{timebanks['url'][timebank]}/ads?so=desc&o=updated&limit=100&type={i}"
        res = requests.get(url)
        soup = BeautifulSoup(res.content, 'lxml')
        
        try:
            # Get total number of listings
            get_total = soup.find('div', {'class': 'pager-state'}).text
            total_listings = re.sub(r'(Showing \d+ - \d+ of )', '', get_total)
            try:
                total_listings = (int(total_listings))
            except:
                total_listings = 0

            # Get number of pages needed to get all listings
            pages = total_listings//100 + 1

            # Iterate through pages
            for page in range(pages):

                url = f"{timebanks['url'][timebank]}/ads?so=desc&o=updated&limit=100&type={i}&offset={page * 100}"
                res = requests.get(url)
                soup = BeautifulSoup(res.content, 'lxml')

                timebank_name = re.sub('http://','', timebanks['url'][timebank]).replace('.timebanks.org', '')

                for row in soup.findAll('div', {'class': 'media'}):

                    listing = {}

                    # Get listing id
                    listing['listing_id'] = int(row.find('input', {'class': ' selection-id'}).get('value'))

                    # Add all parent and child categories for a listing to `cat` set
                    cat = set()

                    for parent in row.findAll('a', {'class': 'parent'}):
                        cat.add(int(parent.get('href').strip('/ads?cat=')))

                    for child in row.findAll('a', {'class': 'child'}):
                        cat.add(int(child.get('href').strip('/ads?cat=')))

                    listing['listing_cat'] = cat

                    # Get listing description
                    listing['description'] = row.h4.text.strip()

                    # Add timebank name
                    listing['timebank'] = timebank_name

                    # Add listing type
                    if i == 1:
                        listing['type'] = 'offer'
                    else:
                        listing['type'] = 'request'

                    listings.append(listing)
                    
        except:
            pass
                
        if (counter+1) % 5 == 0:
            print(counter+1, end=' ')

        time.sleep(1)
        counter += 1
        if counter == total_timebanks:
            print('Done.')
        
# Convert to dataframe
listings = pd.DataFrame(listings)

# Add timestamp
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
listings['timestamp'] = pd.to_datetime(filetime, format='%y%m%d_%H%M%S')

# Append results to .csv file
with open('../data/listings.csv', 'a') as file:
    listings.to_csv(file, header=False, index=False)

# Preview
print(listings.shape)
listings.head(10)

Getting listings from 156 timebanks... 
Getting type 1...  5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.
Getting type 2...  5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.
(5588, 6)


Unnamed: 0,description,listing_cat,listing_id,timebank,type,timestamp
0,Proof read articles for spelling,"{10, 75, 76, 174}",2432,addington,offer,2019-02-10 10:26:33
1,Childminding,"{2, 3, 19, 24, 29}",2428,addington,offer,2019-02-10 10:26:33
2,babysitting,"{2, 19}",2348,addington,offer,2019-02-10 10:26:33
3,I offer car wash.,"{33, 34, 35, 4}",2421,addington,offer,2019-02-10 10:26:33
4,Listening Ear/Chat,"{32, 3, 5, 41, 31}",2420,addington,offer,2019-02-10 10:26:33
5,Tutoring,"{59, 7}",2418,addington,offer,2019-02-10 10:26:33
6,On Line Dating,"{56, 5, 46, 7}",2415,addington,offer,2019-02-10 10:26:33
7,Gib board,"{9, 69, 4, 37}",2409,addington,offer,2019-02-10 10:26:33
8,Pet feeding,"{9, 2, 74, 23}",2392,addington,offer,2019-02-10 10:26:33
9,Proof reading and Editing,"{10, 75}",2388,addington,offer,2019-02-10 10:26:33
