# 04. Daily Scraper
> Author: [Dawn Graham](https://dawngraham.github.io/)

Get updates at the beginning of each day.

## Import Libraries

In [8]:
import pandas as pd
import numpy as np
import requests
import time
import unicodedata
import regex as re
from bs4 import BeautifulSoup

## Get updated details

After initial collection, the code below can be run to get updated numbers for exchanges, hours, last_exchange, members, offers, and requests. This information can be used to get a sense of activity over time.

In [2]:
tb_slugs = pd.read_csv('../data/timebanks_190112_000745.csv', usecols=['slug'])

timebanks = []
counter = 0
total_timebanks = tb_slugs.shape[0]

print(f'Getting {total_timebanks} timebank details... ')
for tb in range(len(tb_slugs)):
    url = f"http://community.timebanks.org/{tb_slugs['slug'][tb]}"
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')

    timebank = {}
    timebank['members'] = soup.find('div', {'class': 'views-field-field-num-users-value'}).span.text.strip().replace(',', '')
    timebank['last_exchange'] = soup.find('div', {'class': 'views-field-field-last-exchange-value'}).span.text.strip()
    timebank['offers'] = soup.find('div', {'class': 'views-field-field-active-offers-value'}).span.text.strip().replace(',', '')
    timebank['requests'] = soup.find('div', {'class': 'views-field-field-active-requests-value'}).span.text.strip().replace(',', '')
    timebank['exchanges'] = soup.find('div', {'class': 'views-field-field-num-exchanges-value'}).span.text.strip().replace(',', '')
    timebank['hours'] = soup.find('div', {'class': 'views-field-field-hours-exchanged-value'}).span.text.strip().replace(',', '')
    timebank['url'] = soup.find('div', {'class': 'views-field-markup-1'}).a['href'].strip()
    
    timebanks.append(timebank)
        
    if (counter+1) % 5 == 0:
        print(counter+1, end=' ')
    
    time.sleep(1)
    counter += 1
    if counter == total_timebanks:
        print('Done.')

timebanks = pd.DataFrame(timebanks)
        
# Export to csv
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
timebanks.to_csv(f'../data/timebank_updates_{filetime}.csv', index=False)

# Preview
print(timebanks.shape)
timebanks.head(10)

Getting 158 timebank details... 
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.


Unnamed: 0,exchanges,hours,last_exchange,members,offers,requests,url
0,1132,5997,16 hours 36 min ago,178,36,27,http://addington.timebanks.org
1,439,1375,2 years 41 weeks ago,207,0,0,http://aha.timebanks.org
2,none (start up),none (start up),none (start up),2,0,0,http://alticultura.timebanks.org
3,2,3,3 years 12 weeks ago,8,0,0,http://andersoncommunity.timebanks.org
4,1,5,1 year 30 weeks ago,8,4,4,http://ate.timebanks.org
5,none (start up),none (start up),none (start up),2,0,0,http://avl.timebanks.org
6,2179,4889,5 weeks 2 days ago,58,91,80,http://atx.timebanks.org
7,7,14,1 year 47 weeks ago,11,0,0,http://labodunautilus.timebanks.org
8,74,384,5 weeks 1 hour ago,37,2,2,http://blackhills.timebanks.org
9,199,1140,1 day 10 hours ago,176,59,50,http://ujimaboston.timebanks.org


## Get offers by category

In [3]:
timebanks = pd.read_csv('../data/timebanks_190112_000745.csv', usecols=['url'])

offers = []

counter = 0
total_timebanks = timebanks.shape[0]

print(f'Getting offers from {total_timebanks} timebanks... ')

# Iterate through all timebank directories
for timebank in range(len(timebanks)):
    url = f"{timebanks['url'][timebank]}/offers"
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')

    timebank_name = timebanks['url'][timebank].replace('.timebanks.org', '').strip('http://')
    
    try:
        # Get all parent categories
        for panel in soup.findAll('div', {'class': 'cw-panel'}):
            category = {}
            cat = panel.a
            category['count_offers'] = int(cat.span.text)
            cat.span.clear()
            parent_id = int(cat.get('href').strip('/ads?type=1&amp;cat='))
            category['cat_id'] = parent_id
            category['cat_parent'] = 'is_parent'
            category['category'] = cat.get_text()
            category['timebank'] = timebank_name
            offers.append(category)

            # Get all child categories
            for li in panel.findAll('li'):
                try:
                    category = {}
                    category['cat_id'] = int(li.a.get('href').strip('/ads?type=1&amp;cat='))
                    category['cat_parent'] = parent_id
                    category['category'] = li.a.get_text()
                    category['count_offers'] = int(li.span.text)
                    category['timebank'] = timebank_name
                    offers.append(category)
                except:
                    pass
    except:
        pass

    if (counter+1) % 5 == 0:
        print(counter+1, end=' ')
    
    time.sleep(1)
    counter += 1
    if counter == total_timebanks:
        print('Done.')

# Save to dateframe and drop duplicates
offers = pd.DataFrame(offers)
offers.drop_duplicates(inplace=True)
    
# Export to csv
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
offers.to_csv(f'../data/offers_{filetime}.csv', index=False)

# Preview
print(offers.shape)
offers.head(10)

Getting offers from 158 timebanks... 
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.


Unnamed: 0,cat_id,cat_parent,category,count_offers,timebank
0,8,is_parent,"Arts, Crafts & Music",3,addington
1,62,8,Crafts,2,addington
2,67,8,Miscellaneous,1,addington
3,65,8,Photo & Video,1,addington
4,0,is_parent,Business Services,6,addington
5,75,0,Clerical,2,addington
6,76,0,Computer Support,4,addington
7,8,0,Miscellaneous,1,addington
8,74,0,Proof Reading,1,addington
9,4,is_parent,Community Activities,5,addington


## Get requests by category

In [4]:
reqs = []

counter = 0
total_timebanks = timebanks.shape[0]

print(f'Getting requests from {total_timebanks} timebanks... ')

# Iterate through all timebank directories
for timebank in range(len(timebanks)):
    url = f"{timebanks['url'][timebank]}/requests"
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')

    timebank_name = timebanks['url'][timebank].replace('.timebanks.org', '').strip('http://')
    
    try:
        # Get all parent categories
        for panel in soup.findAll('div', {'class': 'cw-panel'}):
            category = {}
            cat = panel.a
            category['count_requests'] = int(cat.span.text)
            cat.span.clear()
            parent_id = int(cat.get('href').strip('/ads?type=2&amp;cat='))
            category['cat_id'] = parent_id
            category['cat_parent'] = 'is_parent'
            category['category'] = cat.get_text()
            category['timebank'] = timebank_name
            reqs.append(category)

            # Get all child categories
            for li in panel.findAll('li'):
                try:
                    category = {}
                    category['cat_id'] = int(li.a.get('href').strip('/ads?type=2&amp;cat='))
                    category['cat_parent'] = parent_id
                    category['category'] = li.a.get_text()
                    category['count_requests'] = int(li.span.text)
                    category['timebank'] = timebank_name
                    reqs.append(category)
                except:
                    pass
    except:
        pass

    if (counter+1) % 5 == 0:
        print(counter+1, end=' ')
    
    time.sleep(1)
    counter += 1
    if counter == total_timebanks:
        print('Done.')

# Save to dateframe and drop duplicates
reqs = pd.DataFrame(reqs)
reqs.drop_duplicates(inplace=True)
    
# Export to csv
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
reqs.to_csv(f'../data/requests_{filetime}.csv', index=False)

# Preview
print(reqs.shape)
reqs.head(10)

Getting requests from 158 timebanks... 
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.


Unnamed: 0,cat_id,cat_parent,category,count_requests,timebank
0,8,is_parent,"Arts, Crafts & Music",1,addington
1,6,8,Crafts,1,addington
2,64,8,Lessons,1,addington
3,10,is_parent,Business Services,1,addington
4,75,10,Clerical,1,addington
5,81,10,Miscellaneous,1,addington
6,4,is_parent,Community Activities,2,addington
7,34,4,Community Service,1,addington
8,39,4,Miscellaneous,1,addington
9,38,4,Work For Social Change,1,addington


## Get talents by category

In [5]:
talents = []

counter = 0
total_timebanks = timebanks.shape[0]

print(f'Getting talents from {total_timebanks} timebanks... ')

# Iterate through all timebank directories
for timebank in range(len(timebanks)):
    url = f"{timebanks['url'][timebank]}/directory"
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')

    timebank_name = timebanks['url'][timebank].replace('.timebanks.org', '').strip('http://')
    
    try:
        # Get all parent categories
        for panel in soup.findAll('div', {'class': 'cw-panel'}):
            category = {}
            cat = panel.a
            category['count_talent'] = int(cat.span.text)
            cat.span.clear()
            parent_id = int(cat.get('href').strip('/directory?category='))
            category['cat_id'] = parent_id
            category['cat_parent'] = 'is_parent'
            category['category'] = cat.get_text()
            category['timebank'] = timebank_name
            talents.append(category)

            # Get all child categories
            for li in panel.findAll('li'):
                try:
                    category = {}
                    category['cat_id'] = int(li.a.get('href').strip('/directory?category='))
                    category['cat_parent'] = parent_id
                    category['category'] = li.a.get_text()
                    category['count_talent'] = int(li.span.text)
                    category['timebank'] = timebank_name
                    talents.append(category)
                except:
                    pass
    except:
        pass

    if (counter+1) % 5 == 0:
        print(counter+1, end=' ')
    
    time.sleep(1)
    counter += 1
    if counter == total_timebanks:
        print('Done.')

# Save to dateframe and drop duplicates
talents = pd.DataFrame(talents)
talents.drop_duplicates(inplace=True)
    
# Export to csv
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
talents.to_csv(f'../data/talents_{filetime}.csv', index=False)

# Preview
print(talents.shape)
talents.head(10)

Getting talents from 158 timebanks... 
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.


Unnamed: 0,cat_id,cat_parent,category,count_talent,timebank
0,8,is_parent,"Arts, Crafts & Music",46,addington
1,61,8,Classes,8,addington
2,62,8,Crafts,19,addington
3,63,8,Entertainment,5,addington
4,64,8,Lessons,4,addington
5,67,8,Miscellaneous,2,addington
6,65,8,Photo & Video,7,addington
7,66,8,Theater,1,addington
8,10,is_parent,Business Services,79,addington
9,75,10,Clerical,14,addington


## Get all offer & request descriptions

In [9]:
listings = []

total_timebanks = timebanks.shape[0]
print(f'Getting listings from {total_timebanks} timebanks... ')

# Get type 1 (offers) and type 2 (requests)
for i in range(1, 3):
    
    print(f'Getting type {i}... ', end=' ')
    counter = 0
    
    # Iterate through all timebank directories
    for timebank in range(len(timebanks)):
        url = f"{timebanks['url'][timebank]}/ads?so=desc&o=updated&limit=100&type={i}"
        res = requests.get(url)
        soup = BeautifulSoup(res.content, 'lxml')
        
        # Get total number of listings
        get_total = soup.find('div', {'class': 'pager-state'}).text
        total_listings = re.sub(r'(Showing \d+ - \d+ of )', '', get_total)
        try:
            total_listings = (int(total_listings))
        except:
            total_listings = 0
        
        # Get number of pages needed to get all listings
        pages = total_listings//100 + 1
        
        # Iterate through pages
        for page in range(pages):
            
            url = f"{timebanks['url'][timebank]}/ads?so=desc&o=updated&limit=100&type={i}&offset={page * 100}"
            res = requests.get(url)
            soup = BeautifulSoup(res.content, 'lxml')

            timebank_name = timebanks['url'][timebank].replace('.timebanks.org', '').strip('http://')

            for row in soup.findAll('div', {'class': 'media'}):

                listing = {}

                # Get listing id
                listing['listing_id'] = int(row.find('input', {'class': ' selection-id'}).get('value'))

                # Add all parent and child categories for a listing to `cat` set
                cat = set()

                for parent in row.findAll('a', {'class': 'parent'}):
                    cat.add(int(parent.get('href').strip('/ads?cat=')))

                for child in row.findAll('a', {'class': 'child'}):
                    cat.add(int(child.get('href').strip('/ads?cat=')))

                listing['listing_cat'] = cat

                # Get listing description
                listing['description'] = row.h4.text.strip()

                # Add timebank name
                listing['timebank'] = timebank_name

                # Add listing type
                if i == 1:
                    listing['type'] = 'offer'
                else:
                    listing['type'] = 'request'

                listings.append(listing)
                
        if (counter+1) % 5 == 0:
            print(counter+1, end=' ')

        time.sleep(1)
        counter += 1
        if counter == total_timebanks:
            print('Done.')
        
# Save to dateframe
listings = pd.DataFrame(listings)

# Export to csv
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
listings.to_csv(f'../data/listings_{filetime}.csv', index=False)

# Preview
print(listings.shape)
listings.head(10)

Getting listings from 158 timebanks... 
Getting type 1...  5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.
Getting type 2...  5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 Done.


Unnamed: 0,description,listing_cat,listing_id,timebank,type
0,I offer car wash.,"{33, 34, 35, 4}",2421,addington,offer
1,Listening Ear/Chat,"{32, 3, 5, 41, 31}",2420,addington,offer
2,Tutoring,"{59, 7}",2418,addington,offer
3,On Line Dating,"{56, 5, 46, 7}",2415,addington,offer
4,Gib board,"{9, 69, 4, 37}",2409,addington,offer
5,Window cleaning,"{2, 22}",2411,addington,offer
6,Pet feeding,"{9, 2, 74, 23}",2392,addington,offer
7,Proof reading and Editing,"{10, 75}",2388,addington,offer
8,"Advice or any IT development - Website, Iot, d...","{70, 7, 9, 10, 76, 56}",2380,addington,offer
9,Fix electronics - don't throw away your electr...,"{70, 7, 9, 10, 76, 56}",2379,addington,offer
