# 02. Scraper: Categories
> Author: [Dawn Graham](https://dawngraham.github.io/)

Get all of the categories from timebank directory pages to match with talents, offers, and requests.

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import requests
import time
from bs4 import BeautifulSoup

Note: Number of Timebankers' talents on timebank homepage does not correspond to number when you click on category link.

## Get timebank url's

In [2]:
timebanks = pd.read_csv('./data/timebanks_190112_000745.csv', usecols=['url'])
timebanks.head()

Unnamed: 0,url
0,http://addington.timebanks.org
1,http://aha.timebanks.org
2,http://alticultura.timebanks.org
3,http://andersoncommunity.timebanks.org
4,http://ate.timebanks.org


## Get offers

In [3]:
offers = []

counter = 0
total_timebanks = timebanks.shape[0]

print(f'Getting offers from {total_timebanks} timebanks... ')

# Iterate through all timebank directories
for timebank in range(len(timebanks)):
    url = f"{timebanks['url'][timebank]}/offers"
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')

    timebank_name = timebanks['url'][timebank].replace('.timebanks.org', '').strip('http://')
    
    try:
        # Get all parent categories
        for panel in soup.findAll('div', {'class': 'cw-panel'}):
            category = {}
            cat = panel.a
            category['count_offers'] = int(cat.span.text)
            cat.span.clear()
            parent_id = int(cat.get('href').strip('/ads?type=1&amp;cat='))
            category['cat_id'] = parent_id
            category['cat_parent'] = 'is_parent'
            category['category'] = cat.get_text()
            category['timebank'] = timebank_name
            offers.append(category)

            # Get all child categories
            for li in panel.findAll('li'):
                try:
                    category = {}
                    category['cat_id'] = int(li.a.get('href').strip('/ads?type=1&amp;cat='))
                    category['cat_parent'] = parent_id
                    category['category'] = li.a.get_text()
                    category['count_offers'] = int(li.span.text)
                    category['timebank'] = timebank_name
                    offers.append(category)
                except:
                    pass
    except:
        pass

    print(counter + 1, end=' ')
    
    time.sleep(1)
    counter += 1

# Save to dateframe and drop duplicates
offers = pd.DataFrame(offers)
offers.drop_duplicates(inplace=True)
    
# Export to csv
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
offers.to_csv(f'./data/offers_{filetime}.csv', index=False)

Getting offers from 158 timebanks... 
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 

In [4]:
offers.head(10)

Unnamed: 0,cat_id,cat_parent,category,count_offers,timebank
0,8,is_parent,"Arts, Crafts & Music",3,addington
1,62,8,Crafts,2,addington
2,67,8,Miscellaneous,1,addington
3,65,8,Photo & Video,1,addington
4,0,is_parent,Business Services,6,addington
5,75,0,Clerical,2,addington
6,76,0,Computer Support,4,addington
7,8,0,Miscellaneous,1,addington
8,74,0,Proof Reading,1,addington
9,4,is_parent,Community Activities,5,addington


## Get requests

In [5]:
reqs = []

counter = 0
total_timebanks = timebanks.shape[0]

print(f'Getting requests from {total_timebanks} timebanks... ')

# Iterate through all timebank directories
for timebank in range(len(timebanks)):
    url = f"{timebanks['url'][timebank]}/requests"
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')

    timebank_name = timebanks['url'][timebank].replace('.timebanks.org', '').strip('http://')
    
    try:
        # Get all parent categories
        for panel in soup.findAll('div', {'class': 'cw-panel'}):
            category = {}
            cat = panel.a
            category['count_requests'] = int(cat.span.text)
            cat.span.clear()
            parent_id = int(cat.get('href').strip('/ads?type=2&amp;cat='))
            category['cat_id'] = parent_id
            category['cat_parent'] = 'is_parent'
            category['category'] = cat.get_text()
            category['timebank'] = timebank_name
            reqs.append(category)

            # Get all child categories
            for li in panel.findAll('li'):
                try:
                    category = {}
                    category['cat_id'] = int(li.a.get('href').strip('/ads?type=2&amp;cat='))
                    category['cat_parent'] = parent_id
                    category['category'] = li.a.get_text()
                    category['count_requests'] = int(li.span.text)
                    category['timebank'] = timebank_name
                    reqs.append(category)
                except:
                    pass
    except:
        pass

    print(counter + 1, end=' ')
    
    time.sleep(1)
    counter += 1

# Save to dateframe and drop duplicates
reqs = pd.DataFrame(reqs)
reqs.drop_duplicates(inplace=True)
    
# Export to csv
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
reqs.to_csv(f'./data/requests_{filetime}.csv', index=False)

Getting requests from 158 timebanks... 
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 

In [6]:
reqs.head(10)

Unnamed: 0,cat_id,cat_parent,category,count_requests,timebank
0,8,is_parent,"Arts, Crafts & Music",1,addington
1,6,8,Crafts,1,addington
2,64,8,Lessons,1,addington
3,10,is_parent,Business Services,1,addington
4,75,10,Clerical,1,addington
5,81,10,Miscellaneous,1,addington
6,4,is_parent,Community Activities,2,addington
7,34,4,Community Service,1,addington
8,39,4,Miscellaneous,1,addington
9,38,4,Work For Social Change,1,addington


## Get talents

In [9]:
talents = []

counter = 0
total_timebanks = timebanks.shape[0]

print(f'Getting talents from {total_timebanks} timebanks... ')

# Iterate through all timebank directories
for timebank in range(len(timebanks)):
    url = f"{timebanks['url'][timebank]}/directory"
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'lxml')

    timebank_name = timebanks['url'][timebank].replace('.timebanks.org', '').strip('http://')
    
    try:
        # Get all parent categories
        for panel in soup.findAll('div', {'class': 'cw-panel'}):
            category = {}
            cat = panel.a
            category['count_talent'] = int(cat.span.text)
            cat.span.clear()
            parent_id = int(cat.get('href').strip('/directory?category='))
            category['cat_id'] = parent_id
            category['cat_parent'] = 'is_parent'
            category['category'] = cat.get_text()
            category['timebank'] = timebank_name
            talents.append(category)

            # Get all child categories
            for li in panel.findAll('li'):
                try:
                    category = {}
                    category['cat_id'] = int(li.a.get('href').strip('/directory?category='))
                    category['cat_parent'] = parent_id
                    category['category'] = li.a.get_text()
                    category['count_talent'] = int(li.span.text)
                    category['timebank'] = timebank_name
                    talents.append(category)
                except:
                    pass
    except:
        pass

    print(counter + 1, end=' ')
    
    time.sleep(1)
    counter += 1

# Save to dateframe and drop duplicates
talents = pd.DataFrame(talents)
talents.drop_duplicates(inplace=True)
    
# Export to csv
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
talents.to_csv(f'./data/talents_{filetime}.csv', index=False)

Getting talents from 158 timebanks... 
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 

In [10]:
talents.head(10)

Unnamed: 0,cat_id,cat_parent,category,count_talent,timebank
0,8,is_parent,"Arts, Crafts & Music",46,addington
1,61,8,Classes,8,addington
2,62,8,Crafts,19,addington
3,63,8,Entertainment,5,addington
4,64,8,Lessons,4,addington
5,67,8,Miscellaneous,2,addington
6,65,8,Photo & Video,7,addington
7,66,8,Theater,1,addington
8,10,is_parent,Business Services,79,addington
9,75,10,Clerical,14,addington


In [None]:
talents.pivot_table(values='count_talent', index=['cat_id', 'cat_parent', 'category'], columns=['timebank'], aggfunc=np.sum)

## Drop duplicate rows
Challenges: Some child categories have the same id and name but are assigned to different parent categories.

In [None]:
# Drop duplicates of id 2: "help at home"
categories.drop(26, inplace=True)
categories.drop(76, inplace=True)

In [None]:
# Drop duplicates of id 3: "companionship / socializing / home care"
categories.drop(218, inplace=True)
categories.drop(1319, inplace=True)

In [None]:
# Drop duplicates of id 5: "wellness"
categories.drop(1008, inplace=True)

In [None]:
# Drop duplicates of id 7: "education"
categories.drop(1759, inplace=True)

In [None]:
# Drop duplicates of id 8: "arts, crafts, music"
categories.drop(32, inplace=True)
categories.drop(1920, inplace=True)

In [None]:
# Drop duplicates of id 9: "home care / repairs"
categories.drop(99, inplace=True)
categories.drop(873, inplace=True)

In [None]:
# Drop duplicates of id 13: "local rides"
categories.drop(374, inplace=True)

In [None]:
# Drop duplicates of id 14: "long distance rides"
categories.drop(375, inplace=True)

In [None]:
# Drop duplicates of id 15: "medical rides"
categories.drop(376, inplace=True)

In [None]:
# Drop duplicates of id 16: "train / bus / airport rides"
categories.drop(378, inplace=True)

In [None]:
# Drop duplicates of id 20: "cooking / sewing"
categories.drop(78, inplace=True)
categories.drop(344, inplace=True)
categories.drop(857, inplace=True)
categories.drop(1158, inplace=True)

In [None]:
# Drop duplicates of id 24: "respite care"
categories.drop(1071, inplace=True)

In [None]:
# Drop duplicates of id 25: "other help at home"
categories.drop(81, inplace=True)

In [None]:
# Drop duplicates of id 26: "clubs / social groups"
categories.drop(220, inplace=True)

In [None]:
# Drop duplicates of id 28: "social correspondence / email / etc."
categories.drop(226, inplace=True)
categories.drop(329, inplace=True)

In [None]:
# Drop duplicates of id 29: "home visits"
categories.drop(1320, inplace=True)

In [None]:
# Drop duplicates of id 30: "errands"
categories.drop(224, inplace=True)

In [None]:
# Drop duplicates of id 31: "phone calls"
categories.drop(68, inplace=True)
categories.drop(1381, inplace=True)

In [None]:
# Drop duplicates of id 32: "other companionship"
categories.drop(67, inplace=True)

In [None]:
# Drop duplicates of id 35: "fundraising"
categories.drop(57, inplace=True)

In [None]:
# Drop duplicates of id 36: "help our timebank"
categories.drop(1692, inplace=True)

In [None]:
# Drop duplicates of id 39: "other community activities"
categories.drop(60, inplace=True)

In [None]:
# Give more descriptive title for id 46
categories.at[137, 'category'] = 'Other Wellness'

In [None]:
# Drop duplicates of id 47: "books & videos"
categories.drop(455, inplace=True)

In [None]:
# Drop duplicates of id 51: "sports & games"
categories.drop(370, inplace=True)

In [None]:
# Drop duplicates of id 58: "legal / finances"
categories.drop(1390, inplace=True)

In [None]:
# Drop duplicates of id 60: "other education"
categories.drop(74, inplace=True)

In [None]:
# Drop duplicates of id 61: "classes"
categories.drop(1921, inplace=True)
categories.drop(1925, inplace=True)

In [None]:
# Drop duplicates of id 62: "arts & crafts"
categories.drop(1357, inplace=True)
categories.drop(1922, inplace=True)

In [None]:
# Drop duplicates of id 65: "photo & video"
categories.drop(398, inplace=True)

In [None]:
# Drop duplicates of id 66: "theater"
categories.drop(699, inplace=True)

In [None]:
# Drop duplicates of id 67: "arts misc"
categories.drop(1017, inplace=True)

In [None]:
# Drop duplicates of id 68: "car care"
categories.drop(1406, inplace=True)

In [None]:
# Drop duplicates of id 71: "garden, yard work"
categories.drop(354, inplace=True)
categories.drop(446, inplace=True)
categories.drop(953, inplace=True)
categories.drop(1274, inplace=True)

In [None]:
# Drop duplicates of id 76: "computer / it support"
categories.drop(10, inplace=True)
categories.drop(494, inplace=True)

In [None]:
# Drop duplicates of id 81: "other business services"
categories.drop(50, inplace=True)
categories.drop(905, inplace=True)

In [None]:
# Drop duplicates of id 85: "free stuff"
categories.drop(452, inplace=True)

In [None]:
# Drop duplicates of id 86: "group projects"
categories.drop(1375, inplace=True)

In [None]:
# Drop duplicates of id 88: "cooking/sewing"
categories.drop(1151, inplace=True)

In [None]:
categories[categories['id'] == 96]

In [None]:
# Give more descriptive title for id 46
categories.at[137, 'category'] = 'Other Wellness'

In [None]:
# Give more descriptive title for id 53
categories.at[120, 'category'] = 'Other Recreation'

In [None]:
# Give more descriptive title for id 67
categories.at[5, 'category'] = 'Other Arts, Crafts & Music'

In [None]:
# Give more descriptive title for id 74
categories.at[106, 'category'] = 'Other Home Care / Maintenance'

In [None]:
# Give more descriptive title for id 88
categories.at[863, 'category'] = 'Cooking / Sewing'