In [1]:
import urllib.request, urllib.parse, urllib.error, json, re, random
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests
import datetime as dt
from tqdm import tqdm

In [1]:
from platform import python_version
print(python_version())

3.6.5


In [2]:
# function that takes in a url and returns the html soup

THROTTLE_TIME = 0.2

def getPage(link):
    time.sleep(THROTTLE_TIME * (random.random() * 2))
    result = ''
    while result == '':
        try:
            result = requests.get(url)
        except:
            print("Connection refused by the server, sleeping for 15 seconds...")
            time.sleep(15)
            print("Continuing...")
            continue
    if result.status_code == 200:
        response = result.content.decode('utf-8')
        response = BeautifulSoup(response, "html.parser")
        return response
    else:
        print("Request failed")
        return None

In [3]:
# Starting the slow process of breaking down scraper into component functions. Add new functions to this cell.

def subtype_ghg_commitments(commitment_string, ghg_subtype = None):
    if commitment_string.startswith('Community'):
        ghg_subtype = 'Community'
        # remove word 'Community' from commitment string
        commitment_string = commitment_string[len('Community'):]
    elif commitment_string.startswith('Local Government'):
        # remove phrase 'Local Government' from commitment string
        ghg_subtype = 'Local Government'
        commitment_string = commitment_string[len('Local Government'):]
    return (commitment_string, ghg_subtype)

def process_commitment_text(commitment_text_html):
    commitment = commitment_text_html.get_text()
    commitment = commitment.replace('\n', ' ')
    commitment = commitment.replace('\xa0', ' ')
    commitment = commitment.replace('.', '. ')
    commitment = commitment.strip()
    return commitment
    

In [4]:
# initialize dataframes that will eventually output as csvs
summary_stats = pd.DataFrame()
targets = pd.DataFrame()
mitigation_actions = pd.DataFrame()
adaptation_actions =pd.DataFrame()
action_plans = pd.DataFrame()
inventories = pd.DataFrame()
sector_breakdowns = pd.DataFrame()
skipped_cities = pd.DataFrame()

In [4]:
url = 'https://carbonn.org/entities?page='
p = BeautifulSoup(requests.get(url + '1').content, 'lxml')
n_pages = p.find('span', {'class': 'results-counter'}).text.replace(' registered entities', '')
n_pages = int(n_pages) // 20

In [6]:
cities = []
for i in tqdm(range(1,n_pages+1)):
    p = requests.get(url + str(i)).content
    p = BeautifulSoup(p, 'lxml')
    for lf in p.find_all('div', {'class': 'list-frame'}):
        temp = {}
        name = lf.find('a', {'class': 'open'}).text.split(', ')
        temp['country'] = name[-1]
        temp['name'] = ', '.join(name)
        temp['city_profile_url'] = 'https://carbonn.org' + lf.find('a', {'class': 'btn'})['href']
        cities.append(temp)

100%|██████████| 38/38 [1:11:29<00:00, 125.16s/it]


In [7]:
# read in output of first scraper, which will give entity names and city profile URLs
# cities = pd.read_csv('../../../../output/actors/carbonn/carbonn.csv')
# cities = cities.rename(columns={'city': 'name'})
cities = pd.DataFrame(cities)
# cities.to_csv('../../../../output/actors/carbonn/2019carbonn_cities.csv')

In [None]:
# main scraping loop -- currently takes FOREVER
for city, row in tqdm(cities.iterrows()):
    name = row['name']
    country = row['country']
    url = row['city_profile_url']
    url = url.split('&text')[0]
    soup = getPage(url)
    if soup == None:
        holder={}
        holder['city']=name
        holder['url']=url
        skipped_cities = skipped_cities.append(holder, ignore_index=True)
        continue


    # SUMMARY STATS

    holder = {}
    holder['city'] = name
    holder['country'] = country
    results = soup.find('div', id='tab1')
    if results is not None:
        counters = results.find_all(class_='counter')
        pop = counters[0].get_text()
        area = counters[1].get_text()
        gdp = counters[2].get_text()
        target = counters[3].get_text()
    holder['population'] = pop
    holder['area (km2)'] = area
    holder['gdp'] = gdp
    holder['target'] = target
    summary_stats = summary_stats.append(holder, ignore_index=True)

    # TARGETS

    results = soup.find('div', id='tab2')
    if results is not None:
        results = results.find_all('div', class_='frame')

        if results is not None and len(results) > 0:

            # GHG commitments
            holder={}
            
            # Baseline data
            charts = results[0].find_all('div', {'class': 'chart_div'})
            for chart_num, chart_data in enumerate(charts):
                if chart_data is not None:
                    chart_unit = chart_data['data-unit']
                    chart_data = eval(chart_data['data-chart'])
                    if chart_unit == 'CO2e':
                        holder['baseline_year'] = chart_data[0][0].replace('Base year ', '')
                        holder['target_year'] = chart_data[1][0].replace('Target year ', '')
                        holder['baseline_year_emissions'] = chart_data[0][1]
                        holder['target_year_emissions'] = chart_data[1][1]
                        holder['emissions_reduction_by_target_year'] = chart_data[1][2]
                    else:
                        holder['baseline_year'] = chart_data[0][0].replace('Base year ', '')
                        holder['target_year'] = chart_data[1][0].replace('Target year ', '')
                        holder['percent_reduction'] = chart_data[1][2]


                    ghg_coms = results[0].find_all('div', class_='text')
                    com = ghg_coms[chart_num]
                    if com.get_text().strip() == '' or com.get_text().strip() == 'Community':
                        com = ghg_coms[chart_num + 1]
                    commitment = process_commitment_text(com)
                    
                    holder['city'] = name
                    holder['country'] = country
                    holder['commitment_type'] = 'GHG emission reduction target'
                    holder['commitment'], holder['ghg_subtype'] = subtype_ghg_commitments(commitment)
                    
                    targets = targets.append(holder, ignore_index=True)            
            
    # Seems like carbonn re-formatted website such that the commitments tab only shows things that are present.
    # Incorporate checks for length of soup before scraping
            # Renewable energy commitments
        if len(results) > 1:
            holder={}
            renewable_coms = results[1].find_all('div', class_='text')
            for com in renewable_coms:
                holder['city'] = name
                holder['country'] = country
                holder['commitment'] = process_commitment_text(com)
                holder['commitment_type'] = 'Renewable energy target'
                targets = targets.append(holder, ignore_index=True)

            # Energy efficiency commitments
        if len(results) > 2:
            holder={}
            energy_coms = results[2].find_all('div', class_='text')
            for com in energy_coms:
                holder['city'] = name
                holder['country'] = country
                holder['commitment'] = process_commitment_text(com)
                holder['commitment_type'] = 'Energy efficiency target'
                targets = targets.append(holder, ignore_index=True)

            # Other mitigation commitments
        if len(results) > 3:
            holder={}
            other_coms = results[3].find_all('div', class_='text')
            for com in other_coms:
                holder['city'] = name
                holder['country'] = country
                holder['commitment'] = process_commitment_text(com)
                holder['commitment_type'] = 'Other mitigation target'
                targets = targets.append(holder, ignore_index=True)

            # Adaptation & resilience commitments
        if len(results) > 4:
            holder={}
            adapt_coms = results[4].find_all('div', class_='text')
            for com in adapt_coms:
                holder['city'] = name
                holder['country'] = country
                holder['commitment'] = process_commitment_text(com)
                holder['commitment_type'] = 'Adaptation and resilience target'
                targets = targets.append(holder, ignore_index=True)

    # ACTIONS

    holder = {}
    # action plans
    results = soup.find('div', id='tab5')
    if results is not None:
        plans = results.find_all('div', class_='frame')
        for plan in plans:
            holder['city'] = name
            holder['country'] = country
            plan_text = plan.find('a', class_='opener').get_text()
            plan_text = plan_text.replace('\n', '')
            plan_text = plan_text.strip()
            holder['plan_name'] = plan_text
            text= plan.li.get_text()
            year = text.split(': ')[1]
            holder['start year'] = year
            text2= plan.li.next_sibling.next_sibling.get_text()
            type_ = text2.split(': ')[1]
            holder['plan_type'] = type_
            action_plans = action_plans.append(holder, ignore_index=True)

    holder = {}
    # mitigation actions
    results = soup.find('div', id='tab6')
    if results is not None:
        actions = results.find_all('div', class_='frame')
        for action in actions:
            holder['city'] = name
            holder['country'] = country
            action_name = action.find('a', class_='opener').get_text()
            holder['name']=action_name
            text=action.li.get_text()
            year = text.split(': ')[1]
            holder['start year'] = year
            text2=action.li.next_sibling.next_sibling.get_text()
            type_ = text2.split(': ')[1]
            holder['type'] = type_
            text3=action.li.next_sibling.next_sibling.next_sibling.next_sibling.get_text()
            status = text3.split(': ')[1]
            holder['status'] = status
            description = action.find('div', class_='text-holder').p.get_text()
            holder['description']=description
            text4 = action.find('ul', class_='sectors-list').get_text()
            mitigation_actions = mitigation_actions.append(holder, ignore_index=True)

    holder = {}
    # adaptation actions
    results = soup.find('div', id='tab7')
    if results is not None:
        actions = results.find_all('div', class_='frame')
        for action in actions:
            holder['city'] = name
            holder['country'] = country
            action_name = action.find('a', class_='opener').get_text()
            holder['action']=action_name
            text=action.li.get_text()
            year = text.split(': ')[1]
            holder['start year'] = year
            text2=action.li.next_sibling.next_sibling.get_text()
            type_ = text2.split(': ')[1]
            holder['type'] = type_
            text3=action.li.next_sibling.next_sibling.next_sibling.next_sibling.get_text()
            status = text3.split(': ')[1]
            holder['status'] = status
            description = action.find('div', class_='text-holder').p.get_text()
            holder['description']=description
            adaptation_actions = adaptation_actions.append(holder, ignore_index=True)
            
    holder = {}
    # GHG Inventories
    results = soup.find('div', id='tab8')
    if results is not None:
        actions = results.find_all('div', class_='frame')
        for action in actions:
            holder['city'] = name
            holder['country'] = country
            inventory_type = action.find('a', class_='opener').get_text()
            holder['inventory_type'] = inventory_type
            description = action.p.get_text()
#             holder['description'] = description
            graph_title = action.find('strong', class_='graph-title').get_text()
#             holder['graph_title'] = graph_title
            chart_data_all = action.find_all('div', class_='chart_div')
            for chart_data in chart_data_all:
                holder2 = {}
                chart_type = chart_data.get('data-chart-type')
#                 holder[chart_type] = chart_type
                chart_id = chart_data.get('id')
#                 holder[chart_id] = chart_id
                if 'inventory' in chart_type.lower():
                    for entry in json.loads(chart_data.get('data-chart')):
                        if entry:
                            year, emissions = entry[0], entry[1]
                            holder2['year'] = year
                            holder2['emissions'] = emissions
                            holder2.update(holder)
                            inventories = inventories.append(holder2, ignore_index=True)
                elif 'latest' in chart_type.lower():
                    # Use if you want sector emissions in one column with a dictionary of breakdown by sector
                    # holder['sector_emissions'] = {k:v for k,v in json.loads(chart_data.get('data-chart'))}
                    # Use if you want each sector emissions in their own column
                    if not ['N/A', 0] in json.loads(chart_data.get('data-chart')):
                        [holder2.update({sector:emissions}) for sector, emissions in json.loads(chart_data.get('data-chart'))]
                        holder2.update(holder)
                        sector_breakdowns = sector_breakdowns.append(holder2, ignore_index=True)
                else:
                    print('Warning: A GHG emissions chart is not in a known format')
                    print('{}, {}'.format(country, name))
                    print(json.loads(chart_data.get('data-chart')))

In [None]:
# Reorder columns, remove rows with blank entries
targets = targets.replace(to_replace='', value=pd.np.nan)
targets = targets.dropna(how='all', axis=0, subset=['commitment', 'ghg_subtype',
       'baseline_year', 'baseline_year_emissions', 'target_year',
       'target_year_emissions', 'percent_reduction',
       'emissions_reduction_by_target_year'])

In [None]:
# Do some data checks
skipped_cities # 2 skipped cities 
len(pd.unique(summary_stats['city'])) # Total of 759 unique cities scrapped. Carbonn website says 764
len(pd.unique(mitigation_actions['city'])) # 444 cities with mitigation actions
mitigation_actions.shape # 5514 total mitigation actions
adaptation_actions.shape # 1620 total adaptation actions
len(pd.unique(adaptation_actions['city'])) # 351 cities with adaptation actions
inventories.shape # 1591 inventories reported 
len(pd.unique(inventories['city'])) # 546 unique cities with inventories 
targets.shape # 767 targets 
len(pd.unique(targets['city'])) # 432 unique cities with targets

In [34]:
skipped_cities

Unnamed: 0.1,Unnamed: 0,city,url
0,0,"City of Santa Rosa, Laguna, Philippines",https://carbonn.org/city_profiles/City_of_Sant...
1,1,"Municipality of Donostia/San Sebastián, Spain",https://carbonn.org/city_profiles/Municipality...


In [14]:
# output dataframes to csv
date = dt.datetime.today().strftime("%m.%d.%y")
summary_stats.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_summary_stats.csv', encoding = "utf-8")
targets.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_commitments.csv', encoding = "utf-8")
adaptation_actions.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_adaptation_actions.csv', encoding = "utf-8")
mitigation_actions.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_mitigation_actions.csv', encoding = "utf-8")
action_plans.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_action_plans.csv', encoding = "utf-8")
skipped_cities.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_skipped_cities.csv', encoding = "utf-8")
inventories.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_inventories.csv', encoding = "utf-8")
sector_breakdowns.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_sector_breakdowns.csv', encoding = "utf-8")