In [62]:
# Initialization
import urllib.request, urllib.parse, urllib.error, json, re, random
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests
import datetime as dt
from tqdm import tqdm

In [63]:
# Read in files from main scraping file. Change date in wd_string to confirm updates
wd_string = "../../../../output/actors/carbonn/02.25.19carbonn_"
summary_stats = pd.read_csv("%ssummary_stats.csv" % wd_string)
targets = pd.read_csv("%scommitments.csv" % wd_string)
adaptation_actions = pd.read_csv("%sadaptation_actions.csv" % wd_string)
mitigation_actions = pd.read_csv("%smitigation_actions.csv" % wd_string)
action_plans = pd.read_csv("%saction_plans.csv" % wd_string)
skipped_cities = pd.read_csv("%sskipped_cities.csv" % wd_string)
inventories = pd.read_csv("%sinventories.csv" % wd_string)
sector_breakdowns = pd.read_csv("%ssector_breakdowns.csv" % wd_string)

In [64]:
skipped_cities

Unnamed: 0.1,Unnamed: 0,city,url
0,0,"City of Santa Rosa, Laguna, Philippines",https://carbonn.org/city_profiles/City_of_Sant...
1,1,"Municipality of Donostia/San Sebastián, Spain",https://carbonn.org/city_profiles/Municipality...


In [65]:
# Initialize new files for writing the skipped cities to
summary_stats_skipped = pd.DataFrame()
targets_skipped = pd.DataFrame()
mitigation_actions_skipped = pd.DataFrame()
adaptation_actions_skipped =pd.DataFrame()
action_plans_skipped = pd.DataFrame()
inventories_skipped = pd.DataFrame()
sector_breakdowns_skipped = pd.DataFrame()
skipped_cities2 = pd.DataFrame()

In [66]:
# Define scraping functions
# function that takes in a url and returns the html soup

THROTTLE_TIME = 0.2

def getPage(link):
    time.sleep(THROTTLE_TIME * (random.random() * 2))
    result = ''
    while result == '':
        try:
            result = requests.get(url)
        except:
            print("Connection refused by the server, sleeping for 15 seconds...")
            time.sleep(15)
            print("Continuing...")
            continue
    if result.status_code == 200:
        response = result.content.decode('utf-8')
        response = BeautifulSoup(response, "html.parser")
        return response
    else:
        print("Request failed")
        return None
    
def subtype_ghg_commitments(commitment_string, ghg_subtype = None):
    if commitment_string.startswith('Community'):
        ghg_subtype = 'Community'
        # remove word 'Community' from commitment string
        commitment_string = commitment_string[len('Community'):]
    elif commitment_string.startswith('Local Government'):
        # remove phrase 'Local Government' from commitment string
        ghg_subtype = 'Local Government'
        commitment_string = commitment_string[len('Local Government'):]
    return (commitment_string, ghg_subtype)

def process_commitment_text(commitment_text_html):
    commitment = commitment_text_html.get_text()
    commitment = commitment.replace('\n', ' ')
    commitment = commitment.replace('\xa0', ' ')
    commitment = commitment.replace('.', '. ')
    commitment = commitment.strip()
    return commitment

In [68]:
# Run scraping loop for skipped cities
for city, row in tqdm(skipped_cities.iterrows()):
    name = row['city']
    url = row['url']
    url = url.split('&text')[0]
    soup = getPage(url)
    if soup == None:
        holder={}
        holder['city']=name
        holder['url']=url
        skipped_cities2 = skipped_cities2.append(holder, ignore_index=True)
        continue
    country = soup.find_all("strong", class_ = "subheading")[0].string


    # SUMMARY STATS

    holder = {}
    holder['city'] = name
    holder['country'] = country
    results = soup.find('div', id='tab1')
    if results is not None:
        counters = results.find_all(class_='counter')
        pop = counters[0].get_text()
        area = counters[1].get_text()
        gdp = counters[2].get_text()
        target = counters[3].get_text()
    holder['population'] = pop
    holder['area (km2)'] = area
    holder['gdp'] = gdp
    holder['target'] = target
    summary_stats_skipped = summary_stats_skipped.append(holder, ignore_index=True)

    # TARGETS

    results = soup.find('div', id='tab2')
    if results is not None:
        results = results.find_all('div', class_='frame')

        if results is not None and len(results) > 0:

            # GHG commitments
            holder={}
            
            # Baseline data
            charts = results[0].find_all('div', {'class': 'chart_div'})
            for chart_num, chart_data in enumerate(charts):
                if chart_data is not None:
                    chart_unit = chart_data['data-unit']
                    chart_data = eval(chart_data['data-chart'])
                    if chart_unit == 'CO2e':
                        holder['baseline_year'] = chart_data[0][0].replace('Base year ', '')
                        holder['target_year'] = chart_data[1][0].replace('Target year ', '')
                        holder['baseline_year_emissions'] = chart_data[0][1]
                        holder['target_year_emissions'] = chart_data[1][1]
                        holder['emissions_reduction_by_target_year'] = chart_data[1][2]
                        holder['percent_reduction'] = ""
                    else:
                        holder['baseline_year'] = chart_data[0][0].replace('Base year ', '')
                        holder['target_year'] = chart_data[1][0].replace('Target year ', '')
                        holder['percent_reduction'] = chart_data[1][2]


                    ghg_coms = results[0].find_all('div', class_='text')
                    com = ghg_coms[chart_num]
                    if com.get_text().strip() == '' or com.get_text().strip() == 'Community':
                        com = ghg_coms[chart_num + 1]
                    commitment = process_commitment_text(com)
                    
                    holder['city'] = name
                    holder['country'] = country
                    holder['commitment_type'] = 'GHG emission reduction target'
                    holder['commitment'], holder['ghg_subtype'] = subtype_ghg_commitments(commitment)
                    
                    targets_skipped = targets_skipped.append(holder, ignore_index=True)            
            
    # Seems like carbonn re-formatted website such that the commitments tab only shows things that are present.
    # Incorporate checks for length of soup before scraping
            # Renewable energy commitments
        if len(results) > 1:
            holder={}
            renewable_coms = results[1].find_all('div', class_='text')
            for com in renewable_coms:
                holder['city'] = name
                holder['country'] = country
                holder['commitment'] = process_commitment_text(com)
                holder['commitment_type'] = 'Renewable energy target'
                targets_skipped = targets_skipped.append(holder, ignore_index=True)

            # Energy efficiency commitments
        if len(results) > 2:
            holder={}
            energy_coms = results[2].find_all('div', class_='text')
            for com in energy_coms:
                holder['city'] = name
                holder['country'] = country
                holder['commitment'] = process_commitment_text(com)
                holder['commitment_type'] = 'Energy efficiency target'
                targets_skipped = targets_skipped.append(holder, ignore_index=True)

            # Other mitigation commitments
        if len(results) > 3:
            holder={}
            other_coms = results[3].find_all('div', class_='text')
            for com in other_coms:
                holder['city'] = name
                holder['country'] = country
                holder['commitment'] = process_commitment_text(com)
                holder['commitment_type'] = 'Other mitigation target'
                targets_skipped = targets_skipped.append(holder, ignore_index=True)

            # Adaptation & resilience commitments
        if len(results) > 4:
            holder={}
            adapt_coms = results[4].find_all('div', class_='text')
            for com in adapt_coms:
                holder['city'] = name
                holder['country'] = country
                holder['commitment'] = process_commitment_text(com)
                holder['commitment_type'] = 'Adaptation and resilience target'
                targets_skipped = targets_skipped.append(holder, ignore_index=True)

    # ACTIONS

    holder = {}
    # action plans
    results = soup.find('div', id='tab5')
    if results is not None:
        plans = results.find_all('div', class_='frame')
        for plan in plans:
            holder['city'] = name
            holder['country'] = country
            plan_text = plan.find('a', class_='opener').get_text()
            plan_text = name.replace('\n', '')
            plan_text = name.replace('                ', '') # I think we can remove and just use .strip() -- andrew
            plan_text = name.replace('              ', '')
            plan_text = name.strip()
            holder['plan_name'] = plan_text
            text= plan.li.get_text()
            year = text.split(': ')[1]
            holder['start year'] = year
            text2= plan.li.next_sibling.next_sibling.get_text()
            type_ = text2.split(': ')[1]
            holder['plan_type'] = type_
            action_plans_skipped = action_plans_skipped.append(holder, ignore_index=True)

    holder = {}
    # mitigation actions
    results = soup.find('div', id='tab6')
    if results is not None:
        actions = results.find_all('div', class_='frame')
        for action in actions:
            holder['city'] = name
            holder['country'] = country
            action_name = action.find('a', class_='opener').get_text()
            holder['name']=action_name
            text=action.li.get_text()
            year = text.split(': ')[1]
            holder['start year'] = year
            text2=action.li.next_sibling.next_sibling.get_text()
            type_ = text2.split(': ')[1]
            holder['type'] = type_
            text3=action.li.next_sibling.next_sibling.next_sibling.next_sibling.get_text()
            status = text3.split(': ')[1]
            holder['status'] = status
            description = action.find('div', class_='text-holder').p.get_text()
            holder['description']=description
            text4 = action.find('ul', class_='sectors-list').get_text()
            mitigation_actions_skipped = mitigation_actions_skipped.append(holder, ignore_index=True)

    holder = {}
    # adaptation actions
    results = soup.find('div', id='tab7')
    if results is not None:
        actions = results.find_all('div', class_='frame')
        for action in actions:
            holder['city'] = name
            holder['country'] = country
            action_name = action.find('a', class_='opener').get_text()
            holder['action']=action_name
            text=action.li.get_text()
            year = text.split(': ')[1]
            holder['start year'] = year
            text2=action.li.next_sibling.next_sibling.get_text()
            type_ = text2.split(': ')[1]
            holder['type'] = type_
            text3=action.li.next_sibling.next_sibling.next_sibling.next_sibling.get_text()
            status = text3.split(': ')[1]
            holder['status'] = status
            description = action.find('div', class_='text-holder').p.get_text()
            holder['description']=description
            adaptation_actions_skipped = adaptation_actions_skipped.append(holder, ignore_index=True)
            
    holder = {}
    # GHG Inventories
    results = soup.find('div', id='tab8')
    if results is not None:
        actions = results.find_all('div', class_='frame')
        for action in actions:
            holder['city'] = name
            holder['country'] = country
            inventory_type = action.find('a', class_='opener').get_text()
            holder['inventory_type'] = inventory_type
            description = action.p.get_text()
#             holder['description'] = description
            graph_title = action.find('strong', class_='graph-title').get_text()
#             holder['graph_title'] = graph_title
            chart_data_all = action.find_all('div', class_='chart_div')
            for chart_data in chart_data_all:
                holder2 = {}
                chart_type = chart_data.get('data-chart-type')
#                 holder[chart_type] = chart_type
                chart_id = chart_data.get('id')
#                 holder[chart_id] = chart_id
                if 'inventory' in chart_type.lower():
                    for entry in json.loads(chart_data.get('data-chart')):
                        if entry:
                            year, emissions = entry[0], entry[1]
                            holder2['year'] = year
                            holder2['emissions'] = emissions
                            holder2.update(holder)
                            inventories = inventories.append(holder2, ignore_index=True)
                elif 'latest' in chart_type.lower():
                    # Use if you want sector emissions in one column with a dictionary of breakdown by sector
                    # holder['sector_emissions'] = {k:v for k,v in json.loads(chart_data.get('data-chart'))}
                    # Use if you want each sector emissions in their own column
                    if not ['N/A', 0] in json.loads(chart_data.get('data-chart')):
                        [holder2.update({sector:emissions}) for sector, emissions in json.loads(chart_data.get('data-chart'))]
                        holder2.update(holder)
                        sector_breakdowns_skipped = sector_breakdowns_skipped.append(holder2, ignore_index=True)
                else:
                    print('Warning: A GHG emissions chart is not in a known format')
                    print('{}, {}'.format(country, name))
                    print(json.loads(chart_data.get('data-chart')))
                    

1it [00:00,  1.18it/s]

Request failed


2it [00:01,  1.36it/s]


In [70]:
summary_stats_skipped

Unnamed: 0,area (km2),city,country,gdp,population,target
0,55.0,"City of Santa Rosa, Laguna, Philippines",Philippines,,284670,Reduce 30% GHG emissions


In [71]:
# Reorder columns, remove rows with blank entries
targets_skipped = targets_skipped[['city', 'country', 'commitment_type','commitment', 'ghg_subtype',
         'baseline_year', 'baseline_year_emissions', 'target_year', 'target_year_emissions', 
         'percent_reduction', 'emissions_reduction_by_target_year']]
targets_skipped = targets_skipped.replace(to_replace='', value=pd.np.nan)
targets_skipped = targets_skipped.dropna(how='all', axis=0, subset=['commitment', 'ghg_subtype',
       'baseline_year', 'baseline_year_emissions', 'target_year',
       'target_year_emissions', 'percent_reduction',
       'emissions_reduction_by_target_year'])

In [72]:
# Append the scrapped results from the skipped cities
summary_stats = summary_stats.append(summary_stats_skipped, ignore_index = True)
targets = targets.append(targets_skipped, ignore_index = True)
adaptation_actions = adaptation_actions.append(adaptation_actions_skipped, ignore_index = True)
mitigation_actions = mitigation_actions.append(mitigation_actions_skipped, ignore_index = True)
action_plans = action_plans.append(action_plans_skipped, ignore_index = True)
skipped_cities = skipped_cities2
inventories = inventories.append(inventories_skipped, ignore_index = True)
sector_breakdowns = sector_breakdowns.append(sector_breakdowns_skipped, ignore_index = True)

In [74]:
len(summary_stats)

760

In [75]:
# Write out the new data
date = dt.datetime.today().strftime("%m.%d.%y")
summary_stats.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_summary_stats.csv', encoding = "utf-8")
targets.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_commitments.csv', encoding = "utf-8")
adaptation_actions.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_adaptation_actions.csv', encoding = "utf-8")
mitigation_actions.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_mitigation_actions.csv', encoding = "utf-8")
action_plans.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_action_plans.csv', encoding = "utf-8")
skipped_cities.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_skipped_cities.csv', encoding = "utf-8")
inventories.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_inventories.csv', encoding = "utf-8")
sector_breakdowns.to_csv('../../../../output/actors/carbonn/'+date+'carbonn_sector_breakdowns.csv', encoding = "utf-8")