In [1]:
import pandas as pd
import nltk
import re
from itertools import groupby

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/morganedaniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/morganedaniel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
bdd_sbt_df = pd.read_csv("BDD_SBT_from_excel.csv")

In [4]:
bdd_sbt_df.head()


Unnamed: 0,Company Name,ISIN,Near term - Target Status,Near term - Target Classification,Near term - Target Year,Long term - Target Status,Long term - Target Classification,Long term - Target Year,Net-Zero Committed,Net-Zero Year,Organization Type,BA1.5?,BA1.5 Date,Country,Region,Sector,Date,Target,Target Classification,Extension
0,Hongkong Land Holdings Limited,BMG4587L1090,Committed,,,,,,No,,Company,No,,Hong Kong,Asia,Real Estate,01/02/2022,,,
1,Iress Limited,,Committed,,,,,,No,,Company,No,,Australia,Oceania,Software and Services,01/02/2022,,,
2,Jacuzzi Brands,,Committed,,,,,,No,,Company,No,,United States of America (USA),North America,"Consumer Durables, Household and Personal Prod...",01/02/2022,,,
3,Klimasan AŞ,,Committed,,,,,,No,,Company,No,,Turkey,Asia,"Consumer Durables, Household and Personal Prod...",01/02/2022,,,
4,Mavi,TREMAVI00037,Committed,,,,,,No,,Company,No,,Turkey,Asia,Retailing,01/02/2022,,,


In [5]:
companies_with_target_df = bdd_sbt_df[bdd_sbt_df['Target'].notna()]

In [254]:
def refine_perimeter(perimeter_group):
    """
    looking for another target goal in the perimeter group
    eg: perimeter_group="emissions 50% by 2030 and" -> <e want to extract the new reduction_percentage and target_year
    """

    reg = r'([0-9]+.?[0-9]+?\ ?%) by (f?y?[0-9]+).*and'
    sub_group = re.search(reg, perimeter_group)
    if sub_group:
        return {"reduction_percentage": sub_group.groups()[0], "target_year": sub_group.groups()[1], "perimeter": re.sub(reg, "", perimeter_group) }
    

In [265]:
def which_scopes(scope_str):
    """
    find which scopes are related to the perimeters found in scope_str
    eg: Extract scopes from: "scope 1, 2 and 3" or "scope 1 and scope 2"
    """
    scope_tags = nltk.pos_tag(nltk.word_tokenize(scope_str))

    grammar =""" SCOPES: {<JJ>?<NN><CD><CD>?<CC><VB>?<CD>}
                         {<JJ>?<NN><CD><CD><CC><CD>}
                         {<JJ>?<VBZ><CD><CD><CC><CD>}
                         {<JJ>?<VBZ><CD><CC><VB>?<CD>}
                         {<JJ>?<NN>?<NN><CD>}
                         {<JJ>?<NN>?<VB.*><CD>}
                         {<JJ>?<NNS><CD>}
            """
    cp = nltk.RegexpParser(grammar)
    result = cp.parse(scope_tags)

    all_scopes = get_scopes_from_subtrees(result)
    return all_scopes

def get_scopes_from_subtrees(result):
    scopes = []
    for subtree in result.subtrees():
        if subtree.label() == 'SCOPES':
            scopes.append([token for token, tag in subtree.leaves() if tag == 'CD'])
    return scopes
            

In [266]:
def aggregate_all_target_info(row_to_list, scopes, target_years, reduction_percentages, perimeter):
    target_infos = []
    for scope in scopes:
        for reduction_percentage, target_year in zip(reduction_percentages, target_years):   
            #print(f"scope: {scope}, reduction_percentage: {reduction_percentage}, target_year: {target_year}, base_year: {base_year}, perimeter: {perimeter}")
            target_infos.append(row_to_list+[scope, reduction_percentage, target_year, base_year, perimeter])
    return target_infos
    

In [271]:
count_not_found = 0
target_info = []
target_info_not_found = [np.nan, np.nan, np.nan, np.nan, np.nan]

for index, row in  companies_with_target_df.iterrows():
    row_to_list = [index]+row.values.tolist()
    found_for_row = False
    if index !=318:
        continue

    pattern = r'(reduce .*scope[a-z]?\s?[0-9].* g?h?g?\s?emissions.* [0-9]+.?[0-9]+?\s?%.* by f?y?\s?[0-9]+/?[0-9]+? from a?n?\s?f?y?\s?[0-9]+/?[0-9]+? base)'

    # Split the target into sentences
    sentences = nltk.sent_tokenize(row['Target'])

    for sentence in sentences:
        sentence = sentence.replace(',','').lower()
        results = re.findall(pattern, sentence)

        if not results:
            continue

        for result in results:
            results_groups = re.search(r'reduce (.*)(scope[a-z]?\s?[0-9].*) (g?h?g?\s?emissions.*) ([0-9]+.?[0-9]+?\s?%).* by f?y?\s?([0-9]+/?[0-9]+?) from a?n?\s?f?y?\s?([0-9]+/?[0-9]+?) base', result)
            if not results_groups:
                count_not_found += 1
                target_info.append(row_to_list+target_info_not_found)
                print(f"Clean Data at index {index},: {result}")
                continue

            reduction_percentages = [results_groups.groups()[3]]
            target_years = [results_groups.groups()[4]]
            base_year = results_groups.groups()[5]
            perimeter_group = results_groups.groups()[2]
    
            new_perimeter = refine_perimeter(perimeter_group)
            if perimeter_group == "ghg emissions" or not new_perimeter:
                perimeter = perimeter_group
            else: #Another target and perimeter has been found!
                target_years.insert(0, new_perimeter["target_year"])
                reduction_percentages.insert(0, new_perimeter["reduction_percentage"])
                perimeter = new_perimeter["perimeter"]
            
            scope_str_to_analyse = " ".join([results_groups.groups()[0], results_groups.groups()[1]])
            all_scopes = which_scopes(scope_str_to_analyse)

            if not all_scopes:
                count_not_found += 1
                print(f"Scope not found at index {index}, {scope_str_to_analyse}")
                target_info.append(row_to_list+target_info_not_found)
                continue

            if len(all_scopes) == 1:
                scopes = all_scopes[0]

            if len(all_scopes) == 3: # 
                """
                Another target was found ! 
                eg: if format of scope was "scope 1 and 2 30% and scope 3"
                Output of all_scopes is [[1,2], [30], [3]]
                """
                scopes = all_scopes[-1]
                target_info += aggregate_all_target_info(row_to_list, all_scopes[0], target_years, all_scopes[1], perimeter)
            
            found_for_row = True
            target_info += aggregate_all_target_info(row_to_list, scopes, target_years, reduction_percentages, perimeter)

    if not found_for_row:
        count_not_found += 1
        print(f"Target not analysed at index {index}:\n{row['Target']}\n")
        target_info.append(row_to_list+target_info_not_found)
    

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Target not analysed at index 318:
Asia Cement Corporation commits to reduce scope 1 and 2 GHG emissions 8% per ton of cementitious materials by 2025 from a 2019 base year. *The target boundary includes biogenic emissions and removals from bioenergy feedstocks.



In [258]:
count_not_found

54

In [241]:
df_with_target_info = pd.DataFrame(target_info, columns = ['row_id']+list(companies_with_target_df.columns)+["scope", "reduction_percentage", "target_year", "base_year", "perimeter"])

In [242]:
df_with_target_info

Unnamed: 0,row_id,Company Name,ISIN,Near term - Target Status,Near term - Target Classification,Near term - Target Year,Long term - Target Status,Long term - Target Classification,Long term - Target Year,Net-Zero Committed,...,Sector,Date,Target,Target Classification,Extension,scope,reduction_percentage,target_year,base_year,perimeter
0,14,(ACIP) Alexandria Company for Industrial Packages,,Targets Set,Well-below 2°C,2030,,,,No,...,Containers and Packaging,01/02/2022,This target was approved using a streamlined t...,The targets covering greenhouse gas emissions ...,,1,25%,2030,2020,emissions
1,14,(ACIP) Alexandria Company for Industrial Packages,,Targets Set,Well-below 2°C,2030,,,,No,...,Containers and Packaging,01/02/2022,This target was approved using a streamlined t...,The targets covering greenhouse gas emissions ...,,2,25%,2030,2020,emissions
2,15,AM Værktøj Odense A/S,,Targets Set,1.5°C,2030,,,,No,...,Technology Hardware and Equipment,01/02/2022,This target was approved using a streamlined t...,The targets covering greenhouse gas emissions ...,,1,50%,2030,2018,emissions
3,15,AM Værktøj Odense A/S,,Targets Set,1.5°C,2030,,,,No,...,Technology Hardware and Equipment,01/02/2022,This target was approved using a streamlined t...,The targets covering greenhouse gas emissions ...,,2,50%,2030,2018,emissions
4,16,Julie Sandlau Vietnam. ltd,,Targets Set,1.5°C,2030,,,,No,...,"Textiles, Apparel, Footwear and Luxury Goods",01/02/2022,This target was approved using a streamlined t...,The targets covering greenhouse gas emissions ...,,1,42%,2030,2020,emissions
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,2649,U&We AB,,Targets Set,1.5°C,2030,,,,No,...,Professional Services,01/01/2022,This target was approved using a streamlined t...,The targets covering greenhouse gas emissions ...,,2,46%,2030,2019,emissions
3048,2650,WindowMaster International A/S,,Targets Set,1.5°C,2030,,,,No,...,Building Products,01/01/2022,This target was approved using a streamlined t...,The targets covering greenhouse gas emissions ...,,1,46%,2030,2019,emissions
3049,2650,WindowMaster International A/S,,Targets Set,1.5°C,2030,,,,No,...,Building Products,01/01/2022,This target was approved using a streamlined t...,The targets covering greenhouse gas emissions ...,,2,46%,2030,2019,emissions
3050,2651,Höegh Autoliners,,Targets Set,Well-below 2°C,2030,,,,No,...,Water Transportation - Water Transportation,01/01/2022,This target was approved using a streamlined t...,The targets covering greenhouse gas emissions ...,,1,30%,2030,2018,emissions


In [243]:
df_with_target_info.to_csv("bdd_sbt_with_target_Morgane_v2.csv")