In [1]:
"""
This file goes through the formulas and replaces the cell references with generic variables.
For example, G13-G12 becomes a-b.
"""

'\nThis file goes through the formulas and replaces the cell references with generic variables.\nFor example, G13-G12 becomes a-b.\n'

In [2]:
import pandas as pd
import os
import re



In [3]:
# Directory (relative path) that holds the input CSV files.
IN_DIR = "data/"

In [4]:
# Bare file names of the input CSV files; they are joined with IN_DIR to form full paths.
f = ["Ch03-Oil.csv", "Ch04-Gas.csv", "Ch05-Coal.csv", "Ch08-Demand_and_India.csv", "Ch9.csv", "Ch9_4.csv", "Ch9_PH.csv"]

In [5]:
# Path of every input file: IN_DIR joined with each bare file name in f.
files = [os.path.join(IN_DIR, file_name) for file_name in f]

In [6]:
files

['data/Ch03-Oil.csv',
 'data/Ch04-Gas.csv',
 'data/Ch05-Coal.csv',
 'data/Ch08-Demand_and_India.csv',
 'data/Ch9.csv',
 'data/Ch9_4.csv',
 'data/Ch9_PH.csv']

In [7]:
# Pattern for a spreadsheet cell reference: one or more uppercase letters followed by
# one or more digits (e.g. D11). References written with "$" (e.g. $D$11) are not
# matched, and any other uppercase name that ends in digits is matched as well.
formula_regex =  "[A-Z]+[0-9]+"

In [8]:
re.findall(formula_regex, "SUM(D11:D16)/D11")

['D11', 'D16', 'D11']

In [9]:
# Alphabet from which the generic variable names (a, b, c, ...) are drawn.
variables = "abcdefghijklmnopqrstuvwxyz"

In [10]:
# One list entry per letter of the alphabet string.
variables_list = list(variables)

In [11]:
def get_variables_for_formula(cell_references, variables):
    """
    Map every distinct cell reference to a generic variable name.

    Names are handed out in order of first appearance. Once the supplied names are
    used up, numbering continues spreadsheet-column style (a..z, aa, ab, ...), so a
    formula with more distinct references than available names no longer raises
    an IndexError.

    :param cell_references: list with the cell references of a formula, duplicates allowed
                            (example: ["G11", "G14", "G22", "G11"])
    :param variables: sequence of the available names (e.g. the letters a-z)
    :returns: dict that maps each distinct cell reference to the variable that should
              replace it in the formula; for the example above: {"G11": "a", "G14": "b", "G22": "c"}
    :raises ValueError: if there are references to name but ``variables`` is empty
    """
    # dict.fromkeys drops duplicates while keeping the order of first appearance.
    unique_refs = list(dict.fromkeys(cell_references))
    if not unique_refs:
        return {}

    base = len(variables)
    if base == 0:
        raise ValueError("variables must contain at least one name")

    def name_for(position):
        # Bijective base-N numbering: positions 0..N-1 give the plain names,
        # position N gives the first two-symbol name, and so on.
        name = ""
        remaining = position + 1
        while remaining > 0:
            remaining, digit = divmod(remaining - 1, base)
            name = variables[digit] + name
        return name

    return {ref: name_for(position) for position, ref in enumerate(unique_refs)}

In [12]:
def replace_claim_with_variables(formula, ref_var_dict):
    """
    Replace the cell references in a formula with their generic variables.

    All references are substituted in a single pass and only as whole tokens, so a
    reference that is a substring of another one (G1 inside G11, G11 inside AG11)
    cannot corrupt the longer reference.

    :param formula: str of the formula (example: G11-G21/3)
    :param ref_var_dict: dict whose keys are the cell references (that exist in the formula)
                         and whose values are the variables that should replace them
    :returns: the formula with every mapped cell reference replaced by its variable
    """
    if not ref_var_dict:
        # An empty alternation would match the empty string at every position.
        return formula

    # Longest keys first so the alternation prefers G11 over G1 at the same position.
    longest_first = sorted(ref_var_dict, key=len, reverse=True)
    alternatives = "|".join(re.escape(ref) for ref in longest_first)
    # The look-arounds reject matches that sit inside a longer reference: no uppercase
    # letter directly before the match, and no digit directly after it.
    pattern = re.compile(r"(?<![A-Z])(?:" + alternatives + r")(?![0-9])")
    return pattern.sub(lambda match: ref_var_dict[match.group(0)], formula)

In [13]:
def remove_white_space(s):
    """
    Return ``s`` with every space character (" ") removed.

    Only the plain space is stripped; tabs and line breaks are left in place.
    """
    pieces = s.split(" ")
    return "".join(pieces)

In [14]:
d = get_variables_for_formula(["G11", "G12", "G11"], variables_list)

In [15]:
d

{'G11': 'a', 'G12': 'b'}

In [16]:
replace_claim_with_variables("G11 - G12/G11", d).replace(" ", "")

'a-b/a'

In [17]:
def cleanup_df(row):
    """
    Decide whether a row should be kept for template extraction.

    A row is dropped (False) when
      * its published value or its equation contains "LOOKUP",
      * its equation contains no cell reference at all, or
      * its published value itself contains a cell reference.
    Every other row is kept (True).

    :param row: mapping / pandas row with the keys "Published value" and
                "Author Calculation Equations"
    :returns: bool, True if the row should be kept
    """
    published = row["Published value"]
    equation = row["Author Calculation Equations"]

    # pandas.read_csv yields NaN (a float) for empty cells and numbers for purely
    # numeric columns; the substring and regex checks below need text. A non-string
    # value holds neither "LOOKUP" nor a cell reference, so treat it as empty text.
    if not isinstance(published, str):
        published = ""
    if not isinstance(equation, str):
        equation = ""

    if "LOOKUP" in published or "LOOKUP" in equation:
        return False
    if re.search(formula_regex, equation) is None:
        # nothing to turn into a template
        return False
    if re.search(formula_regex, published) is not None:
        return False
    return True

In [18]:
def substitute_claims_with_vars(row):
    """
    Build the template formula for one row.

    Reads the row's "Author Calculation Equations" entry, maps its cell references
    to generic variables, substitutes them and strips the spaces from the result.

    :param row: mapping / pandas row with the key "Author Calculation Equations"
    :returns: str, the template formula
    """
    equation = row["Author Calculation Equations"]
    # All cell references in order of appearance, duplicates included (G12, G1, ...).
    refs_in_order = re.findall(formula_regex, equation)
    mapping = get_variables_for_formula(refs_in_order, variables_list)
    substituted = replace_claim_with_variables(equation, mapping)
    return remove_white_space(substituted)

In [19]:
# read input files and get the template_formulas from each and merge everything together

In [20]:
# Columns taken from every input file; all other columns are dropped.
cols = ["Text", "Published value", "Author Calculation Equations"]

In [21]:
# Empty frame that will hold the combined rows of all input files.
main_df = pd.DataFrame()

In [22]:
# Build one cleaned frame per input file and concatenate them once at the end.
# main_df is rebuilt from scratch here instead of being appended to, so re-running
# this cell no longer duplicates every row, and the repeated concat inside the
# loop (which copies all accumulated rows on each pass) is avoided.
frames = []
for file in files:
    print(file)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the frame returned by read_csv.
    temp_df = pd.read_csv(file)[cols].copy()
    temp_df["file"] = file
    # remove unwanted rows
    temp_df["keep"] = temp_df.apply(cleanup_df, axis=1)
    temp_df = temp_df[temp_df["keep"] == True].copy()
    temp_df["template_formula"] = temp_df.apply(substitute_claims_with_vars, axis=1)
    frames.append(temp_df)

# Each file keeps its own row index, as before. Fall back to an empty frame when
# there is nothing to concatenate (pd.concat rejects an empty list).
main_df = pd.concat(frames) if frames else pd.DataFrame()

data/Ch03-Oil.csv
data/Ch04-Gas.csv
data/Ch05-Coal.csv
data/Ch08-Demand_and_India.csv
data/Ch9.csv
data/Ch9_4.csv
data/Ch9_PH.csv


In [23]:
main_df

Unnamed: 0,Text,Published value,Author Calculation Equations,file,keep,template_formula
0,Global oil demand grows by nearly 950 thousand...,grows,"=IF(G6>0,""ok"",""no"")",data/Ch03-Oil.csv,True,"=IF(a>0,""ok"",""no"")"
1,Global oil demand grows by nearly 950 thousand...,nearly 950 thousand,=((D6-D5)/(B6-B5))*1000,data/Ch03-Oil.csv,True,=((a-b)/(c-d))*1000
3,Global oil demand grows by nearly 950 thousand...,slows,"=IF(G9<G6,""ok"",""no"")",data/Ch03-Oil.csv,True,"=IF(a<b,""ok"",""no"")"
4,Global oil demand grows by nearly 950 thousand...,250 kb/d,=((D7-D6)/(B7-B6))*1000,data/Ch03-Oil.csv,True,=((a-b)/(c-d))*1000
5,All of this growth occurs in developing econom...,All of this growth,"=IF(D20>D19,""ok"",""no"")",data/Ch03-Oil.csv,True,"=IF(a>b,""ok"",""no"")"
...,...,...,...,...,...,...
2242,"Flexibility needs experience a step change, wi...",0.37,"=ROUND(G190, 2)",data/Ch9_PH.csv,True,"=ROUND(a,2)"
2243,"Flexibility needs experience a step change, wi...",0.37,=SUM(G192:G193)/G191,data/Ch9_PH.csv,True,=SUM(a:b)/c
2247,"All in all, demand-side response facilitated b...",450,=G211,data/Ch9_PH.csv,True,=a
2248,"All in all, demand-side response facilitated b...",300,=G202,data/Ch9_PH.csv,True,=a


In [24]:
# Number of rows that are distinct in (Text, Published value, template_formula).
len(main_df.drop_duplicates(subset=["Text", "Published value", "template_formula"]))

1828

In [212]:
# NOTE(review): this only builds a lazy SeriesGroupBy object and never aggregates it,
# and the cell's execution count (212) is out of sequence - presumably a leftover
# experiment; confirm before removing.
main_df.groupby("Author Calculation Equations")["template_formula"]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f15a47f8b00>

In [25]:
# Write the combined table as CSV, without the row index.
# NOTE(review): the output name has no ".csv" extension - confirm whether anything
# downstream depends on this exact path before renaming it.
main_df.to_csv("data/templates_from_7_files", index=False)

In [29]:
# Number of distinct template formulas among the distinct
# (Text, Published value, template_formula) rows.
len(main_df.drop_duplicates(subset=["Text", "Published value", "template_formula"]).drop_duplicates("template_formula"))

478

In [30]:
# Average number of distinct claims per template formula, computed from the data
# instead of from numbers copied out of earlier cell outputs (which go stale as
# soon as the input files change).
unique_claims = main_df.drop_duplicates(subset=["Text", "Published value", "template_formula"])
len(unique_claims) / len(unique_claims.drop_duplicates("template_formula"))

3.8242677824267783