In [76]:
"""
This file goes through the formulas and marks up the values with generic variables.
For example G13-G12 will become A-B
"""

'\nThis file goes through the formulas and marks up the values with generic variables.\nFor example G13-G12 will become A-B\n'

In [1]:
import pandas as pd
import os
import re



In [2]:
IN_DIR = "data/"

In [3]:
f = ["Ch03-Oil.csv", "Ch04-Gas.csv", "Ch05-Coal.csv", "Ch08-Demand_and_India.csv", "Ch9.csv", "Ch9_4.csv", "Ch9_PH.csv", "Ch3_Claims_ExtractedByUs.csv"]

In [4]:
files = [os.path.join(IN_DIR, name) for name in f]

In [5]:
files

['data/Ch03-Oil.csv',
 'data/Ch04-Gas.csv',
 'data/Ch05-Coal.csv',
 'data/Ch08-Demand_and_India.csv',
 'data/Ch9.csv',
 'data/Ch9_4.csv',
 'data/Ch9_PH.csv',
 'data/Ch3_Claims_ExtractedByUs.csv']

In [6]:
formula_regex =  "[A-Z]+[0-9]+"

In [7]:
other_file_ref_regex = "\'.*\'"

In [8]:
str_const_regex = "(?<![0-9])[a-z]+(?![0-9])"

In [85]:
num_const_regex = "(?<![a-zA-Z])[0-9]+.?[0-9]*(?![a-zA-Z])"

In [11]:
if_regex = "IF\(.*\)"

In [12]:
re.findall(formula_regex, "SUM(D11:D16)/D11")

['D11', 'D16', 'D11']

In [13]:
re.findall(other_file_ref_regex, "='3.2.1 - 3.2.5 + 3.2.8'!G593")

["'3.2.1 - 3.2.5 + 3.2.8'"]

In [14]:
re.findall(str_const_regex, "D12=\"ok\"")

['ok']

In [33]:
temp = "\(.*\)"

In [36]:
q = re.findall(temp, "IF(a<b, \"ok\", \"nok\")")

In [37]:
q[0]

'(a<b, "ok", "nok")'

In [32]:
q[0].split("(")

['IF', 'AND', 'a>b,c>b), "ok", "nok")']

In [90]:
variables = "abcdefghijklmnopqrstuvwxyz"

In [91]:
variables_list = [v for v in variables]

In [92]:
def get_variables_for_formula(cell_references, variables):
    """
    :param cell_references: list which contains the variables of the formulas 
                            (example: ["G11", "G14", "G22", "G11"])
    :param variables: list containing all the letters
    returns a dict that maps the cell_references to the variables that should be replaced in the formula
    for the exmple above, we return {"G11":"a", "G14":"b", "G22": "c"]
    """
    ret_dict = dict()
    var_idx = 0
    for ref in cell_references:
        if not ref in ret_dict:
            ret_dict[ref] = variables[var_idx]
            var_idx += 1
    return ret_dict

In [93]:
def replace_claim_with_variables(formula, ref_var_dict):
    """
    claim: str of the formula (example: G11-G21/3)
    ref_var_dict: dict with keys the cell references (exist in claim) and values the variables that 
                  should replace them
    """
    ret_formula = formula
    for ref, var in ref_var_dict.items():
        ret_formula = ret_formula.replace(ref, var)
    return ret_formula

In [94]:
def replace_str_formula(formula, str_list):
    """
    replace all the constant strings in the formula with STR
    """
    const_str = "STR"
    ret_formula = formula
    for s in str_list:
        ret_formula = ret_formula.replace(s, const_str)
    return ret_formula

In [95]:
def remove_white_space(s):
    return s.replace(" ", "")

In [96]:
d = get_variables_for_formula(["G11", "G12", "G11"], variables_list)

In [97]:
d

{'G11': 'a', 'G12': 'b'}

In [98]:
replace_claim_with_variables("G11 - G12/G11", d).replace(" ", "")

'a-b/a'

In [99]:
def cleanup_df(row):
    if "LOOKUP" in row["Published value"] or "LOOKUP" in row["Author Calculation Equations"]:
        return False
    if "Fig" in row["Author Calculation Equations"]:
        return False
    elif len(re.findall(formula_regex, row["Author Calculation Equations"])) == 0:
        return False
    elif len(re.findall(formula_regex, row["Published value"])) > 0:
        return False
    elif len(re.findall(other_file_ref_regex, row["Author Calculation Equations"])) > 0:
        return False
    else:
        return True

In [100]:
def substitute_claims_with_vars(row):
    formula = row["Author Calculation Equations"]
    # G12, G1, ... etc.
    cell_references = re.findall(formula_regex, formula)
    string_references = re.findall(str_const_regex, formula)
    ref_vars_dict = get_variables_for_formula(cell_references, variables_list)
    template_formula = replace_claim_with_variables(formula, ref_vars_dict)
    template_formula = replace_str_formula(template_formula, string_references)
    template_formula = remove_white_space(template_formula)
    return template_formula

In [101]:
# read input files and get the template_formulas from each and merge everything together

In [102]:
cols = ["Text", "Published value", "Author Calculation Equations"]

In [103]:
main_df = pd.DataFrame()

In [104]:
for file in files:
    print(file)
    temp_df = pd.read_csv(file)
#     temp_df = temp_df[cols]
    temp_df["file"] = file
    # remove unwanted rows
    temp_df["keep"] = temp_df.apply(cleanup_df, axis=1)
    temp_df = temp_df[temp_df.keep == True]
    temp_df["template_formula"] = temp_df.apply(substitute_claims_with_vars, axis=1)
    main_df = pd.concat([main_df, temp_df])

data/Ch03-Oil.csv
data/Ch04-Gas.csv
data/Ch05-Coal.csv
data/Ch08-Demand_and_India.csv
data/Ch9.csv
data/Ch9_4.csv
data/Ch9_PH.csv
data/Ch3_Claims_ExtractedByUs.csv


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


In [105]:
main_df.head(30)

Unnamed: 0,Author Calculation Equations,Author calc Value,Look-up value,Look-up year,Published value,Scenario,Text,file,keep,template_formula
0,"=IF(G6>0,""ok"",""no"")",,TPEDoil_mbd,2017,grows,NPS,Global oil demand grows by nearly 950 thousand...,data/Ch03-Oil.csv,True,"=IF(a>0,""STR"",""STR"")"
1,=((D6-D5)/(B6-B5))*1000,,TPEDoil_mbd,2025,nearly 950 thousand,NPS,Global oil demand grows by nearly 950 thousand...,data/Ch03-Oil.csv,True,=((a-b)/(c-d))*1000
3,"=IF(G9<G6,""ok"",""no"")",,TPEDoil_mbd,2040,slows,NPS,Global oil demand grows by nearly 950 thousand...,data/Ch03-Oil.csv,True,"=IF(a<b,""STR"",""STR"")"
4,=((D7-D6)/(B7-B6))*1000,,TPEDoil_mbd,2040,250 kb/d,NPS,Global oil demand grows by nearly 950 thousand...,data/Ch03-Oil.csv,True,=((a-b)/(c-d))*1000
5,"=IF(D20>D19,""ok"",""no"")",,TPEDoil_mbd,2017,All of this growth,NPS,All of this growth occurs in developing econom...,data/Ch03-Oil.csv,True,"=IF(a>b,""STR"",""STR"")"
6,"=IF(G18<0,""ok"",""no"")",,TPEDoil_mbd,2025,drops,NPS,All of this growth occurs in developing econom...,data/Ch03-Oil.csv,True,"=IF(a<0,""STR"",""STR"")"
7,=((D18-D16)/(B18-B16))*1000,,TPEDoil_mbd,2040,over 400 kb/d on average,NPS,All of this growth occurs in developing econom...,data/Ch03-Oil.csv,True,=((a-b)/(c-d))*1000
8,"=IF(A30<A29,""ok"",""false"")",,China,,slows noticeably,,Even though its demand growth slows noticeably...,data/Ch03-Oil.csv,True,"=IF(a<b,""STR"",""STR"")"
9,"=IF(D28>D35,""ok"",""false"")",,TPEDoil_mbd,2017,overtakes,NPS,Even though its demand growth slows noticeably...,data/Ch03-Oil.csv,True,"=IF(a>b,""STR"",""STR"")"
10,"=IF(AND(D28>D35,D28>D37),""ok"",""false"")",,TPEDoil_mbd,2025,world's largest,NPS,Even though its demand growth slows noticeably...,data/Ch03-Oil.csv,True,"=IF(AND(a>b,a>c),""STR"",""STR"")"


In [64]:
size = len(main_df.drop_duplicates(subset=["Text", "Published value", "template_formula"]))

In [65]:
size

2091

In [61]:
main_df.groupby("Author Calculation Equations")["template_formula"]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fb8c426cc18>

In [62]:
# main_df.to_csv("data/templates_from_8_files.csv", index=False)

In [66]:
 len(main_df.drop_duplicates(subset=["Text", "Published value", "template_formula"]).drop_duplicates("template_formula"))

In [67]:
num_labels = len(main_df.drop_duplicates(subset="template_formula"))

In [68]:
num_labels

556

In [69]:
size/num_labels

3.7607913669064748

In [61]:
main_df[main_df.file == "data/Ch3_Claims_ExtractedByUs.csv"]

Unnamed: 0,Text,Published value,Author Calculation Equations,file,keep,template_formula
0,Global oil demand grows by nearly 950 thousand...,grows,"IF(G6>0,""ok"",""no"")",data/Ch3_Claims_ExtractedByUs.csv,True,"IF(a>0,""ok"",""no"")"
1,Global oil demand grows by nearly 950 thousand...,nearly 950 thousand,((D6-D5)/(B6-B5))*1000,data/Ch3_Claims_ExtractedByUs.csv,True,((a-b)/(c-d))*1000
3,Global oil demand grows by nearly 950 thousand...,slows,"IF(G9<G6,""ok"",""no"")",data/Ch3_Claims_ExtractedByUs.csv,True,"IF(a<b,""ok"",""no"")"
4,Global oil demand grows by nearly 950 thousand...,250 kb/d,((D7-D6)/(B7-B6))*1000,data/Ch3_Claims_ExtractedByUs.csv,True,((a-b)/(c-d))*1000
5,All of this growth occurs in developing econom...,All of this growth,"IF(D20>D19,""ok"",""no"")",data/Ch3_Claims_ExtractedByUs.csv,True,"IF(a>b,""ok"",""no"")"
...,...,...,...,...,...,...
647,Robust production growth in Brazil increases C...,Robust production growth,G261>1%,data/Ch3_Claims_ExtractedByUs.csv,True,a>1%
648,Robust production growth in Brazil increases C...,increases,"AND(G269>G264,G271>G264)",data/Ch3_Claims_ExtractedByUs.csv,True,"AND(a>b,c>b)"
649,Robust production growth in Brazil increases C...,decline,G274<G273,data/Ch3_Claims_ExtractedByUs.csv,True,a<b
650,Robust production growth in Brazil increases C...,waning,G277<G276,data/Ch3_Claims_ExtractedByUs.csv,True,a<b


In [75]:
q = pd.merge(main_df, main_df, on="Published value")

In [None]:
q[]