In [1]:
# This file uses sentences from ClaimsExtractedByUs to learn the sentence-look_up_value distribution
# considering unique (sentence, claim) pairs

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import FeatureUnion
import numpy as np

In [3]:
path = "data/annotated_11_19_2019/ClaimsExtractedByUs.csv"

In [4]:
df = pd.read_csv(path, encoding="latin1")

In [5]:
df

Unnamed: 0,Text,Published value,Author calc Value,Look-up value,Look-up year,Scenario,Fixed_Formulas
0,The Sustainable Development Scenario starts wi...,7,7.1,,,,'C:\Users\Mohammed Saeed\Desktop\Energy Fact C...
1,The Sustainable Development Scenario starts wi...,3.9,3.9,,,,'C:\Users\Mohammed Saeed\Desktop\Energy Fact C...
2,The Sustainable Development Scenario starts wi...,13,13,,,,'C:\Users\Mohammed Saeed\Desktop\Energy Fact C...
3,For the first time the number of people withou...,For the first time,TRUE,,,,'C:\Users\Mohammed Saeed\Desktop\Energy Fact C...
4,For the first time the number of people withou...,fell below 1 billion,TRUE,,,,'C:\Users\Mohammed Saeed\Desktop\Energy Fact C...
5,For the first time the number of people withou...,is declining gradually,TRUE,,,,'C:\Users\Mohammed Saeed\Desktop\Energy Fact C...
6,"Despite significant steps forward in Kenya, Et...",Ethiopia,0.4014,,,,G29-G28
7,Total electrification rate,Tanzania,0.223,Ethiopia,2017,Historic,G32-G31
8,Total electrification rate,Nigeria,0.197945098,Tanzania,2017,Historic,G35-G34
9,Total electrification rate,600 million,TRUE,Nigeria,2017,Historic,'C:\Users\Mohammed Saeed\Desktop\Energy Fact C...


In [6]:
df_lookup = df[df["Fixed_Formulas"].str.contains("LOOKUP")]

In [7]:
df_lookup

Unnamed: 0,Text,Published value,Author calc Value,Look-up value,Look-up year,Scenario,Fixed_Formulas
18,Energy-related carbon dioxide (CO2) emissions ...,increased deployment,723.284262,PGINrenew,2016,Historic,"VLOOKUP($A54,G:\EO2018\FrozenResults\Demand\_A..."
19,Energy-related carbon dioxide (CO2) emissions ...,increased deployment,770.4351901,#REF!,#VALUE!,Historic,"VLOOKUP($A55,G:\EO2018\FrozenResults\Demand\_A..."
32,Stronger policy action leads to substantially ...,substantially higher,38.27872429,InvCosts_IND_Eff_Total,cum2040,,"VLOOKUP(A94,G:\EO2018\FrozenResults\Investment..."
33,Stronger policy action leads to substantially ...,substantially higher,309.3837303,InvCosts_TRA_Eff_Total,cum2040,,"VLOOKUP(A95,G:\EO2018\FrozenResults\Investment..."
34,Stronger policy action leads to substantially ...,substantially higher,224.3022014,InvCosts_BLDG_Eff_Total,cum2040,,"VLOOKUP(A96,G:\EO2018\FrozenResults\Investment..."
35,Stronger policy action leads to substantially ...,substantially higher,97.30096692,InvCosts_IND_Eff_Total,cum2040,,"VLOOKUP(A97,G:\EO2018\FrozenResults\Investment..."
36,Stronger policy action leads to substantially ...,substantially higher,338.900707,InvCosts_TRA_Eff_Total,cum2040,,"VLOOKUP(A98,G:\EO2018\FrozenResults\Investment..."
37,Stronger policy action leads to substantially ...,substantially higher,279.5660694,InvCosts_BLDG_Eff_Total,cum2040,,"VLOOKUP(A99,G:\EO2018\FrozenResults\Investment..."
39,Stronger policy action leads to substantially ...,stays close to,13971.58601,TPEDtotal,2017,SDS,"VLOOKUP($A101,G:\EO2018\FrozenResults\Demand\_..."
40,Stronger policy action leads to substantially ...,stays close to,13715.01342,TPEDtotal,2040,SDS,"VLOOKUP($A102,G:\EO2018\FrozenResults\Demand\_..."


In [8]:
# drop duplicates
df_lookup_no_dups = df_lookup.drop_duplicates(subset="Text")

In [9]:
df_lookup_no_dups

Unnamed: 0,Text,Published value,Author calc Value,Look-up value,Look-up year,Scenario,Fixed_Formulas
18,Energy-related carbon dioxide (CO2) emissions ...,increased deployment,723.284262,PGINrenew,2016,Historic,"VLOOKUP($A54,G:\EO2018\FrozenResults\Demand\_A..."
32,Stronger policy action leads to substantially ...,substantially higher,38.27872429,InvCosts_IND_Eff_Total,cum2040,,"VLOOKUP(A94,G:\EO2018\FrozenResults\Investment..."
53,for the first time the number of people withou...,for the first time,1422.12966,PopwoElecAccess,2010,Hist,"HLOOKUP($B278,G:\EO2018\FrozenResults\Demand\A..."
64,In sub-Saharan Africa 600 million people are s...,600 million people,580.67282,PopwoElecAccess,2030,NPS,"HLOOKUP($B321,G:\EO2018\FrozenResults\Demand\A..."
72,Energy-related carbon dioxide (CO2) emissions ...,for the first time in three years,32138.38726,CO2TPEDtotal,2014,Historic,"VLOOKUP($A352,G:\EO2018\FrozenResults\Demand\_..."
82,0,increased deployment of renewables,723.284262,PGINrenew,2016,Historic,"VLOOKUP($A372,G:\EO2018\FrozenResults\Demand\_..."
91,Substantial reductions in major air pollutant ...,Substantial reductions,48404.7228,SO2_total,2030,NPS,"HLOOKUP($B26,G:\EO2018\FrozenResults\Pollutant..."
99,Energy-related CO2 emissions peak and then dec...,peak,34884.22829,CO2_TPED_IP_total,2017,SDS,"VLOOKUP($A40,G:\EO2018\FrozenResults\Demand\_A..."
110,Given expected strong population growth over t...,around 1.2 billion,1218.768702,CumPopElecAccess,2030,SDS,"HLOOKUP($B11,G:\EO2018\FrozenResults\Demand\Ac..."
115,The cheapest way to achieve universal electric...,cheapest,375.3342133,AccessPGOUTtotal,2030,SDS,"HLOOKUP($B21,G:\EO2018\FrozenResults\Demand\Ac..."


In [10]:
# Experiment 1
# The features will be the text and the label will be the lookup-value here

In [11]:
from src.tokenizer.tokenizer_driver import TokenizerDriver
from src.featurizer.feature_extractor import FeatureExtractor

In [12]:
tok_driver = TokenizerDriver()
featurizer = FeatureExtractor(mode="tfidf")

In [13]:
claim_sents = list(df_lookup_no_dups["Text"])

In [14]:
tokenized_sents = tok_driver.tokenize_claims(claim_sents)

In [15]:
features = featurizer.featurize_claims(tokenized_sents)

In [16]:
features.shape

(140, 2444)

In [17]:
labels = list(df_lookup_no_dups["Look-up value"])

In [18]:
len(labels)

140

In [19]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=42)

In [20]:
X_train.shape, len(y_train), X_test.shape, len(y_test)

((112, 2444), 112, (28, 2444), 28)

In [21]:
model = LinearSVC()

In [22]:
model.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [23]:
final_model = CalibratedClassifierCV(base_estimator=model, cv="prefit")

In [24]:
final_model.fit(X_train, y_train)

CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv='prefit', method='sigmoid')

In [25]:
def _linear_scale_confidence(confidences):
    """
    return the ratio of prob according to the sum of top n probabilities for the predicted intents.
    if probs = [p1, p2, p3] then the return probabilities will be scaled as
    [p1/sum(p1,p2,p3), p2/sum(p1,p2,p3), p3/sum(p1,p2,p3)]
    Args:
        confidences: probabilities of intents
    Returns:
        numpy array: the scaled confidences
    """
    s = np.sum(confidences)
    return confidences/s

In [26]:
def predict_utt_top_n(mod, featurized_utt, n=3):
    """
    predict the top3 intents along with the confidence probability for each one.
    Note that model.classes_ contains the trained labels in alphabetical order. Here, we sort the
    confidences together with the labels, and return the top3 from this sorted order
    Args:
        featurized_utt (str): featurized and tokenized single utterance
    Returns:
        One list of strings and one list of floats
    """
    raw_confidences = mod.predict_proba(featurized_utt)[0]
    # indices of sorted confidences from high to low confidence
    sorted_conf_idx = np.argsort(raw_confidences)[::-1][:n]
    labels = np.take(mod.classes_, sorted_conf_idx)
    confidences = np.take(raw_confidences, sorted_conf_idx)
    scaled_confidences = _linear_scale_confidence(confidences)

    return labels, scaled_confidences

In [27]:
temp = X_test[0]

In [28]:
predict_utt_top_n(final_model, temp.reshape(1, -1))

(array(['SubsidyTotal_World', 'CO2TPEDtotal', 'PGINrenew'], dtype='<U32'),
 array([0.46084942, 0.39177897, 0.14737161]))

In [29]:
predictions = [predict_utt_top_n(final_model, test.reshape(1, -1), n=5) for test in X_test]

In [30]:
predictions

[(array(['SubsidyTotal_World', 'CO2TPEDtotal', 'PGINrenew', 'CO2PGINcoal',
         'CO2_TPED_IP_total'], dtype='<U32'),
  array([0.37595172, 0.31960543, 0.1202228 , 0.09787882, 0.08634122])),
 (array(['InvAccess_cookingtotal', 'PGINrenew', 'AccessPGInvInfra_total',
         'TPEDperGDP', 'PM_Bldg_Total'], dtype='<U32'),
  array([0.41828627, 0.17121213, 0.16958955, 0.12451565, 0.1163964 ])),
 (array(['TPEDgas', 'PGINrenew', 'PGCap_coal', 'PGOUTtotal',
         'PM_Bldg_Total'], dtype='<U32'),
  array([0.21492098, 0.2047772 , 0.19995913, 0.19163121, 0.18871147])),
 (array(['PGINrenew', 'PGOUTtotal', 'ElecGenElecOnly_Coal_Subcritical',
         'INDIC_CO2_Elec_g_per_kWh', 'TPEDperGDP'], dtype='<U32'),
  array([0.25922123, 0.23523486, 0.1879944 , 0.15986291, 0.1576866 ])),
 (array(['TFCtotal', 'PGINrenew', 'TPEDperGDP', 'InvCosts_IND_Eff_Total',
         'TFCelec'], dtype='<U32'),
  array([0.30423608, 0.26883416, 0.15053009, 0.13955997, 0.1368397 ])),
 (array(['CO2TPEDtotal', 'PGINrenew',

In [31]:
len(y_test)

28

In [35]:
len(prediction_labels)

28

In [33]:
prediction_labels = [pred[0][0] for pred in predictions]

In [34]:
n_correct = np.sum(y_test == prediction_labels)
accuracy = float(n_correct / len(y_test))

In [36]:
n_correct

0

In [37]:
[x == y for (x, y) in zip(y_test, prediction_labels)]

[True,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True]

In [38]:
num_correct = 0
for test, pred in zip(y_test, predictions):
    topn = pred[0]
    if test in topn:
        num_correct += 1

In [39]:
num_correct / len(y_test)

0.25

In [41]:
len(df_lookup_no_dups.drop_duplicates(subset="Look-up value"))

87