# Experimenting with zero-shot classifiers as a noisy label source

The classifiers are applied at sentence level within parsed policy documents.

<a href="https://githubtocolab.com/climatepolicyradar/policy-search/blob/master/nbs/notebooks/2021-10-04_experiment_-_zero-shot_sentence_classification.ipynb" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/></a>

In [164]:
# notebook-specific dependencies

!python -m pip install torch transformers 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
from pathlib import Path

import yaml
import pandas as pd
from transformers import pipeline

# Folder the input data is read from (relative path, so it depends on the working directory)
DATA_FOLDER = Path("../data")

In [41]:
def flatten_list_of_lists(l: list) -> list:
    """
    Flatten one level of nesting, keeping non-list items as they are.

    [[1, 2], [3]] -> [1, 2, 3]
    [[1, 2], 3] -> [1, 2, 3]

    Only items that are themselves lists are expanded; deeper nesting and
    other iterables (tuples, strings, ...) are passed through unchanged.

    Args:
        l: list whose items are either lists or single values.

    Returns:
        A new list; the input is not modified.
    """

    res = []

    for item in l:
        if isinstance(item, list):
            # extend in place: `res = res + item` would copy the whole
            # accumulated result for every nested list
            res.extend(item)
        else:
            res.append(item)

    return res


In [7]:
# Load the policy dataset and print a per-column summary of it.
policy_dataset_path = DATA_FOLDER / "policy_dataset.csv"
df = pd.read_csv(policy_dataset_path)

df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1666918 entries, 0 to 1666917
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   policy_id    1666918 non-null  int64 
 1   policy_name  1666918 non-null  object
 2   page_id      1666918 non-null  int64 
 3   text         1666918 non-null  object
dtypes: int64(2), object(2)
memory usage: 50.9+ MB


## 1. Get class labels, download models

### 1a) Get class labels

In [38]:
# Folder holding the YAML schema definitions
SCHEMA_FOLDER = Path("../schema")

instruments_path = SCHEMA_FOLDER / "instruments.yml"
with open(instruments_path, "r") as f:
    instruments = yaml.safe_load(f)

# instrument name -> names of its levels
instruments_names = {}
for instrument in instruments:
    instruments_names[instrument['name']] = [level['name'] for level in instrument['levels']]

In [130]:
class Schema:
    """
    Wrapper around parsed schema data, exposing lookups between its names,
    levels ("subsectors") and keywords.

    `data` is indexed by the properties below as a list of entries shaped like
    `{'name': ..., 'levels': [{'name': ..., 'keywords': [...]}, ...]}`.
    """

    def __init__(self, data):
        self.data = data

    @classmethod
    def from_yaml_path(cls, path: Path):
        """Build a schema from the YAML file at `path`."""
        # explicit encoding so the result does not depend on the platform default
        with open(path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)

        # construct through `cls` so a subclass gets an instance of its own type
        return cls(data)

    @property
    def name_subsector_mapping(self):
        """Map each top-level name to the list of its level names."""
        return {
            entry['name']: [level['name'] for level in entry['levels']]
            for entry in self.data
        }

    @property
    def name_keyword_mapping(self):
        """Map each top-level name to the keywords of all its levels, as one flat list."""
        return {
            entry['name']: flatten_list_of_lists([level['keywords'] for level in entry['levels']])
            for entry in self.data
        }

    @property
    def subsector_keyword_mapping(self):
        """Map each level name to its keywords; if a level name repeats, the last one wins."""
        return {
            level['name']: level['keywords']
            for entry in self.data
            for level in entry['levels']
        }

    @property
    def all_keywords(self):
        """List every distinct keyword found in the schema."""
        return list(self.keyword_subsector_mapping)

    @property
    def keyword_subsector_mapping(self):
        """Map each keyword to its level name; a keyword listed under several levels keeps the last."""
        return {
            keyword: subsector
            for subsector, kwd_list in self.subsector_keyword_mapping.items()
            for keyword in kwd_list
        }
    

In [132]:
# Load both schemas through the Schema wrapper; from here on `instruments`
# is a Schema object rather than the raw parsed YAML.
instruments_path = SCHEMA_FOLDER / "instruments.yml"
sectors_path = SCHEMA_FOLDER / "sectors.yml"

instruments = Schema.from_yaml_path(instruments_path)
sectors = Schema.from_yaml_path(sectors_path)

### 1b) Get models

#### typeform, distilbert MNLI

In [118]:
# typeform, distilbert MNLI: https://huggingface.co/typeform/distilbert-base-uncased-mnli
# Zero-shot classification pipeline backed by that model.
DISTILBERT_MNLI_MODEL = "typeform/distilbert-base-uncased-mnli"
clf_bert = pipeline("zero-shot-classification", model=DISTILBERT_MNLI_MODEL)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [126]:
text = """Support around £3.6 billion of investment to upgrade around a million homes through the Energy Company Obligation (ECO), and extend support for home energy efficiency improvements until 2028 at the current level of ECO funding"""

# NOTE: `classify` is defined in section 1c below; run that cell before this one.
# Candidate labels are the keywords of the instruments schema (the previous
# `i.all_keywords` referred to a name that is never bound at module level).
classify(clf_bert, text, instruments.all_keywords, threshold=0.7)

[('investment', 0.9992654323577881),
 ('obligations', 0.976239800453186),
 ('funding', 0.9000128507614136),
 ('finance', 0.7846159338951111),
 ('interest', 0.7175412774085999)]

### 1c) Classification helper


In [141]:
def classify(clf: pipeline, text: str, classes: list, threshold: float, multilabel: bool = True):
    """
    Run a zero-shot classification pipeline on `text` and keep the confident labels.

    `clf` is called as `clf(text, classes, multi_label=multilabel)` and its result
    is read through the parallel 'labels' and 'scores' entries.

    Args:
        clf: zero-shot classification pipeline (any callable with that call shape).
        text: text to classify.
        classes: candidate labels handed to the pipeline.
        threshold: minimum score (inclusive) for a label to be kept.
        multilabel: forwarded to the pipeline as `multi_label`.

    Returns:
        List of (label, score) tuples, in the order the pipeline returned them.
    """
    results_raw = clf(text, classes, multi_label=multilabel)

    # walk labels and scores in lockstep instead of indexing both by position
    return [
        (label, score)
        for label, score in zip(results_raw['labels'], results_raw['scores'])
        if score >= threshold
    ]
    

class ZeroShotClassifier:
    """
    Zero-shot classifier whose candidate labels are the keywords of a schema.

    Each prediction is reported together with the subsector that the schema's
    `keyword_subsector_mapping` assigns to the matched keyword.
    """

    def __init__(self, schema: Schema, huggingface_pipeline_name: str = "typeform/distilbert-base-uncased-mnli", multi_label: bool = True):
        """
        Args:
            schema: provides `all_keywords` and `keyword_subsector_mapping`.
            huggingface_pipeline_name: model name passed to the zero-shot pipeline.
            multi_label: forwarded to the pipeline as `multi_label` on every prediction.
        """
        self.schema = schema
        self.multi_label = multi_label
        self.clf = self._load_pipeline_from_huggingface(huggingface_pipeline_name)

        # looked up once here and reused by every `predict` call
        self._keyword_subsector_mapping = self.schema.keyword_subsector_mapping

    def _load_pipeline_from_huggingface(self, name: str):
        """Create the zero-shot classification pipeline for the model called `name`."""
        return pipeline("zero-shot-classification", model=name)

    @property
    def _class_labels(self):
        """Candidate labels handed to the pipeline: all keywords of the schema."""
        return self.schema.all_keywords

    def predict(self, text: str, threshold: float):
        """
        Classify `text` against the schema keywords.

        Returns:
            List of (keyword, subsector, score) tuples for every keyword scoring
            at least `threshold`, in the order the pipeline returned them.
        """
        pipeline_result = self.clf(text, self._class_labels, multi_label=self.multi_label)

        # walk labels and scores in lockstep instead of indexing both by position
        return [
            (label, self._keyword_subsector_mapping[label], score)
            for label, score in zip(pipeline_result['labels'], pipeline_result['scores'])
            if score >= threshold
        ]


In [142]:
# Build a classifier over the instruments schema and try it on the sample text.
clf = ZeroShotClassifier(schema=instruments)
clf.predict(text=text, threshold=0.7)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


[('investment', 'Infrastructure', 0.9992654323577881),
 ('obligations', 'Obligations', 0.976239800453186),
 ('funding', 'Finance', 0.9000128507614136),
 ('finance', 'Finance', 0.7846159338951111),
 ('interest', 'Finance', 0.7175412774085999)]

## 2. Run zero-shot classifiers on a few examples

In [148]:
n_examples = 15

# Fixed random_state so the same rows are sampled on every run.
sampled_rows = df.sample(n=n_examples, random_state=42)
examples = sampled_rows['text'].tolist()

# One classifier per schema; each one constructs its own pipeline.
clf_instruments = ZeroShotClassifier(schema=instruments)
clf_sectors = ZeroShotClassifier(schema=sectors)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [163]:
# Minimum score for a prediction to be printed
THRESHOLD = 0.8

# For every example: the text itself, then one "- (...)" line per prediction
# from each classifier, then a separator.
for _str in examples:
    print(_str, end="\n\n")
    print("INSTRUMENT PREDICTIONS")
    instrument_predictions = clf_instruments.predict(_str, THRESHOLD)
    print(*(f"- {pred}" for pred in instrument_predictions), sep="\n", end="\n\n")
    print("SECTOR PREDICTIONS")
    sector_predictions = clf_sectors.predict(_str, THRESHOLD)
    print(*(f"- {pred}" for pred in sector_predictions), sep="\n")
    print("----------------------")

Governance comprises the traditions, institutions and processes that determine how power is shared and exercised, how decisions are made and how authority responds on issues of public concern.
INSTRUMENT PREDICTIONS
- ('structures', 'Structures and processes', 0.999853789806366)
- ('processes', 'Designing processes', 0.99956214427948)
- ('organisation', 'Creating bodies/institutions', 0.9960892796516418)

SECTOR PREDICTIONS
- ('power', 'Energy use', 0.9722657799720764)
----------------------
Our strategy to achieve carbon budgets 43
INSTRUMENT PREDICTIONS
- ('strategic plan', 'Developing plans and stratgegies', 0.9998148083686829)
- ('carbon budget', 'Designing processes', 0.9997868537902832)
- ('strategy', 'Developing plans and stratgegies', 0.9997297525405884)
- ('plan', 'Developing plans and stratgegies', 0.9995908141136169)
- ('carbon pricing', 'Market-based instruments', 0.9854298830032349)
- ('processes', 'Designing processes', 0.8829991221427917)
- ('stakeholder engagement', 'St