In [56]:
import pandas as pd
import numpy as np

import time
from pathlib import Path

import torch
from transformers import BartForSequenceClassification, BartTokenizer, BartConfig

# set a seed value
torch.manual_seed(555)

# internal libraries
from ressources import target_to_label

In [57]:
# Output directory for this run's CSV. It is created up front because
# open(..., "w") does not create missing parent directories.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

# Timestamp used as the CSV file name for this run.
now = time.time()

# Column-oriented accumulator: the input text, the per-text prediction time,
# then one list per key of target_to_label.
results = {**{"text": [], "time": []}, **{k: [] for k in target_to_label.keys()}}

# Reverse lookup: label string -> its key in target_to_label.
label_to_target = {v: k for k, v in target_to_label.items()}

# The header is derived from the accumulator's own keys so that it always has
# the same columns, in the same order, as the rows later built from `results`.
with open(results_dir / Path(f"{now}.csv"), "w") as f:
    f.write(",".join(str(k) for k in results) + "\n")

In [58]:
# Single checkpoint identifier shared by the config, tokenizer and model.
MODEL_NAME = "valhalla/distilbart-mnli-12-9"

config = BartConfig.from_pretrained(MODEL_NAME)
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = BartForSequenceClassification.from_pretrained(MODEL_NAME)

In [59]:
data = pd.read_csv("osdg-data.csv")

# Keep only the rows whose `sdg` is 12 and whose `label_osdg` is "accepted".
is_sdg_12 = data["sdg"].eq(12)
is_accepted = data["label_osdg"].eq("accepted")
df = data.loc[is_sdg_12 & is_accepted]

# Show the first retained text as a sanity check.
text = df["text"].iloc[0]
print(text)

This makes tourism a co-ordination-intensive, as well as information-intensive, industry (Zhang et al. The key components of tourism are accommodation, transport, attractions and excursions, and restaurants; all are ‘perishable’. This means that airline seats, hotel rooms and daily ticket sales, for example, cannot be stored for potential future sales. This level of uncertainty, coupled with the uncertainty of global trends and exogenous shocks, has become an important area of tourism supply chain research. Areas of particular interest include demand forecasting, yield or revenue management and inventory management (Zhang et al. Finally, the supply chains in tourism that already exist are usually part of the wider global operations of major hotels and resorts (for example Hilton and Four Seasons hotels) and of cruise ship operators (for example Carnival Corporation and Royal Caribbean).


In [60]:
def predict(premise, hypothesis):
    """Score how strongly `premise` entails `hypothesis` with the NLI model.

    Uses the module-level `tokenizer` and `model`. Prints the score and
    returns it.

    Args:
        premise: Text to classify.
        hypothesis: Candidate statement about the premise.

    Returns:
        float: Entailment probability as a percentage in [0, 100].
    """
    # Encode the pair as one sequence. If it is too long for the model, only
    # the premise is truncated so the hypothesis always survives intact.
    input_ids = tokenizer.encode(
        premise, hypothesis, return_tensors="pt", truncation="only_first"
    )

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(input_ids)[0]

    # we throw away "neutral" (dim 1) and take the probability of
    # "entailment" (2) as the probability of the label being true
    entail_contradiction_logits = logits[:, [0, 2]]

    # Softmax over the two kept logits; column 1 is the entailment one.
    probs = entail_contradiction_logits.softmax(dim=1)
    true_prob = probs[:, 1].item() * 100
    print(f"Probability that '{hypothesis}' is true: {true_prob:0.2f}%")

    return true_prob

In [61]:
# Label strings in target_to_label order (kept as a module-level name).
labels = list(target_to_label.values())

# How many texts to score. The default of 1 keeps the previous behaviour, in
# which the loop stopped after the first text; set to None to score all of df.
MAX_TEXTS = 1

texts_to_score = df["text"] if MAX_TEXTS is None else df["text"].head(MAX_TEXTS)

for text in texts_to_score:

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    results["text"].append(text)

    # Walk (target id, label) pairs directly instead of mapping each label
    # back through a reverse dict, which would merge targets sharing a label.
    for target_id, label in target_to_label.items():
        # Build hypothesis
        hypothesis = "The context is " + label

        # Run prediction
        true_prob = predict(text, hypothesis)

        results[target_id].append(true_prob)

    total_time = time.perf_counter() - start_time
    print(f"Total prediction time : {total_time:0.2f}s")

    results["time"].append(total_time)

Probability that 'The context is countries taking action on sustainable consumption and production' is true: 68.07%
Probability that 'The context is sustainable management and efficient use of natural resources' is true: 12.63%
Probability that 'The context is halve per capita global food waste and reduce food losses' is true: 24.30%
Probability that 'The context is environmentally sound management of all wastes' is true: 1.72%
Probability that 'The context is substantially reduce waste generation' is true: 12.65%
Probability that 'The context is encourage companies to integrate sustainability into their reporting cycle' is true: 47.04%
Probability that 'The context is promote public procurement practices that are sustainable' is true: 37.22%
Probability that 'The context is ensure that people have the relevant information and awareness for sustainable development and lifestyles' is true: 78.46%
Probability that 'The context is strengthen their scientific and technological capacity for

In [71]:
# Append one CSV row per scored text to this run's results file.
# Row layout: the quoted text, then every other column of `results`
# (prediction time first, then one score per target), with two decimals.
with open(results_dir / Path(f"{now}.csv"), "a") as f:
    for i in range(len(results["time"])):
        # Double any embedded quotes so the quoted text field stays valid CSV.
        # No local named `time` is bound here: it would shadow the `time`
        # module and break re-running cells that call time.time().
        text = str(results["text"][i]).replace('"', '""')
        new_line = (
            f'"{text}",'
            + ",".join([f"{v[i]:.2f}" for k, v in results.items() if k != "text"])
            + "\n"
        )
        f.write(new_line)