# Initialization

## Import libraries

In [24]:
#!pip install nltk
#!pip install torch
#!pip install torchvision 
#!pip install transformers
#!pip install -U scikit-learn
#!pip install -U sentence-transformers
#!pip install -U datasets
#!pip install -U numpy
#!pip install pyyaml
#!pip install Pillow
#!pip install image
#!pip install tabulate

In [25]:
import requests
import os
import json
import nltk
import re
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error



from tabulate import tabulate
from tqdm import trange
import random

import transformers
from datasets import Dataset,load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## Set Variables

In [53]:
use_colab = False
if (use_colab):
    from google.colab import drive
    drive.mount('/content/drive')
    dataset_directory = '/content/drive/MyDrive/Colab/datasets/wikipedia_to_sdgs'
    workspace = '/content/drive/MyDrive/Colab/wikipedia_to_sdgs'
else:
    workspace = '.'

sdgs_corpus_titles_path = os.path.join(workspace, 'sdgs_titles.json')
corpus_directory = os.path.join(workspace, 'articles')
country_articles_file_path = os.path.join(workspace, 'country_articles.json')
similarity_output_directory = os.path.join(workspace, 'similarities')
similarity_threshold = 0.3
sdg_indicator_file_path=os.path.join(workspace, 'sdgs_data.csv')

# Build the SDG-Similarity Network

In [27]:
def retrieve_wikipedia_article(article_uri):
    if article_uri.startswith('http'):
        article_title = article_uri.rsplit('/', 1)[-1]
        print('Title = ', article_title)
    else:
        article_title = article_uri
    
    request= 'https://en.wikipedia.org/w/api.php?format=json&action=query&titles='+article_title+'&prop=extracts&exlimit=max&explaintext&exlimit=max'
    response = requests.get(request).json()
    pageid = list(response['query']['pages'])[0]
    content = response['query']['pages'][pageid]['extract']
    return content

## Get SDG article

In [28]:
def retrieve_sdg_article(article_uri):
    if 'wikipedia' in article_uri:
        article_content = retrieve_wikipedia_article(article_uri)
    else:
        print('unsupported article source ', article_uri)
    
    return article_content


## Read the goals titles

In [29]:
sdgs_corpus_titles_file = open(sdgs_corpus_titles_path, 'r')
sdgs_corpus_titles = json.load(sdgs_corpus_titles_file)
print('Number of goals found in the corpus file = ', len(sdgs_corpus_titles))

Number of goals found in the corpus file =  17


## Download the corpus files

In [30]:
sdg_corpus = dict()

for goal in sdgs_corpus_titles:
    article_contents = []
    title = sdgs_corpus_titles[goal]['title']
    articles = sdgs_corpus_titles[goal]['articles']
    print('Retrieving articles for ', goal+': '+title)
    for article in articles:
        article_content = retrieve_sdg_article(article)
        article_contents.append(article_content)
    sdg_corpus[goal+': '+title] = article_contents
        

Retrieving articles for  Goal 1: No Poverty
Title =  Sustainable_Development_Goal_1
Retrieving articles for  Goal 2: Zero Unger
Title =  Sustainable_Development_Goal_2
Retrieving articles for  Goal 3: Good Health and Well-being
Title =  Sustainable_Development_Goal_3
Retrieving articles for  Goal 4: Quality Education
Title =  Sustainable_Development_Goal_4
Retrieving articles for  Goal 5: Gender Equality
Title =  Sustainable_Development_Goal_5
Retrieving articles for  Goal 6: Clean Water and Sanitation
Title =  Sustainable_Development_Goal_6
Retrieving articles for  Goal 7: Affordable and Clean Engergy
Title =  Sustainable_Development_Goal_7
Retrieving articles for  Goal 8: Decent Work and Economic Growth
Title =  Sustainable_Development_Goal_8
Retrieving articles for  Goal 9: Industry, Innovation and Infrastructure
Title =  Sustainable_Development_Goal_9
Retrieving articles for  Goal 10: Reduced Inequalities
Title =  Sustainable_Development_Goal_10
Retrieving articles for  Goal 11: Su

# Feed documents into BERT

## Convert corpus dictionary to dataframe

In [31]:
sdgs_documents_df = pd.DataFrame(columns=['document', 'sdg'])

row_index = 0
for goal in sdg_corpus:
    documents = sdg_corpus[goal]
    for document in documents:
        row = {'document':document, 'sdg':goal}
        sdgs_documents_df.loc[row_index] = row
        row_index+=1

sdgs = sdgs_documents_df.sdg.values
sdgs_documents = sdgs_documents_df.document.values
#sorted_goals = np.sort(np.unique(goals))
#labels = np.searchsorted(sorted_goals,goals)

# Build the BERT Model

In [32]:
model = SentenceTransformer('bert-base-nli-mean-tokens')

loading configuration file /kuacc/users/asafa22/.cache/torch/sentence_transformers/sentence-transformers_bert-base-nli-mean-tokens/config.json
Model config BertConfig {
  "_name_or_path": "/kuacc/users/asafa22/.cache/torch/sentence_transformers/sentence-transformers_bert-base-nli-mean-tokens/",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file /kuacc/users/asafa22/.cache/torch/sentence_transformers/sentence-transformers_bert-base-nli-me

## Construct the sentence embeddings for the goals titles

In [33]:
sdgs_embeddings = model.encode(sdgs_documents, batch_size = 8, show_progress_bar = True)
np.shape(sdgs_embeddings)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

(17, 768)

# Find the similarity between the goals documents and the input documents

## Read the country article dictionary file

In [34]:
with open(country_articles_file_path, 'r') as file:
    country_articles_dict = json.load(file)

## Construct the documents embeddings for the input documents

In [35]:
countries = country_articles_dict.keys()
country_embedding_dict = dict()
for country in countries:
  print('Start constructing document embeddings for ', country)
  article_embeddings = []
  articles = country_articles_dict[country]
  for article in articles:
    article_title = article['title']
    article_file_path = os.path.join(corpus_directory, article['path'])
    with open(article_file_path, 'r', encoding='utf-8') as file:
      content = file.read()
      file.close()
    article_embedding = model.encode(content, show_progress_bar = False)
    article_embeddings.append(article_embedding)
  country_embedding_dict[country] = article_embeddings
  print('Finished constructing document embeddings for ', country)
print('Finished construct the document embeddings for all countries')
      
    

Start constructing document embeddings for  Jordan
Finished constructing document embeddings for  Jordan
Start constructing document embeddings for  Lebanon
Finished constructing document embeddings for  Lebanon
Start constructing document embeddings for  Turkey
Finished constructing document embeddings for  Turkey
Start constructing document embeddings for  Syria
Finished constructing document embeddings for  Syria
Start constructing document embeddings for  Egypt
Finished constructing document embeddings for  Egypt
Start constructing document embeddings for  Iraq
Finished constructing document embeddings for  Iraq
Start constructing document embeddings for  Saudi Arabia
Finished constructing document embeddings for  Saudi Arabia
Start constructing document embeddings for  Yemen
Finished constructing document embeddings for  Yemen
Start constructing document embeddings for  Cyprus
Finished constructing document embeddings for  Cyprus
Start constructing document embeddings for  Qatar
F

# Find Similarity between DSGs articles and country articles

## Find cosine similarity

In [36]:
country_similarities = dict()
for country in country_embedding_dict:
  article_embeddings = country_embedding_dict[country]
  #for article_embedding in article_embeddings:
  similarities = cosine_similarity(article_embeddings, sdgs_embeddings)
  country_similarities[country] = similarities

## Print the similarities to file

In [37]:
if not os.path.exists(similarity_output_directory):
    os.makedirs(similarity_output_directory)
for country in country_similarities.keys():
    similarity_file_path = os.path.join(similarity_output_directory,country+'_similarity'+'.csv')
    np.savetxt(fname=similarity_file_path, X=country_similarities[country], delimiter=',')

## Get the candidate documents for each SDG

In [54]:
sdgs_similar_documents_dict = dict()
for goal_index in range(len(sdgs)):
  # get documents with similarity >= 0.5
  sdg_similar_documents_dict = dict()
  for country in countries:
    sdg_similar_documents = []
    similarities = country_similarities[country]
    #for document_index in range(len(similarities[goal_index])):
    for document_index in range(len(similarities)):
      if similarities[document_index][goal_index]>=similarity_threshold:
        sdg_similar_documents.append(document_index)
    sdg_similar_documents_dict[country]=sdg_similar_documents
  sdgs_similar_documents_dict[sdgs[goal_index]] = sdg_similar_documents_dict
  

# Regression Model

## Construct the dataset for the sdg

In [39]:
def build_sdg_dataset(sdg, sdg_index, sdgs_similar_documents_dict, country_sdg_indicator_values):
  sdg_dataset = pd.DataFrame(columns=['document','label'])

  sdg_country_documents = sdgs_similar_documents_dict[sdg]
  row_index = 0
  for country in sdg_country_documents.keys():
    country_sdg_indicator_value = country_sdg_indicator_values[country][sdg_index-1]
    country_article_indecies = sdg_country_documents[country]
    country_articles = country_articles_dict[country]
    for article_index in country_article_indecies:
      article = country_articles[article_index]
      article_title = article['title']
      article_file_path = os.path.join(corpus_directory, article['path'])
      with open(article_file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        file.close()
      sdg_dataset.loc[row_index] = [content,country_sdg_indicator_value]
      row_index+=1
  return sdg_dataset
      
  

## Select the SDG to work on

In [40]:
sdg_index = 1
sdg = sdgs[sdg_index]

## Read the sdg indicator values

In [41]:
sdg_indicator_values = pd.read_csv(sdg_indicator_file_path, header=0).set_index('country').T.to_dict('list')
print(sdg_indicator_values)

{'Jordan': [0.157, nan, nan], 'Lebanon': [0.82, nan, nan], 'Turkey': [0.102, nan, nan], 'Syria': [0.9, nan, nan], 'Egypt': [0.273, nan, nan], 'Iraq': [0.25, nan, nan], 'Saudi Arabia': [0.2, nan, nan], 'Yemen': [0.55, nan, nan], 'Cyprus': [0.138, nan, nan], 'Qatar': [0.0, nan, nan], 'Oman': [0.0, nan, nan], 'Iran': [0.276, nan, nan], 'United Arab Emirates': [0.0, nan, nan], 'Kuwait': [0.0, nan, nan], 'Bahrain': [0.0, nan, nan], 'Morocco': [0.024, nan, nan], 'Tunisia': [0.034, nan, nan], 'Libya': [0.37, nan, nan], 'Sudan': [0.32, nan, nan], 'Mauritania': [0.063, nan, nan]}


  sdg_indicator_values = pd.read_csv(sdg_indicator_file_path, header=0).set_index('country').T.to_dict('list')


## Construct the dataset

In [55]:
sdg1_dataset = build_sdg_dataset(sdg, sdg_index, sdgs_similar_documents_dict, sdg_indicator_values)
print('Number of samples is ', sdg1_dataset.shape[0])
dataset = Dataset.from_pandas(sdg1_dataset,preserve_index=False)
dataset = dataset.train_test_split(test_size=0.3)
dataset['train']['document'][:5]

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_function(examples):
    return tokenizer(examples["document"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Number of samples is  15047


loading configuration file config.json from cache at /kuacc/users/asafa22/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.0",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /kuacc/users/asafa22/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /kuacc/users/asafa22/.cache/huggingfa

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

## Build the model

In [56]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)
model.resize_token_embeddings(len(tokenizer))


loading configuration file config.json from cache at /kuacc/users/asafa22/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /kuacc/users/asafa22/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba26

Embedding(30522, 768, padding_idx=0)

## Build the metrics

In [57]:
from datasets import load_metric


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

## Train the model

In [58]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=32,
                                  per_device_eval_batch_size=32,
                                  num_train_epochs=8,
                                  save_total_limit = 2,
                                  save_strategy = 'no',
                                  load_best_model_at_end=False
                                  )


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: document. If document are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10532
  Num Epochs = 8
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1320
  Number of trainable parameters = 66954241


Epoch,Training Loss,Validation Loss,Rmse
1,0.0119,0.006321,0.079508
2,0.0065,0.005905,0.076846
3,0.0052,0.006052,0.077795
4,0.0041,0.005511,0.074234
5,0.0039,0.005956,0.077177
6,0.0034,0.005794,0.076113
7,0.003,0.00579,0.076092
8,0.0028,0.00588,0.076679


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: document. If document are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4515
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: document. If document are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4515
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: document. If document are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation

TrainOutput(global_step=1320, training_loss=0.005107356111208597, metrics={'train_runtime': 2269.7128, 'train_samples_per_second': 37.122, 'train_steps_per_second': 0.582, 'total_flos': 1.1160974097481728e+16, 'train_loss': 0.005107356111208597, 'epoch': 8.0})

## Save the model

In [59]:
# save the model/tokenizer

model.save_pretrained("model")
tokenizer.save_pretrained("tokenizer")



# load the model/tokenizer

from transformers import AutoModelForTokenClassification
model = AutoModelForSequenceClassification.from_pretrained("model")
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

Configuration saved in model/config.json
Model weights saved in model/pytorch_model.bin
tokenizer config file saved in tokenizer/tokenizer_config.json
Special tokens file saved in tokenizer/special_tokens_map.json
loading configuration file model/config.json
Model config DistilBertConfig {
  "_name_or_path": "model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "regression",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.0",
  "vocab_size": 30522
}

loading weights file model/pytorch_model.bin
All mod