# Phrases

Process the phrases that appear in a relation.

In [135]:
import copy
import json
import pandas as pd

## 1. Read the data

In [6]:
DATA_FILE = "../../data/femke.jsonl"

In [163]:
json_data=[]
infile = open(DATA_FILE, "r")
for line in infile:
    json_data.append(json.loads(line))
infile.close()

## 2. Give an example of the data

In [142]:
json_data[0]

{'id': 6660,
 'data': 'Today I want to send a clear message to the people of this great country, of Greece. I know that many people feel without hope. Many are making extremely difficult sacrifices. And many people ask why they should do more. I understand those concerns. And I agree that some of the efforts seem unfair. But I ask people to recognise the other alternatives which will be much more difficult for Greece and will affect even more the most vulnerable in the Greek society. So this is why it is the right approach to ask Greece to reform, to increase its competitiveness to have a viable future, irrespective of the crisis. You, in Greece, with our support, need to rebuild your country, your structures, your administration, your economy to increase the competitiveness of Greece. And the best hope of a return to growth and job creation is inside the euro area. Staying in the euro is the best chance to avoid worse hardship and difficulties to the Greek people, namely for those in 

## 3. Count the frequency of the labels in each data item

In [31]:
def get_patterns(json_data):
    base_cases = []
    results = {}
    for data in json_data:
        key = [0, 0, 0]
        for label in data["label"]:
            if label[2] == 'Content_Concept_1':
                key[0] += 1
            elif label[2] == 'Content_Relation_Explanation':
                key[1] += 1
            elif label[2] == 'Content_Concept_2':
                key[2] += 1
            else:
                print("cannot happen")
        for i in range(0, len(key)):
            key[i] = str(key[i])
        results_key = " ".join(key)
        if not results_key in results:
            results[results_key] = 1
        else:
            results[results_key] += 1
        if results_key == "1 1 1":
             base_cases.append(data)
    return [results, base_cases]

results, base_cases = get_patterns(json_data)
print(results)

{'1 1 1': 1400, '2 1 1': 230, '1 1 2': 136, '2 1 2': 24, '1 2 1': 2, '1 0 0': 3, '1 1 0': 5, '0 1 1': 5, '3 1 1': 30, '1 1 3': 13, '2 1 0': 3, '4 1 1': 4, '0 1 3': 1, '3 1 2': 3, '2 1 3': 3, '5 1 1': 1, '1 1 4': 1, '4 1 2': 3}


Some data items have more than two content concepts because of split phrases.

## 4. Convert base cases (1,1,1) to table

In [32]:
len(base_cases)

1400

In [33]:
base_cases[0]

{'id': 6660,
 'data': 'Today I want to send a clear message to the people of this great country, of Greece. I know that many people feel without hope. Many are making extremely difficult sacrifices. And many people ask why they should do more. I understand those concerns. And I agree that some of the efforts seem unfair. But I ask people to recognise the other alternatives which will be much more difficult for Greece and will affect even more the most vulnerable in the Greek society. So this is why it is the right approach to ask Greece to reform, to increase its competitiveness to have a viable future, irrespective of the crisis. You, in Greece, with our support, need to rebuild your country, your structures, your administration, your economy to increase the competitiveness of Greece. And the best hope of a return to growth and job creation is inside the euro area. Staying in the euro is the best chance to avoid worse hardship and difficulties to the Greek people, namely for those in 

In [34]:
table = []
for data_in in base_cases:
    data_out = [data_in["data"], "", "", ""]
    for label_data in data_in["label"]:
        data_out_id = -1
        if label_data[2] == 'Content_Concept_1':
            data_out_id = 1
        elif label_data[2] == 'Content_Relation_Explanation':
            data_out_id = 2
        elif label_data[2] == 'Content_Concept_2':
            data_out_id = 3
        else:
            print(f"unexpected label data: {label_data}")
        if data_out[data_out_id] != "":
            print("duplicate data in label_data: {data_in['label']}")
        data_out[data_out_id] = data_in["data"][label_data[0]:label_data[1]]
    table.append(data_out)

In [35]:
table[0]

['Today I want to send a clear message to the people of this great country, of Greece. I know that many people feel without hope. Many are making extremely difficult sacrifices. And many people ask why they should do more. I understand those concerns. And I agree that some of the efforts seem unfair. But I ask people to recognise the other alternatives which will be much more difficult for Greece and will affect even more the most vulnerable in the Greek society. So this is why it is the right approach to ask Greece to reform, to increase its competitiveness to have a viable future, irrespective of the crisis. You, in Greece, with our support, need to rebuild your country, your structures, your administration, your economy to increase the competitiveness of Greece. And the best hope of a return to growth and job creation is inside the euro area. Staying in the euro is the best chance to avoid worse hardship and difficulties to the Greek people, namely for those in a more vulnerable pos

In [36]:
pd.DataFrame(table, columns=["Paragraph", "Content_Concept_1", "Content_Relation_Explanation", "Content_Concept_2"])

Unnamed: 0,Paragraph,Content_Concept_1,Content_Relation_Explanation,Content_Concept_2
0,Today I want to send a clear message to the pe...,"You, in Greece, with our support, need to rebu...",to,increase the competitiveness of Greece
1,Today I want to send a clear message to the pe...,"You, in Greece, with our support, need to rebu...",And the best hope of a,return to growth
2,"To conclude, let me say a few words on the eur...","We have taken important, fundamental decisions",to safeguard,the stability of the euro area
3,"To conclude, let me say a few words on the eur...",We need sustained efforts and determination,As we said there will not be,magic solutions
4,Giving to the ECB the ultimate responsibility ...,confidence between the banks,and in this way,increase the financial stability in the euro area
...,...,...,...,...
1395,But today I want to focus on our economic prio...,cut business taxes,You've got to,succeed
1396,But today I want to focus on our economic prio...,tackle the bloat in welfare,You've got to,succeed
1397,But today I want to focus on our economic prio...,make sure your schools and your universities a...,and crucially you've got to,succeed
1398,Now yesterday I gave a speech setting out the ...,When you have a single currency,you move inexorably towards,a banking union


## 5. Combine all data of duplicate paragraps

In [173]:
combined_data = {}
for data in json_data:
    key = f"{data['source_id']} {data['speech_id']} {data['paragraph_id']}"
    if key not in combined_data:
        combined_data[key] = copy.deepcopy(data)
    else:
        if len(data["data"]) != len(combined_data[key]["data"]):
            print("cannot happen")
        for label_data in data["label"]:
            if label_data not in combined_data[key]["label"]:
                combined_data[key]["label"].append(label_data)
                if combined_data[key]["label"][-1][1] > len(combined_data[key]["data"]):
                    combined_data[key]["label"][-1][1] = len(combined_data[key]["data"])

for key in combined_data:
    for label_data in combined_data[key]["label"]:
        label_data.append(combined_data[key]["data"][label_data[0]:label_data[1]])

In [139]:
len(combined_data)

526

In [140]:
results, base_cases = get_patterns(list(combined_data.values()))
print(results)

{'5 2 4': 3, '2 2 2': 52, '1 1 1': 95, '3 2 3': 10, '3 2 2': 24, '5 3 3': 3, '5 2 2': 6, '5 4 6': 5, '4 3 3': 5, '1 2 2': 5, '8 2 4': 1, '2 3 3': 1, '8 5 8': 1, '2 1 1': 30, '1 1 2': 22, '3 3 3': 15, '2 1 2': 8, '9 5 6': 1, '3 3 4': 8, '3 1 2': 5, '2 2 6': 4, '4 2 2': 11, '3 1 1': 12, '4 2 3': 8, '8 3 4': 2, '6 4 4': 3, '2 2 4': 5, '4 1 1': 5, '5 1 1': 4, '14 6 7': 1, '13 4 4': 1, '3 4 4': 2, '1 1 3': 7, '6 2 2': 5, '7 5 5': 2, '4 3 4': 7, '7 4 4': 2, '5 1 2': 1, '2 3 4': 2, '6 4 5': 1, '4 4 4': 5, '3 5 8': 1, '1 3 4': 1, '2 2 3': 9, '6 5 5': 2, '8 5 5': 1, '4 2 1': 2, '7 1 2': 1, '1 1 4': 3, '3 2 5': 2, '6 3 4': 2, '8 3 9': 2, '3 3 5': 1, '6 4 6': 4, '4 2 4': 5, '4 4 5': 1, '10 9 11': 1, '15 11 16': 1, '11 5 6': 1, '1 1 5': 3, '10 3 4': 1, '1 1 8': 1, '8 3 11': 1, '1 2 3': 2, '14 5 9': 1, '3 2 4': 2, '7 5 6': 1, '4 5 6': 1, '7 2 2': 2, '9 4 6': 2, '6 1 1': 2, '4 3 2': 1, '9 4 11': 1, '4 1 3': 3, '2 1 12': 1, '3 1 4': 2, '11 4 6': 1, '3 5 7': 1, '6 3 5': 1, '2 1 5': 1, '4 4 7': 1, '2 2

In [141]:
combined_data[list(combined_data.keys())[0]]

{'id': 6660,
 'data': 'Today I want to send a clear message to the people of this great country, of Greece. I know that many people feel without hope. Many are making extremely difficult sacrifices. And many people ask why they should do more. I understand those concerns. And I agree that some of the efforts seem unfair. But I ask people to recognise the other alternatives which will be much more difficult for Greece and will affect even more the most vulnerable in the Greek society. So this is why it is the right approach to ask Greece to reform, to increase its competitiveness to have a viable future, irrespective of the crisis. You, in Greece, with our support, need to rebuild your country, your structures, your administration, your economy to increase the competitiveness of Greece. And the best hope of a return to growth and job creation is inside the euro area. Staying in the euro is the best chance to avoid worse hardship and difficulties to the Greek people, namely for those in 

## 6. Make character labels

Several tokens have more than one label. We use the following labeling scheme:

* 1: Content_Concept_1
* 2: Content_Concept_2
* 3: both Content_Concept_1 and Content_Concept_2
* E: Content_Relation_Explanation
* F: both Content_Relation_Explanation and Content_Concept_1
* G: both Content_Relation_Explanation and Content_Concept_2
* \*: all three labels: Content_Relation_Explanation and Content_Concept_1 and Content_Concept_2
* .: no label

In [192]:
class Label_Clash:
    def __init__(self):
        self.data = {}
        
    def add(self, key):
        if key not in self.data:
            self.data[key] = 1
        else:
            self.data[key] += 1
            
    def print(self):
        print(self.data)

In [195]:
label_clash = Label_Clash()
for key in combined_data:
    combined_data[key]['labels'] = len(combined_data[key]["data"])*["."]
    for label in combined_data[key]["label"]:
        for i in range(label[0], label[1]):
            if label[2] == "Content_Concept_1":
                if combined_data[key]['labels'][i] != ".":
                    combined_data[key]['labels'][i] = "1"
                elif combined_data[key]['labels'][i] != "1":
                    combined_data[key]['labels'][i] = "1"
                elif combined_data[key]['labels'][i] != "E":
                    combined_data[key]['labels'][i] = "F"
                elif combined_data[key]['labels'][i] != "2":
                    combined_data[key]['labels'][i] = "3"
                elif combined_data[key]['labels'][i] != "3":
                    combined_data[key]['labels'][i] = "3"
                elif combined_data[key]['labels'][i] != "F":
                    combined_data[key]['labels'][i] = "F"
                elif combined_data[key]['labels'][i] != "G":
                    combined_data[key]['labels'][i] = "*"
                else:
                    print("cannot happen")
            elif label[2] == "Content_Relation_Explanation":
                if combined_data[key]['labels'][i] != ".":
                    combined_data[key]['labels'][i] = "E"
                elif combined_data[key]['labels'][i] != "1":
                    combined_data[key]['labels'][i] = "F"
                elif combined_data[key]['labels'][i] != "E":
                    combined_data[key]['labels'][i] = "E"
                elif combined_data[key]['labels'][i] != "2":
                    combined_data[key]['labels'][i] = "G"
                elif combined_data[key]['labels'][i] != "3":
                    combined_data[key]['labels'][i] = "*"
                elif combined_data[key]['labels'][i] != "F":
                    combined_data[key]['labels'][i] = "F"
                elif combined_data[key]['labels'][i] != "G":
                    combined_data[key]['labels'][i] = "G"
                else:
                    print("cannot happen")
            elif label[2] == "Content_Concept_2":
                if combined_data[key]['labels'][i] != ".":
                    combined_data[key]['labels'][i] = "2"
                elif combined_data[key]['labels'][i] != "1":
                    combined_data[key]['labels'][i] = "3"
                elif combined_data[key]['labels'][i] != "E":
                    combined_data[key]['labels'][i] = "G"
                elif combined_data[key]['labels'][i] != "2":
                    combined_data[key]['labels'][i] = "2"
                elif combined_data[key]['labels'][i] != "3":
                    combined_data[key]['labels'][i] = "3"
                elif combined_data[key]['labels'][i] != "F":
                    combined_data[key]['labels'][i] = "*"
                elif combined_data[key]['labels'][i] != "G":
                    combined_data[key]['labels'][i] = "G"
                else:
                    print("cannot happen")
            else:
                print(f"unknown label: {label}")

In [210]:
("".join(combined_data[list(combined_data.keys())[0]]["labels"]))[:933]

'........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................11111111111111111111111111111111111111111111111111111111111111..111111111111111..1111111111111111111..111111111111.FF.33333333333333333333333333333333333333..EEEEEEEEEEEEEEEEEEEEEE.2222222222222222.....222222222222.......................................................................................................'

## 7. Machine learning

In [1]:
from transformers import RobertaTokenizer, RobertaForTokenClassification
import torch

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForTokenClassification.from_pretrained('roberta-base')

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0)  # Batch size 1

outputs = model(**inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [5]:
outputs

TokenClassifierOutput(loss=tensor(0.6889, grad_fn=<NllLossBackward>), logits=tensor([[[ 0.0486, -0.2147],
         [-0.0511,  0.0683],
         [-0.0463,  0.2299],
         [-0.0694, -0.0103],
         [ 0.0008, -0.0916],
         [-0.1262,  0.2167],
         [ 0.1081,  0.1149],
         [ 0.0721, -0.2160]]], grad_fn=<AddBackward0>), hidden_states=None, attentions=None)