# Load dataset
---

In [1]:
import datasets
from xsum_dataset import XsumDataset

In [2]:
xsum_data_raw = datasets.load_dataset("xsum")

Using custom data configuration default
Reusing dataset xsum (/home/wk247/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# train/val/test data
xsum_train_data = XsumDataset(xsum_data_raw["train"])
xsum_val_data = XsumDataset(xsum_data_raw["validation"])
xsum_test_data = XsumDataset(xsum_data_raw["test"])

# concat data
xsum_data_raw_cc = datasets.concatenate_datasets(
    [xsum_data_raw["train"], xsum_data_raw["validation"], xsum_data_raw["test"]]
    )
xsum_concat_data = XsumDataset(xsum_data_raw_cc)

# NER
---

In [4]:
from ner_utils import *

In [5]:
import spacy
#NER = spacy.load("en_core_web_lg")
NER = spacy.load("en_core_web_trf")
cache_dir = "../cache_trf"

# from collections import Counter
from tqdm.notebook import tqdm

## 1) create ner / entities list

1) train

In [6]:
_, _, train_doc_ents_list, train_sum_ents_list = tag_dataset(tagger=NER,
                                                             xsum_dataset=xsum_train_data.dataset, 
                                                             return_ner_list=False)

  0%|                                        | 24/204045 [00:10<25:41:22,  2.21it/s]


KeyboardInterrupt: 

In [10]:
# save to cache directory
save_to_cache_dir(
    train_doc_ents_list, 
    "train_doc_ents_list",
    cache_dir)

save_to_cache_dir(
    train_sum_ents_list, 
    "train_sum_ents_list",
    cache_dir)

saved to '../cache/train_doc_ents_list.pkl'
saved to '../cache/train_sum_ents_list.pkl'


2) val

In [None]:
_, _, val_doc_ents_list, val_sum_ents_list = tag_dataset(tagger=NER,
                                                         xsum_dataset=xsum_val_data.dataset, 
                                                         return_ner_list=False)

In [None]:
val_doc_ner_list, val_sum_ner_list, val_doc_ents_list, val_sum_ents_list = tag_dataset(tagger=NER,
                                                         xsum_dataset=xsum_val_data.dataset, 
                                                         return_ner_list=True)

In [11]:
# save_to_cache_dir(val_doc_ner_list, "val_doc_ner_list")
# save_to_cache_dir(val_sum_ner_list, "val_sum_ner_list")
save_to_cache_dir(val_doc_ents_list, "val_doc_ents_list")
save_to_cache_dir(val_sum_ents_list, "val_sum_ents_list")

saved to '../cache_trf/val_doc_ents_list.pkl'
saved to '../cache_trf/val_sum_ents_list.pkl'


3) test

In [None]:
_, _, test_doc_ents_list, test_sum_ents_list = tag_dataset(tagger=NER,
                                                         xsum_data=xsum_test_data, 
                                                         return_ner_list=False)

In [None]:
test_doc_ner_list, test_sum_ner_list, test_doc_ents_list, test_sum_ents_list = tag_dataset(tagger=NER,
                                                         xsum_data=xsum_test_data, 
                                                         return_ner_list=True)

In [14]:
# save_to_cache_dir(test_doc_ner_list, "test_doc_ner_list")
# save_to_cache_dir(test_sum_ner_list, "test_sum_ner_list")
save_to_cache_dir(test_doc_ents_list, "test_doc_ents_list")
save_to_cache_dir(test_sum_ents_list, "test_sum_ents_list")

saved to '../cache_trf/test_doc_ents_list.pkl'
saved to '../cache_trf/test_sum_ents_list.pkl'


## 2) create the pool of entities

* label list
    * CARDINAL: Numerals that do not fall under another type
    * DATE: Absolute or relative dates or periods
    * EVENT: Named hurricanes, battles, wars, sports events, etc.
    * FAC: Buildings, airports, highways, bridges, etc.
    * GPE: Countries, cities, states
    * LANGUAGE: Any named language
    * LAW: Named documents made into laws.
    * LOC: Non-GPE locations, mountain ranges, bodies of water
    * MONEY: Monetary values, including unit
    * NORP: Nationalities or religious or political groups
    * ORDINAL: "first", "second", etc.
    * ORG: Companies, agencies, institutions, etc.
    * PERCENT: Percentage, including "%"
    * PERSON: People, including fictional
    * PRODUCT: Objects, vehicles, foods, etc. (not services)
    * QUANTITY: Measurements, as of weight or distance
    * TIME: Times smaller than a day
    * WORK_OF_ART: Titles of books, songs, etc.

In [6]:
ALL_LABELS = list(NER.get_pipe('ner').labels)

1) test entities pool

In [7]:
# # load entities list
test_doc_ents_list = load_from_cache_dir("test_doc_ents_list", cache_dir)

'../cache_trf/test_doc_ents_list.pkl' loaded


In [10]:
test_ent_pool_dict = create_ent_pool_dict(test_doc_ents_list, ALL_LABELS)

100%|████████████████████████████████████████| 11334/11334 [00:34<00:00, 326.24it/s]

label: CARDINAL, count: 5114
label: DATE, count: 10843
label: EVENT, count: 1955
label: FAC, count: 4515
label: GPE, count: 5965
label: LANGUAGE, count: 83
label: LAW, count: 645
label: LOC, count: 1612
label: MONEY, count: 4102
label: NORP, count: 1338
label: ORDINAL, count: 175
label: ORG, count: 20547
label: PERCENT, count: 1402
label: PERSON, count: 40806
label: PRODUCT, count: 1552
label: QUANTITY, count: 2459
label: TIME, count: 3429
label: WORK_OF_ART, count: 3222





In [13]:
save_to_cache_dir(test_ent_pool_dict, "test_ent_pool_dict", cache_dir)

saved to '../cache_trf/test_ent_pool_dict.pkl'


2) concat entities pool

In [7]:
# # load entities lists
# train_doc_ents_list = load_from_cache_dir("train_doc_ents_list", cache_dir=cache_dir)
val_doc_ents_list = load_from_cache_dir("val_doc_ents_list", cache_dir=cache_dir)
test_doc_ents_list = load_from_cache_dir("test_doc_ents_list", cache_dir=cache_dir)

'../cache_trf/val_doc_ents_list.pkl' loaded
'../cache_trf/test_doc_ents_list.pkl' loaded


In [8]:
concat_doc_ents_list = [*val_doc_ents_list, *test_doc_ents_list]  # *train_doc_ents_list, 
print(len(concat_doc_ents_list))

22666


In [9]:
concat_ent_pool_dict = create_ent_pool_dict(concat_doc_ents_list, ALL_LABELS)

100%|████████████████████████████████████| 22666/22666 [02:25<00:00, 155.55it/s]


label: CARDINAL, count: 8328
label: DATE, count: 17273
label: EVENT, count: 3322
label: FAC, count: 8347
label: GPE, count: 9341
label: LANGUAGE, count: 113
label: LAW, count: 1172
label: LOC, count: 2719
label: MONEY, count: 7430
label: NORP, count: 1920
label: ORDINAL, count: 229
label: ORG, count: 34501
label: PERCENT, count: 2160
label: PERSON, count: 67832
label: PRODUCT, count: 2672
label: QUANTITY, count: 4329
label: TIME, count: 5742
label: WORK_OF_ART, count: 5809


In [11]:
save_to_cache_dir(concat_ent_pool_dict, "val_test_ent_pool_dict", cache_dir)

saved to '../cache_trf/val_test_ent_pool_dict.pkl'


### 2-1) Preprocess entity pool

In [12]:
import numpy as np

In [38]:
# entities pool
concat_ent_pool_dict = load_from_cache_dir("concat_ent_pool_dict")

'../cache/concat_ent_pool_dict.pkl' loaded


In [13]:
FILTER_LABELS = ["PERSON", "FAC", "GPE", "NORP", "LOC", "EVENT", "LANGUAGE", "LAW", "ORG"]

In [14]:
ALL_LABELS = list(NER.get_pipe('ner').labels)

In [15]:
# a lil bit of humon labor
for label in FILTER_LABELS:
    print(f"\n========= {label} =========")
    for ent, count in list(concat_ent_pool_dict[label].items())[:100]:
        print(f"ent: {ent}, count:{count}" )


ent: David Cameron, count:367
ent: Trump, count:336
ent: Donald Trump, count:283
ent: Jones, count:259
ent: Theresa May, count:258
ent: Barack Obama, count:235
ent: Obama, count:234
ent: Cameron, count:222
ent: Williams, count:194
ent: Smith, count:192
ent: Johnson, count:166
ent: George Osborne, count:158
ent: May, count:154
ent: Jeremy Corbyn, count:150
ent: Davies, count:150
ent: Taylor, count:142
ent: Vladimir Putin, count:139
ent: James, count:134
ent: Nicola Sturgeon, count:132
ent: Hillary Clinton, count:130
ent: Corbyn, count:129
ent: Brown, count:127
ent: Murray, count:122
ent: Evans, count:112
ent: Hamilton, count:110
ent: Lewis, count:110
ent: David, count:107
ent: Boris Johnson, count:106
ent: Bashar al-Assad, count:102
ent: Putin, count:94
ent: Morgan, count:93
ent: Angela Merkel, count:93
ent: Wilson, count:93
ent: Carwyn Jones, count:92
ent: Cook, count:91
ent: John, count:91
ent: Hughes, count:88
ent: Anderson, count:88
ent: Ed Miliband, count:87
ent: Clinton, count:87

In [42]:
CORRECT_PAIRS = [("Brexit", "LAW"),
                  ("Twitter", "ORG"),
                  ("Â£2.5", None),
                  ("Championship", "EVENT"),
                  ("Euro 2016", "EVENT"),
                  ("Formula 1", "EVENT"),
                  ("Challenge Cup", "EVENT"),
                  ("French Open", "EVENT"),
                  ("PhD", None),
                  ("Ofqual", "ORG"),
                  ("headliners", None),
                  ("TfL", "ORG"),
                  ("Dembele", "PERSON"),
                  ("Worlds", "EVENT"),
                  ("the FA Trophy", "EVENT"),
                  ("DfE", "ORG")]

In [43]:
def check_labeling(ent_pool_dict, correct_pairs, all_labels):
    for ent, correct_label in correct_pairs:
        print(f"======={ent}=======")
    
        # find occurences
        for label in all_labels:
            ent_dict = ent_pool_dict[label]
            if ent in ent_dict.keys():
                print(f"label: {label}, count: {ent_dict[ent]}")
        print()

In [44]:
def fix_labeling_inplace(ent_pool_dict, correct_pairs):
    for ent, correct_label in correct_pairs:
        occurences = []

        # find occurences
        for label in ent_pool_dict.keys():
            ent_dict = ent_pool_dict[label]
            if ent in ent_dict.keys():
                occurences.append((label, ent_dict[ent]))

        # assign to right dict
        fix_flag = False
        
        correct_count = np.sum([count for (_, count) in occurences])
        for (label, _) in occurences:
            if label == correct_label:  # update count
                ent_pool_dict[label][ent] = correct_count
                fix_flag = True
            else:
                del ent_pool_dict[label][ent]

        # correct label not in occurences
        if not fix_flag:
            ent_pool_dict[label][ent] = correct_count

In [45]:
check_labeling(ent_pool_dict=concat_ent_pool_dict,
               correct_pairs=CORRECT_PAIRS,
               all_labels=ALL_LABELS)

label: FAC, count: 3
label: GPE, count: 19
label: MONEY, count: 1
label: ORG, count: 49
label: PERSON, count: 4563
label: WORK_OF_ART, count: 33

label: FAC, count: 4
label: NORP, count: 5
label: ORG, count: 9357
label: PERSON, count: 1148
label: PRODUCT, count: 3483
label: WORK_OF_ART, count: 11

label: CARDINAL, count: 5
label: LANGUAGE, count: 1
label: NORP, count: 37
label: PRODUCT, count: 4

label: DATE, count: 5
label: EVENT, count: 338
label: GPE, count: 8
label: LAW, count: 266
label: LOC, count: 88
label: ORG, count: 1205
label: PERSON, count: 8
label: PRODUCT, count: 2

label: DATE, count: 30
label: EVENT, count: 24
label: LAW, count: 235
label: WORK_OF_ART, count: 2

label: LAW, count: 58
label: ORG, count: 4

label: EVENT, count: 152
label: LAW, count: 21
label: ORG, count: 8
label: PRODUCT, count: 1
label: WORK_OF_ART, count: 4

label: EVENT, count: 86
label: FAC, count: 6
label: ORG, count: 9
label: WORK_OF_ART, count: 62

label: ORG, count: 1
label: WORK_OF_ART, count: 3

In [46]:
fix_labeling_inplace(ent_pool_dict=concat_ent_pool_dict,
                     correct_pairs=CORRECT_PAIRS)

In [47]:
check_labeling(ent_pool_dict=concat_ent_pool_dict,
               correct_pairs=CORRECT_PAIRS,
               all_labels=ALL_LABELS)

label: WORK_OF_ART, count: 4668

label: ORG, count: 14008

label: PRODUCT, count: 47

label: EVENT, count: 1920

label: EVENT, count: 291

label: ORG, count: 62

label: EVENT, count: 186

label: EVENT, count: 163

label: WORK_OF_ART, count: 328

label: ORG, count: 124

label: WORK_OF_ART, count: 63

label: ORG, count: 486

label: PERSON, count: 185

label: WORK_OF_ART, count: 127

label: WORK_OF_ART, count: 29

label: ORG, count: 221



In [27]:
# sort dicts by count
for label in ALL_LABELS:
    concat_ent_pool_dict[label] = dict(sorted(concat_ent_pool_dict[label].items(), key=lambda item: item[1], reverse=True))

In [29]:
save_to_cache_dir(concat_ent_pool_dict, "concat_ent_pool_dict_pp")

saved to '../cache/concat_ent_pool_dict_pp.pkl'


## 3) preprocess duplicate entities

In [21]:
import numpy as np
import copy

In [22]:
# # train
# train_doc_ents_list = load_from_cache_dir("train_doc_ents_list")
# train_sum_ents_list = load_from_cache_dir("train_sum_ents_list")

# # val
# val_doc_ner_list = load_from_cache_dir("val_doc_ner_list")
# val_sum_ner_list = load_from_cache_dir("val_sum_ner_list")
val_doc_ents_list = load_from_cache_dir("val_doc_ents_list", cache_dir)
val_sum_ents_list = load_from_cache_dir("val_sum_ents_list", cache_dir)

# test
# test_doc_ner_list = load_from_cache_dir("test_doc_ner_list")
# test_sum_ner_list = load_from_cache_dir("test_sum_ner_list")
test_doc_ents_list = load_from_cache_dir("test_doc_ents_list", cache_dir)
test_sum_ents_list = load_from_cache_dir("test_sum_ents_list", cache_dir)

# entities pool
concat_ent_pool_dict = load_from_cache_dir("concat_ent_pool_dict", cache_dir)
# test_ent_pool_dict = load_from_cache_dir("test_ent_pool_dict")

'../cache_trf/val_doc_ents_list.pkl' loaded
'../cache_trf/val_sum_ents_list.pkl' loaded
'../cache_trf/test_doc_ents_list.pkl' loaded
'../cache_trf/test_sum_ents_list.pkl' loaded


In [32]:
def count_duplicates(ents_list):
    count = 0
    for c in ents_list:
        ents = [ent for ((ent, label), count) in c.items()]
        if len(ents) > len(set(ents)):
            count += 1
    return count

In [33]:
def remove_duplicates(src_ents_list, ent_pool=concat_ent_pool_dict):
    for c in tqdm(src_ents_list):
        ents = [ent for ((ent, label), count) in c.items()]

        # check duplicates and save the indices    
        dup_ents = {}
        for ent_idx, ((ent, label), count) in enumerate(c.items()):
            if ents.count(ent) > 1: # duplicate
                if ent not in dup_ents.keys():
                    dup_ents[ent] = {"ent_idx": [ent_idx],
                                     "label": [label],
                                     "count": [count]}
                else:
                    dup_ents[ent]["ent_idx"].append(ent_idx)
                    dup_ents[ent]["label"].append(label)
                    dup_ents[ent]["count"].append(count)

        if len(dup_ents) == 0: # no duplicates
            continue

        # if duplicates exist, remove them
        for ent, ent_info in dup_ents.items():
            # print("\nduplicate entity:", ent)
            # print("duplicate entity info:", ent_info)
            counts = np.array(ent_info["count"])
            unique_max_count = True if len(np.flatnonzero(counts == np.max(counts))) == 1 else False

            # if there is a dominent label, unify to it
            # else, (all labels have the same counts), check ent_pool and choose the most popular one
            if unique_max_count:
                # print("unique max count exists")
                survive_idx = counts.argmax()
            else:
                # find the most popular one from entity pool
                # print("choose from pool")
                labels = ent_info["label"]
                pool_counts = np.array([Counter(ent_pool[label])[ent] for label in labels])
                # print("pool_counts", pool_counts)
                survive_idx = pool_counts.argmax()
                
            
            # remove duplicates
            survived = (ent, ent_info["label"][survive_idx])
            absorbed_list = [(ent, ent_info["label"][absorb_idx])
                             for absorb_idx in range(len(counts)) if absorb_idx != survive_idx]

            # print(survived, c[survived])
            # print(absorbed_list)
            for absorbed in absorbed_list:
                c[survived] += c[absorbed]
                del c[absorbed]

In [34]:
print(f"val document: duplicate count - {count_duplicates(val_doc_ents_list)}") 
print(f"val summary: duplicate count - {count_duplicates(val_sum_ents_list)}") 
print(f"test document: duplicate count - {count_duplicates(test_doc_ents_list)}") 
print(f"test document: duplicate count - {count_duplicates(test_sum_ents_list)}") 

val document: duplicate count - 2161
val summary: duplicate count - 3
test document: duplicate count - 2127
test document: duplicate count - 2


In [35]:
val_doc_ents_list_no_dup = copy.deepcopy(val_doc_ents_list)
val_sum_ents_list_no_dup = copy.deepcopy(val_sum_ents_list)
test_doc_ents_list_no_dup = copy.deepcopy(test_doc_ents_list)
test_sum_ents_list_no_dup = copy.deepcopy(test_sum_ents_list)

In [36]:
remove_duplicates(val_doc_ents_list_no_dup)
remove_duplicates(val_sum_ents_list_no_dup)
remove_duplicates(test_doc_ents_list_no_dup)
remove_duplicates(test_sum_ents_list_no_dup)

  0%|          | 0/11332 [00:00<?, ?it/s]

  0%|          | 0/11332 [00:00<?, ?it/s]

  0%|          | 0/11334 [00:00<?, ?it/s]

  0%|          | 0/11334 [00:00<?, ?it/s]

In [37]:
print(f"val document: duplicate count - {count_duplicates(val_doc_ents_list_no_dup)}") 
print(f"val summary: duplicate count - {count_duplicates(val_sum_ents_list_no_dup)}") 
print(f"test document: duplicate count - {count_duplicates(test_doc_ents_list_no_dup)}") 
print(f"test document: duplicate count - {count_duplicates(test_sum_ents_list_no_dup)}") 

val document: duplicate count - 0
val summary: duplicate count - 0
test document: duplicate count - 0
test document: duplicate count - 0


In [38]:
save_to_cache_dir(val_doc_ents_list_no_dup, "val_doc_ents_list_no_dup", cache_dir)
save_to_cache_dir(val_sum_ents_list_no_dup, "val_sum_ents_list_no_dup", cache_dir)
save_to_cache_dir(test_doc_ents_list_no_dup, "test_doc_ents_list_no_dup", cache_dir)
save_to_cache_dir(test_sum_ents_list_no_dup, "test_sum_ents_list_no_dup", cache_dir)

saved to '../cache_trf/val_doc_ents_list_no_dup.pkl'
saved to '../cache_trf/val_sum_ents_list_no_dup.pkl'
saved to '../cache_trf/test_doc_ents_list_no_dup.pkl'
saved to '../cache_trf/test_sum_ents_list_no_dup.pkl'


### 3-1) Preprocess entities list

In [12]:
import copy

In [8]:
CORRECT_PAIRS = [("Brexit", "LAW"),
                  ("Twitter", "ORG"),
                  ("Â£2.5", None),
                  ("Championship", "EVENT"),
                  ("Euro 2016", "EVENT"),
                  ("Formula 1", "EVENT"),
                  ("Challenge Cup", "EVENT"),
                  ("French Open", "EVENT"),
                  ("PhD", None),
                  ("Ofqual", "ORG"),
                  ("headliners", None),
                  ("TfL", "ORG"),
                  ("Dembele", "PERSON"),
                  ("Worlds", "EVENT"),
                  ("the FA Trophy", "EVENT"),
                  ("DfE", "ORG")]

In [18]:
# # train
# train_doc_ents_list = load_from_cache_dir("train_doc_ents_list")
# train_sum_ents_list = load_from_cache_dir("train_sum_ents_list")

# # val
# val_doc_ner_list = load_from_cache_dir("val_doc_ner_list")
# val_sum_ner_list = load_from_cache_dir("val_sum_ner_list")
val_doc_ents_list = load_from_cache_dir("val_doc_ents_list_no_dup") # no duplicate
val_sum_ents_list = load_from_cache_dir("val_sum_ents_list_no_dup")

# test
# test_doc_ner_list = load_from_cache_dir("test_doc_ner_list")
# test_sum_ner_list = load_from_cache_dir("test_sum_ner_list")
test_doc_ents_list = load_from_cache_dir("test_doc_ents_list_no_dup")
test_sum_ents_list = load_from_cache_dir("test_sum_ents_list_no_dup")

# entities pool
concat_ent_pool_dict = load_from_cache_dir("concat_ent_pool_dict_pp") # preprocessed
# test_ent_pool_dict = load_from_cache_dir("test_ent_pool_dict")

'../cache/val_doc_ents_list_no_dup.pkl' loaded
'../cache/val_sum_ents_list_no_dup.pkl' loaded
'../cache/test_doc_ents_list_no_dup.pkl' loaded
'../cache/test_sum_ents_list_no_dup.pkl' loaded
'../cache/concat_ent_pool_dict_pp.pkl' loaded


In [19]:
def fix_ents_list(src_ents_list, correct_pairs):    
    replace_count = 0
    delete_count = 0
    
    # new list
    new_src_ents_list = []
    
    # entities to replace or delete
    fix_ents = [ent for (ent, correct_label) in correct_pairs]
    replace_dict = {ent: correct_label for (ent, correct_label) in correct_pairs if correct_label is not None}
    replace_ents = replace_dict.keys()
    
    # enumerate over the list
    for c in tqdm(src_ents_list):
        new_c = copy.deepcopy(c)
        for ((ent, label), count) in c.items():
            if ent in fix_ents:
                if ent in replace_ents:  # replace
                    correct_label = replace_dict[ent]
                    if label == correct_label:  # correct label
                        pass
                    else:  # incorrect label
                        del new_c[(ent, label)]
                        new_c.update({(ent, correct_label): count})
                        replace_count += 1
                else: # delete
                    del new_c[(ent, label)]
                    delete_count += 1
        new_src_ents_list.append(new_c)
    
    print("replace_count:", replace_count)
    print("delete_count:", delete_count)
    
    return new_src_ents_list

In [20]:
val_doc_ents_list_pp = fix_ents_list(val_doc_ents_list, CORRECT_PAIRS)
val_sum_ents_list_pp = fix_ents_list(val_sum_ents_list, CORRECT_PAIRS)
test_doc_ents_list_pp = fix_ents_list(test_doc_ents_list, CORRECT_PAIRS)
test_sum_ents_list_pp = fix_ents_list(test_sum_ents_list, CORRECT_PAIRS)

  0%|          | 0/11332 [00:00<?, ?it/s]

replace_count: 548
delete_count: 23


  0%|          | 0/11332 [00:00<?, ?it/s]

replace_count: 108
delete_count: 0


  0%|          | 0/11334 [00:00<?, ?it/s]

replace_count: 562
delete_count: 23


  0%|          | 0/11334 [00:00<?, ?it/s]

replace_count: 109
delete_count: 0


In [21]:
save_to_cache_dir(val_doc_ents_list_pp, "val_doc_ents_list_pp")
save_to_cache_dir(val_sum_ents_list_pp, "val_sum_ents_list_pp")
save_to_cache_dir(test_doc_ents_list_pp, "test_doc_ents_list_pp")
save_to_cache_dir(test_sum_ents_list_pp, "test_sum_ents_list_pp")

saved to '../cache/val_doc_ents_list_pp.pkl'
saved to '../cache/val_sum_ents_list_pp.pkl'
saved to '../cache/test_doc_ents_list_pp.pkl'
saved to '../cache/test_sum_ents_list_pp.pkl'
