In [1]:
import pandas as pd
import json
import numpy as np
from tqdm.auto import tqdm
import pickle5
import ast
from copy import deepcopy

In [2]:
# Directory holding the raw input CSV files; later cells build paths from it.
# (A `global` statement is a no-op at module/cell scope, so a plain assignment
# is all that is needed to make this name visible to every cell.)
input_dir = './input_data/'

In [3]:
# The CSV ships without a header row, so the column names are attached after
# loading. Later cells address some of these columns by position.
entity_csv_path = input_dir + 'project_entity.csv'
entity_csv_columns = ['pid', 'title', 'alt_title', 'criteria', 'description', 'alt_desc', 'pqs']

project_entity_data = pd.read_csv(entity_csv_path, header=None, lineterminator='\n')
project_entity_data.columns = entity_csv_columns

In [4]:
import pickle

# Load the pickled test labels and keep their keys as a list; a later cell
# compares project ids against `test_project`.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# from a trusted source.
with open('./evaluation/test_label.pkl', 'rb') as label_file:
    test_label = pickle.load(label_file)
test_project = [*test_label.keys()]

In [5]:
from nltk import ngrams, word_tokenize, sent_tokenize
from nltk.stem.porter import *
from bs4 import BeautifulSoup
import re
import spacy

# Blank English spaCy pipeline (created with spacy.blank, no packaged model
# is loaded here). It is called on text to produce tokens and its vocab is
# handed to the fuzzy matcher in a later cell.
nlp = spacy.blank("en")

def clean_text(tmp, parser=None):
    """Strip markup from ``tmp`` and tidy up its whitespace.

    Args:
        tmp: Markup or plain text (a ``str``) to clean.
        parser: Optional Beautiful Soup parser name (for example
            ``"html.parser"``), forwarded as ``features``. The default
            ``None`` keeps the previous behaviour, where Beautiful Soup picks
            a parser on its own; pass an explicit name when the result must
            not depend on which parser libraries happen to be installed.

    Returns:
        The text content of ``tmp`` with text nodes joined by single spaces,
        leading/trailing whitespace removed, blank-line runs collapsed to one
        newline, tab-whitespace-tab runs replaced by a space, and
        non-breaking spaces replaced by regular spaces.
    """
    soup = BeautifulSoup(tmp, features=parser)
    text = soup.get_text(separator=" ").strip()
    # Collapse "newline, optional whitespace, newline" into a single newline.
    text = re.sub(r'\n\s*\n', '\n', text)
    # Two tabs (optionally separated by whitespace) become one space.
    text = re.sub(r'\t\s*\t', ' ', text)
    # Non-breaking spaces become ordinary spaces.
    text = re.sub(r'\xa0', ' ', text)
    return text

In [6]:
# For every project, collect the `rowHeaders` of each question definition that
# also carries `columnHeaders` (presumably grid-style questions -- TODO
# confirm), then keep the sorted unique headers that are longer than one
# character and contain neither 'Other' nor 'None'.
entity_labels = []
# Column 6 is `pqs`; iterating the column directly avoids two positional
# scalar lookups per row.
for pq in tqdm(project_entity_data.iloc[:, 6]):
    label = []
    # pd.isna covers NaN as well as None, neither of which json.loads accepts.
    if not pd.isna(pq):
        for x in json.loads(pq):
            tmp = x.get('definition')
            if tmp is None:
                continue
            tmp = json.loads(tmp)
            if isinstance(tmp, dict) and "columnHeaders" in tmp:
                # The gate checks `columnHeaders` but the data read is
                # `rowHeaders`; tolerate a missing or null `rowHeaders`
                # instead of raising.
                label += tmp.get('rowHeaders') or []
    label = np.unique(
        [x for x in label if ('Other' not in x) and ('None' not in x) and (len(x) > 1)]
    ).tolist()
    entity_labels.append(label)

project_entity_data['entity'] = entity_labels
project_entity_data['n_ent'] = [len(x) for x in entity_labels]

  0%|          | 0/715500 [00:00<?, ?it/s]

In [None]:
# Keep only projects that have at least one entity, and only the columns at
# positions 0, 1, 3, 4 and 7 (pid, title, criteria, description, entity).
project_labels = (
    project_entity_data[project_entity_data.n_ent != 0]
    .iloc[:, [0, 1, 3, 4, 7]]
    .reset_index(drop=True)
)

# Boilerplate notices that are removed from the raw text before cleaning.
# NOTE(review): in the last pattern `[\n|<.*>]` is a character class (it
# matches one of newline, '|', '<', '.', '*', '>'), not an alternation --
# confirm that this is what was intended.
remove_string = "PLEASE NOTE THAT THE CLIENT INTENDS TO RECORD THIS PHONE CALL. BY ACCEPTING THIS PROJECT, YOU AGREE TO BE RECORDED."
remove_pattern = [
    re.compile(r"PLEASE NOTE:(.*)decline the project.", re.IGNORECASE),
    re.compile(r"PLEASE NOTE:(.*)Physician Payment Sunshine Act.", re.IGNORECASE),
    re.compile(r"PLEASE NOTE:(.*)selected for a consultation.", re.IGNORECASE),
    re.compile(r"glg ((network)*|(council)*) members are not permitted(.*)third parties.", re.IGNORECASE),
    re.compile(remove_string, re.IGNORECASE),
    re.compile(r"Typically, GLG clients are identified by name so that you can determine(.*)[\n|<.*>]", re.IGNORECASE),
]

cleaned_description = []
cleaned_criteria = []
row_texts = zip(project_labels['description'], project_labels['criteria'])
for raw_description, raw_criteria in tqdm(row_texts, total=len(project_labels)):
    stripped = []
    for raw in (raw_description, raw_criteria):
        # NaN is the only value that differs from itself: missing text -> ''.
        text = raw if raw == raw else ''
        for pattern in remove_pattern:
            text = pattern.sub("", text)
        stripped.append(clean_text(text))

    cleaned_description.append(stripped[0])
    cleaned_criteria.append(stripped[1])

project_labels['cleaned_criteria'] = cleaned_criteria
project_labels['cleaned_description'] = cleaned_description

In [8]:
from spaczz.matcher import FuzzyMatcher

# A fuzzy match is kept only when its ratio / 100 is strictly above this value.
MIN_MATCH_SCORE = 0.9


def _tokens_and_matches(matcher, raw_text):
    """Tokenise one text field and fuzzy-match the project's entities in it.

    Args:
        matcher: A FuzzyMatcher already loaded with the project's entity
            phrases.
        raw_text: The field value; NaN (missing) is treated as ''.

    Returns:
        ``[tokens, matches]`` where ``tokens`` is the list of token strings of
        the cleaned text and ``matches`` is a list of
        ``[match[0], match[1], match[2], match[3] / 100]`` entries whose last
        element exceeds ``MIN_MATCH_SCORE``.
    """
    # NaN != NaN, so this maps a missing value to the empty string.
    doc = nlp(clean_text(raw_text if raw_text == raw_text else ''))
    tokens = [str(tok) for tok in doc]
    scored = [[m[0], m[1], m[2], m[3] / 100] for m in matcher(doc)]
    return [tokens, [m for m in scored if m[-1] > MIN_MATCH_SCORE]]


# pid -> [tokens, matches], one dict per text field.
# NOTE(review): matching runs on clean_text() of the raw `criteria` /
# `description` columns, not on `cleaned_criteria` / `cleaned_description`
# (the boilerplate-stripped versions) -- confirm this is intended.
data_criteria = {}
data_description = {}

for i in tqdm(range(len(project_labels))):
    tmp = project_labels.iloc[i]

    # The phrases differ per project, so each row gets its own matcher.
    matcher = FuzzyMatcher(nlp.vocab)
    for phrase in tmp.entity:
        matcher.add(phrase, [nlp(phrase)])

    data_criteria[tmp.pid] = _tokens_and_matches(matcher, tmp.criteria)
    data_description[tmp.pid] = _tokens_and_matches(matcher, tmp.description)

  0%|          | 0/104284 [00:00<?, ?it/s]

In [9]:
# A match id (element 0 of a match entry) longer than this counts as "long".
MAX_MATCH_ID_CHARS = 50
# A project is rejected when more than this share of its matches are "long".
MAX_LONG_MATCH_SHARE = 0.3

# Set lookup instead of scanning the `test_project` list once per project.
test_project_lookup = set(test_project)

# Collect the pids of non-test projects that either have no match at all
# (description and criteria combined) or whose matches are dominated by long
# match ids. Projects listed in `test_project` are never rejected.
bad_pid = []
for k, v in tqdm(data_description.items()):
    if k in test_project_lookup:
        continue

    label = v[-1] + data_criteria[k][-1]
    if len(label) == 0:
        bad_pid.append(k)
    elif np.mean([len(x[0]) > MAX_MATCH_ID_CHARS for x in label]) > MAX_LONG_MATCH_SHARE:
        bad_pid.append(k)

  0%|          | 0/104284 [00:00<?, ?it/s]

In [11]:
# Drop every rejected pid from both result dicts. The rejected pids are put
# in a set so each lookup is constant-time, and only the entries that are
# kept get deep-copied (the outputs stay independent of the source dicts,
# and key order is preserved).
rejected_pids = set(bad_pid)

output_description = {
    k: deepcopy(v)
    for k, v in tqdm(data_description.items())
    if k not in rejected_pids
}

output_criteria = {
    k: deepcopy(v)
    for k, v in tqdm(data_criteria.items())
    if k not in rejected_pids
}

  0%|          | 0/104284 [00:00<?, ?it/s]

  0%|          | 0/104284 [00:00<?, ?it/s]

In [12]:
# Persist the filtered criteria and description dicts, in that order.
for out_path, payload in (
    ('./input_data/data_criteria_v5.pk5', output_criteria),
    ('./input_data/data_description_v5.pk5', output_description),
):
    with open(out_path, 'wb') as f:
        pickle5.dump(payload, f)