In [None]:
import os
import pickle
import shutil
import gc
import bisect
import copy
import time
import random
from tqdm import tqdm
from collections import defaultdict, Counter
from aser.database import SqliteDBConnection, MongoDBConnection
from aser.database.kg_connection import CHUNKSIZE
from aser.database.kg_connection import EVENTUALITY_TABLE_NAME, EVENTUALITY_COLUMNS, EVENTUALITY_COLUMN_TYPES
from aser.database.kg_connection import RELATION_TABLE_NAME, RELATION_COLUMNS, RELATION_COLUMN_TYPES
#from aser.concept.concept_extractor import ASERConceptExtractor
#from aser.concept.concept_connection import ASERConceptConnection
#from aser.concept.concept_connection import CONCEPT_TABLE_NAME, CONCEPT_COLUMNS, CONCEPT_COLUMN_TYPES
#from aser.concept.concept_connection import CONCEPTINSTANCEPAIR_TABLE_NAME, CONCEPTINSTANCEPAIR_COLUMNS, CONCEPTINSTANCEPAIR_COLUMN_TYPES
from aser.extract.aser_extractor import DiscourseASERExtractor
from aser.eventuality import Eventuality
from aser.relation import Relation, relation_senses

The code below is from CoCoLM [http://arxiv.org/abs/2012.15643].

In [2]:
def convert_row_to_eventuality(row):
    """Rebuild an eventuality object from one database row.

    The payload stored under ``row["info"]`` is decoded first; the id,
    frequency and pattern columns of the row are then copied onto the
    decoded object.

    :param row: mapping providing the keys ``"info"``, ``"_id"``, ``"frequency"`` and ``"pattern"``
    :return: the object returned by ``Eventuality().decode`` with ``eid``, ``frequency`` and ``pattern`` set
    """
    decoded = Eventuality().decode(row["info"])
    # (attribute on the eventuality, column of the row), copied in this order
    for attribute, column in (("eid", "_id"), ("frequency", "frequency"), ("pattern", "pattern")):
        setattr(decoded, attribute, row[column])
    return decoded

def convert_row_to_relation(row):
    """Build a ``Relation`` from one database row.

    Only columns whose value is a ``float`` strictly greater than zero are
    passed on as relation weights; every other column is ignored.

    :param row: mapping providing ``"hid"`` and ``"tid"`` plus the weight columns
    :return: ``Relation(hid, tid, weights)``
    """
    head_id, tail_id = row["hid"], row["tid"]
    weights = {}
    for column, value in row.items():
        if isinstance(value, float) and value > 0.0:
            weights[column] = value
    return Relation(head_id, tail_id, weights)

def build_concept_instance_table_from_aser_kg(aser_conceptualizer, erows):
    """Conceptualize every eventuality row and collect the concept tables.

    :param aser_conceptualizer: object whose ``conceptualize(event)`` yields ``(concept, score)`` pairs
    :param erows: iterable of eventuality rows accepted by ``convert_row_to_eventuality``
    :return: a 3-tuple ``(cid2concept, concept_instance_pairs, cid_to_filter_score)`` where

        * ``cid2concept`` maps a concept id to a shallow copy of the first concept seen with that id,
        * ``concept_instance_pairs`` lists ``(stored concept, event, score)`` for every pair produced,
        * ``cid_to_filter_score`` sums ``score * event.frequency`` over the instances newly
          added to each concept.
    """
    cid2concept = {}
    concept_instance_pairs = []
    cid_to_filter_score = {}

    for erow in tqdm(erows):
        event = convert_row_to_eventuality(erow)
        for candidate, score in aser_conceptualizer.conceptualize(event):
            cid = candidate.cid
            if cid not in cid2concept:
                cid2concept[cid] = copy.copy(candidate)
            stored = cid2concept[cid]

            instance = (event.eid, event.pattern, score)
            # An instance only contributes to the filter score the first time it is recorded
            if instance not in stored.instances:
                stored.instances.append(instance)
                cid_to_filter_score[cid] = cid_to_filter_score.get(cid, 0.0) + score * event.frequency

            concept_instance_pairs.append((stored, event, score))

    return cid2concept, concept_instance_pairs, cid_to_filter_score

def build_concept_relation_table_from_aser_kg(aser_concept_conn, rrows):
    """Lift eventuality-level relations to concept-level relations.

    For every concept, each of its instance eventualities is followed along
    the relations that start at it; every concept of the tail eventuality
    (other than the head concept itself) receives the relation weights scaled
    by both instance scores.

    :param aser_concept_conn: connection exposing ``cids``,
        ``get_eventualities_given_concept`` and ``get_concepts_given_eventuality``
    :param rrows: iterable of relation rows accepted by ``convert_row_to_relation``
    :return: dict mapping a relation id to the accumulated concept-level ``Relation``
    """
    # head eventuality id -> [(tail eventuality id, relation), ...]
    tails_by_head = defaultdict(list)
    for rrow in rrows:
        relation = convert_row_to_relation(rrow)
        tails_by_head[rrow["hid"]].append((rrow["tid"], relation))

    rid2relation = {}
    for h_cid in tqdm(aser_concept_conn.cids):
        for h_eid, _pattern, instance_score in aser_concept_conn.get_eventualities_given_concept(h_cid):
            # eid -> related eids/relations -> concepts of the related eventualities
            for t_eid, relation in tails_by_head[h_eid]:
                for t_concept, score in aser_concept_conn.get_concepts_given_eventuality(t_eid):
                    t_cid = t_concept.cid
                    if h_cid != t_cid:
                        rid = Relation.generate_rid(h_cid, t_cid)
                        if rid not in rid2relation:
                            rid2relation[rid] = Relation(h_cid, t_cid)
                        scaled = {}
                        for sense, weight in relation.relations.items():
                            scaled[sense] = weight * instance_score * score
                        rid2relation[rid].update(scaled)
    return rid2relation

Random walk from CoCoLM, adapted for use with the database version of ASER (used for experimentation, not included in the final method).

As mentioned in the paper, random walk produced uninterpretable results and took a long time to generate data.


In [4]:
def random_walk(start_eid, kg_conn, max_len=3, min_len=0):
    """
    Perform a random walk starting from a given eventuality ID (eid).

    At every step the relations touching the current eventuality are tried in
    random order and the first acceptable one is followed. A relation is
    acceptable when it has not been used yet, when it leads to an eventuality
    that is not on the path yet, and when it does not repeat the previous
    relation type for the types Precedence, Succession, Reason, Result and
    Condition. The walk stops as soon as no acceptable relation is left.

    :param start_eid: the starting eventuality ID
    :type start_eid: str
    :param kg_conn: the SQLite database connection
    :type kg_conn: SqliteDBConnection
    :param max_len: upper bound of the walk; at most ``max_len - 1`` hops are performed
    :type max_len: int
    :param min_len: minimum number of entries (eventualities plus relations) the
        returned list must contain; shorter walks yield ``None``
    :type min_len: int
    :return: the random walk path as an alternating list of
        ``{'eid_<i>': eid}`` and ``{'rid_<i>': rid}`` entries, or ``None``
    :rtype: Optional[List[Dict[str, str]]]
    """
    print(f'Performing random walk from eventuality: {start_eid}')

    # Relation types that may not directly follow a relation of the same type
    no_repeat_types = ('Precedence', 'Succession', 'Reason', 'Result', 'Condition')

    curr_eid = start_eid
    path = [{'eid_0': curr_eid}]
    eid_visited = {curr_eid}
    rid_visited = set()
    prev_relation_type = None

    for i in range(1, max_len):
        start_time = time.time()
        relations = kg_conn.get_relations_by_eid(curr_eid)
        end_time = time.time()
        print(f'.....Took {end_time - start_time} seconds to retrieve {len(relations)} relations')

        if not relations:
            # Report the eventuality the walk is currently at, not the start node
            print(f'No relations found for eid {curr_eid}')
            break

        # Try the relations in random order. Shuffling a copy gives the same
        # distribution as repeated choice+remove without the quadratic cost
        # and without mutating the list handed out by the connection.
        candidates = list(relations)
        random.shuffle(candidates)

        chosen = None
        for relation in candidates:
            rid = relation['_id']
            if rid in rid_visited:
                continue
            next_eid = relation['tid'] if relation['hid'] == curr_eid else relation['hid']
            # A relation leading back onto the path is skipped instead of
            # ending the whole walk, so other relations still get a chance.
            if next_eid in eid_visited:
                continue
            relation_type = max(relation_senses, key=lambda sense: relation.get(sense, 0))
            if relation_type in no_repeat_types and relation_type == prev_relation_type:
                continue
            chosen = (rid, next_eid, relation_type)
            break

        if chosen is None:
            break

        rid, next_eid, relation_type = chosen
        path.append({f'rid_{i}': rid})
        path.append({f'eid_{i}': next_eid})
        rid_visited.add(rid)
        eid_visited.add(next_eid)
        curr_eid = next_eid
        prev_relation_type = relation_type

    if len(path) < min_len:
        return None

    return path

Code used to generate random seed start eventualities used with the random walk algorithm (Used for experimentation, not included in final method)

In [10]:
# Sample random start eventualities for the random-walk experiment
kg_path = "/media/corey/Second Drive/ASER KB/KG.db"
kg_conn = SqliteDBConnection(kg_path, CHUNKSIZE)  # stays open: the random-walk cell reuses and closes it

num_samples = 10  # number of random rows requested from the eventuality table
min_freq = 100  # minimum frequency handed to the sampler

# Time the sampling and the row decoding together
print(f'Generating {num_samples} samples')
start_time = time.time()
start_eventualities = [
    convert_row_to_eventuality(erow)
    for erow in kg_conn.get_random_rows(EVENTUALITY_TABLE_NAME, EVENTUALITY_COLUMNS, num_samples, min_freq=min_freq)
]
end_time = time.time()

print(f'Took {end_time-start_time} to generate {len(start_eventualities)} random start eventualities')


Generating 10 samples
Took 70.87368655204773 to generate 10 random start eventualities


Code to run the random walk function. See the outputs for an example of the large amount of time required to produce each path (~30 seconds) compared to the final method (~5 seconds).

In [11]:
# Perform random walk from each start node

max_len = 8  # upper bound handed to random_walk
min_len = 3  # random_walk returns None for paths with fewer entries than this
paths = []

try:
    for start_eventuality in start_eventualities:
        start_time = time.time()
        path = random_walk(start_eventuality.eid, kg_conn, max_len, min_len)
        end_time = time.time()
        if path:
            paths.append(path)
            print(f'    Random walk of length {len(path)} took {end_time - start_time} seconds.')
finally:
    # Release the database handle even when a walk raises
    kg_conn.close()

# Measure success against the walks that were actually attempted; the sampler
# may have produced a different number of start eventualities than requested.
attempted = len(start_eventualities)
success_rate = round((len(paths) / attempted) * 100) if attempted else 0
print(f"Successfully generated all paths with success rate {success_rate}%")

Performing random walk from eventuality: e37ca9370d9d765ee435b75bbf83986bc2fafa11
.....Took 5.22434139251709 seconds to retrieve 44 relations
.....Took 5.2234978675842285 seconds to retrieve 9789 relations
.....Took 5.241192579269409 seconds to retrieve 12 relations
.....Took 5.512298107147217 seconds to retrieve 3 relations
.....Took 5.468954563140869 seconds to retrieve 4 relations
    Random walk of length 9 took 26.67138409614563 seconds.
Performing random walk from eventuality: df56bdee90c689d9cc65b3031e4c1b864fbf27be
.....Took 5.18345308303833 seconds to retrieve 29 relations
.....Took 5.485089063644409 seconds to retrieve 2 relations
.....Took 5.2254838943481445 seconds to retrieve 9484 relations
.....Took 5.300302028656006 seconds to retrieve 17 relations
.....Took 5.617592096328735 seconds to retrieve 82 relations
.....Took 5.575869798660278 seconds to retrieve 20 relations
.....Took 5.4191648960113525 seconds to retrieve 145 relations
    Random walk of length 15 took 37.8076

Path object used for storing long paths (used for experimentation, not included in the final method).

In [3]:
class Path:
    """Resolved view of a raw path (list of dicts holding eventuality / relation ids).

    Entries at even positions of ``path_data`` are read as eventuality ids and
    entries at odd positions as relation ids; in both cases the first value of
    the dict is used. The referenced rows are fetched through ``kg_conn`` when
    the object is created.
    """

    def __init__(self, path_data, kg_conn):
        self.path_data = path_data
        self.kg_conn = kg_conn
        self.eventualities = []
        self.relations = []
        self._populate_path()

    def _populate_path(self):
        """Fetch and convert every row referenced by ``path_data``; ids without a row are skipped."""
        for position, step in enumerate(self.path_data):
            identifier = list(step.values())[0]
            if position % 2:  # odd position -> relation
                rrow = self.kg_conn.select_row(RELATION_TABLE_NAME, identifier, RELATION_COLUMNS)
                if rrow:
                    self.relations.append(convert_row_to_relation(rrow))
            else:  # even position -> eventuality
                erow = self.kg_conn.select_row(EVENTUALITY_TABLE_NAME, identifier, EVENTUALITY_COLUMNS)
                if erow:
                    self.eventualities.append(convert_row_to_eventuality(erow))

    def __str__(self):
        """Render the path as ``<eventuality> -[<type>]-> <eventuality> ...``."""
        pieces = []
        for index, eventuality in enumerate(self.eventualities):
            pieces.append(repr(eventuality))
            if index < len(self.relations):
                arrow_label = self.get_relation_type(self.relations[index])
                pieces.append(" -[{}]-> ".format(arrow_label))
        return "".join(pieces)

    def get_relation_type(self, relation):
        """Return the sense from ``relation_senses`` with the largest weight in ``relation.relations``."""
        def weight_of(sense):
            return relation.relations.get(sense, 0)

        return max(relation_senses, key=weight_of)


First attempt at an algorithm that selects eventualities by pattern and by their connected relations (used for experimentation, not included in the final method).

In [6]:
def select_eventualities_by_pattern(kg_conn, patterns, min_freq=0, min_result_relations=4, max_eventualities=10):
    """Build one-hop "Result" paths around eventualities selected by pattern.

    Eventualities matching any of ``patterns`` are collected in order. The
    first ``max_eventualities`` of them that have at least
    ``min_result_relations`` relations whose dominant sense is ``"Result"``
    are kept, and one path is emitted per such relation.

    :param kg_conn: connection exposing ``get_eventuality_by_pattern_and_freq``
        and ``get_relations_by_eid``
    :param patterns: iterable of eventuality patterns to look up
    :param min_freq: minimum frequency forwarded to the pattern lookup
    :param min_result_relations: minimum number of "Result" relations an
        eventuality needs in order to be kept
    :param max_eventualities: maximum number of eventualities kept (default 10,
        the value that used to be hard-coded)
    :return: list of paths ``[{"eid_0": head}, {"rid_1": rid}, {"eid_1": tail}]``
    """
    def dominant_sense(relation_row):
        # Sense with the largest weight in the row; missing senses count as 0
        return max(relation_senses, key=lambda sense: relation_row.get(sense, 0))

    selected_eventualities = []
    for pattern in patterns:
        rows = kg_conn.get_eventuality_by_pattern_and_freq(pattern, min_freq, EVENTUALITY_COLUMNS)
        matched = [convert_row_to_eventuality(r) for r in rows]
        selected_eventualities.extend(matched)
        # Report the count for this pattern only, as the message says
        print(f'{len(matched)} found with pattern {pattern} and min. freq. {min_freq}')

    paths = []
    kept = 0
    for eventuality in selected_eventualities:
        if kept >= max_eventualities:
            break
        eid = eventuality.eid
        # One relation query per eventuality: the filtered list is reused to
        # build the paths instead of fetching the relations a second time.
        result_relations = [r for r in kg_conn.get_relations_by_eid(eid) if dominant_sense(r) == "Result"]
        if len(result_relations) < min_result_relations:
            continue
        kept += 1

        for relation in result_relations:
            rid = relation["_id"]
            hid = relation["hid"]
            tid = relation["tid"]
            if hid == eid:
                paths.append([{"eid_0": eid}, {"rid_1": rid}, {"eid_1": tid}])
            else:
                paths.append([{"eid_0": hid}, {"rid_1": rid}, {"eid_1": eid}])

    return paths

In [34]:
# Driver for select_eventualities_by_pattern: opens the KG database, runs the
# selection for a single pattern and closes the connection again.
kg_path = "/media/corey/Second Drive/ASER KB/KG.db"
kg_conn = SqliteDBConnection(kg_path, CHUNKSIZE)

# Candidate eventuality patterns; the trailing comment is the list index,
# which is what the slice in the call below refers to.
patterns = [
    "s-v", #0
    "s-v-o", #1
    "s-v-a", #2
    "s-v-o-o", #3
    "s-be-a", #4
    "s-v-be-a", #5
    "s-v-be-o", #6
    "s-v-v-o", #7
    "s-v-v", #8
    "s-be-a-p-o", #9
    "s-v-p-o", #10
    "s-v-o-p-o", #11
    "spass-v", #12
    "spass-v-p-o" #13
]

min_freq = 100
min_result_relations = 4

# patterns[3:4] is the one-element list ["s-v-o-o"], so only that pattern is queried
paths = select_eventualities_by_pattern(kg_conn, patterns[3:4], min_freq, min_result_relations)

kg_conn.close()


1545 found with pattern s-v-o-o and min. freq. 100


Adaptation of previous algorithm (Used for experimentation, not included in final method)

In [12]:
# def create_path with eventuality pattern
def create_path(kg_conn, eventuality_pattern, relation_pattern, start_eventuality, min_freq=0):
    """Grow a path that follows both an eventuality-pattern and a relation-type sequence.

    Starting at ``start_eventuality``, hop ``i`` picks (at random among the
    highest-weighted ones) a relation touching the current eventuality whose
    dominant sense is ``relation_pattern[i - 1]`` and moves to its other
    endpoint. The path is abandoned when no such relation exists, when the
    endpoint cannot be loaded, or when the endpoint's pattern differs from
    ``eventuality_pattern[i]``.

    :param kg_conn: connection exposing ``get_relations_by_eid`` and ``select_row``
    :param eventuality_pattern: expected pattern of every eventuality on the
        path; its length fixes the number of hops (``len - 1``). The first
        entry is not checked against ``start_eventuality``.
    :param relation_pattern: expected relation type of every hop
    :param start_eventuality: eventuality (object with an ``eid``) the path starts from
    :param min_freq: accepted for call compatibility; not used by this function
    :return: ``[{"eid_0": ...}, {"rid_1": ...}, {"eid_1": ...}, ...]`` or
        ``None`` when the path cannot be completed
    """
    path_data = [{"eid_0": start_eventuality.eid}]
    current_eventuality = start_eventuality

    for i in range(1, len(eventuality_pattern)):
        target_pattern = eventuality_pattern[i]
        target_relation = relation_pattern[i - 1]

        relations = kg_conn.get_relations_by_eid(current_eventuality.eid)
        filtered_relations = [r for r in relations if max(relation_senses, key=lambda x: r.get(x, 0)) == target_relation]

        if not filtered_relations:
            return None

        # selecting relation by highest frequency under relation type
        # may change to check tid/ hid to match the next eventuality pattern
        max_freq = max(r[target_relation] for r in filtered_relations)
        top_relations = [r for r in filtered_relations if r[target_relation] == max_freq]
        chosen_relation = random.choice(top_relations)

        # The next eventuality is the endpoint that is not the current one
        next_eid = chosen_relation["tid"] if current_eventuality.eid == chosen_relation["hid"] else chosen_relation["hid"]
        next_row = kg_conn.select_row(EVENTUALITY_TABLE_NAME, next_eid, EVENTUALITY_COLUMNS)
        if not next_row:
            # The endpoint has no row to decode: give up on this path instead of crashing
            return None
        next_eventuality = convert_row_to_eventuality(next_row)

        if next_eventuality.pattern != target_pattern:
            return None

        path_data.append({f"rid_{i}": chosen_relation["_id"]})
        path_data.append({f"eid_{i}": next_eid})
        current_eventuality = next_eventuality

    return path_data

# test
kg_path = "/media/corey/Second Drive/ASER KB/KG.db"
kg_conn = SqliteDBConnection(kg_path, CHUNKSIZE)

eventuality_pattern = ["s-v", "s-v-o", "s-v"]
relation_pattern = ["Condition", "Reason"]
min_freq = 5
desired_paths = 5
# Upper bound on the number of random starts tried. Without it the loop below
# never terminates when no start eventuality yields a valid path.
max_attempts = 50 * desired_paths

paths = []
num_paths = 0
attempts = 0
try:
    start_pattern = eventuality_pattern[0]
    start_eventualities = kg_conn.get_eventuality_by_pattern_and_freq(start_pattern, min_freq, EVENTUALITY_COLUMNS)

    # Keep drawing random start eventualities until enough paths were found,
    # the attempt budget is used up, or there is nothing to draw from.
    while start_eventualities and num_paths < desired_paths and attempts < max_attempts:
        attempts += 1
        start_eventuality = convert_row_to_eventuality(random.choice(start_eventualities))
        path_data = create_path(kg_conn, eventuality_pattern, relation_pattern, start_eventuality, min_freq)
        if path_data:
            paths.append(path_data)
            print(Path(path_data, kg_conn))
            num_paths += 1
        else:
            print("No valid path found.")

    if num_paths < desired_paths:
        print(f"Stopped after {attempts} attempts with {num_paths} of {desired_paths} paths found.")
finally:
    # Release the database handle even when path creation raises
    kg_conn.close()

No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path found.
No valid path

Multi-hop path generator; this also took too much time (used for experimentation, not included in the final method).

In [7]:
# create_path without eventuality pattern
def create_path(kg_conn, relation_pattern, start_eventuality, min_freq=0, max_hops=3):
    """Grow a multi-hop path that follows a sequence of relation types.

    Hop ``k`` (starting at 0) looks at the relations whose head is the current
    eventuality, keeps those whose dominant sense is ``relation_pattern[k]``,
    picks one of the highest-weighted ones at random and moves to its tail.

    :param kg_conn: connection exposing ``get_relations_by_hid`` and ``select_row``
    :param relation_pattern: relation type required at each hop
    :param start_eventuality: eventuality (object with an ``eid``) the path starts from
    :param min_freq: accepted for call compatibility; not used by this function
    :param max_hops: maximum number of hops; the walk never takes more hops
        than ``relation_pattern`` has entries
    :return: ``[{"eid_0": ...}, {"rid_1": ...}, {"eid_1": ...}, ...]`` or
        ``None`` when a hop cannot be made
    """
    path_data = [{"eid_0": start_eventuality.eid}]
    current_eventuality = start_eventuality

    # Never index past the end of relation_pattern when max_hops is larger
    hops = max(0, min(max_hops, len(relation_pattern)))

    for hop_count in range(hops):
        target_relation = relation_pattern[hop_count]
        relations = kg_conn.get_relations_by_hid(current_eventuality.eid, type=target_relation)
        filtered_relations = [r for r in relations if max(relation_senses, key=lambda x: r.get(x, 0)) == target_relation]

        if not filtered_relations:
            return None

        # selecting relation by highest frequency under relation type
        max_freq = max(r[target_relation] for r in filtered_relations)
        top_relations = [r for r in filtered_relations if r[target_relation] == max_freq]
        chosen_relation = random.choice(top_relations)

        next_eid = chosen_relation["tid"]
        next_row = kg_conn.select_row(EVENTUALITY_TABLE_NAME, next_eid, EVENTUALITY_COLUMNS)
        if not next_row:
            # The tail has no row to decode: give up on this path instead of crashing
            return None

        path_data.append({f"rid_{hop_count+1}": chosen_relation["_id"]})
        path_data.append({f"eid_{hop_count+1}": next_eid})
        current_eventuality = convert_row_to_eventuality(next_row)

    return path_data


# test
# Driver for the multi-hop create_path: samples random start eventualities and
# keeps the first `num_paths` paths that follow `relation_pattern`.
kg_path = "/media/corey/Second Drive/ASER KB/KG.db"
kg_conn = SqliteDBConnection(kg_path, CHUNKSIZE)
# Only `relation_pattern` is passed to create_path in this cell; the numbered
# variants below are alternative patterns that this cell does not use.
relation_pattern = ["Condition", "Reason", "Result"]
relation_pattern_1 = ["Precedence", "Succession"]
relation_pattern_2 = ["Reason", "Result"]
relation_pattern_3 = ["Contrast", "Concession"]
relation_pattern_4 = ["Condition", "Co_Occurrence"]
relation_pattern_5 = ["ChosenAlternative", "Reason"]
relation_pattern_6 = ["Precedence", "Reason", "Result"]
relation_pattern_7 = ["Contrast", "Precedence", "Succession"]
relation_pattern_8 = ["Precedence", "Reason", "Condition", "Result"]
relation_pattern_9 = ["Contrast", "Precedence", "Reason", "Succession"]
min_freq = 100
max_hops = 3
num_paths = 5
start_eventualities = kg_conn.get_random_rows(EVENTUALITY_TABLE_NAME, EVENTUALITY_COLUMNS, num_samples=100, min_freq=min_freq)

paths = []
for e in start_eventualities:
    # Stop as soon as enough paths were collected
    if len(paths) == num_paths:
        break
    path_data = create_path(kg_conn, relation_pattern, convert_row_to_eventuality(e), min_freq, max_hops)
    if path_data:
        paths.append(path_data)
        print(f'Added path: {Path(path_data, kg_conn)}')
else:
    # for/else: this runs only when the loop ended without hitting `break`,
    # i.e. every sampled start eventuality was tried.
    print(f'{len(paths)} found from {len(start_eventualities)} random start eventualities')

kg_conn.close()

Added path: you run away -[Condition]-> you be scare -[Reason]-> i be get close -[Result]-> you get nothing
Added path: i be in madison -[Condition]-> i be in the area -[Reason]-> i keep -[Result]-> i be only gon na take it
Added path: it be new to i -[Condition]-> i have not see it -[Reason]-> i have be busy -[Result]-> i hear
Added path: you look stunning -[Condition]-> it be not obvious -[Reason]-> it be not true -[Result]-> you gay sperm donor be not back in jail
Added path: how do you know -[Condition]-> i do not tell you -[Reason]-> he make -[Result]-> we will not bother he anymore


In [23]:
# save paths obj as pickle
# NOTE(review): `os` is imported here but not used in this cell.
import os
import pickle

# Create the file path for the pickle file
# (relative path, so the file is written to the current working directory)
pickle_file = "paths.pickle"

# `paths` is whatever list the previously executed path-generation cell left behind
with open(pickle_file, "wb") as file:
    pickle.dump(paths, file)

print("Paths list saved as", pickle_file)

Paths list saved as paths.pickle


In [None]:
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

relations_list = ['Reason', 'Succession', 'Precedence', 'Synchronous', 'Contrast', 'Result', 'Condition']
eventuality_patterns = ['s-v-o', 's-v-o-o', 's-v-a', 's-v-v-o', 'spass-v-o']
kg_path = "/media/corey/Second Drive/ASER KB/KG.db"
kg_conn = SqliteDBConnection(kg_path, CHUNKSIZE)
min_freq = 100   # minimum frequency of a start eventuality
num_paths = 200  # paths wanted per (pattern, relation type)
num_options = 4  # a start eventuality needs at least this many tails; this many are recorded

# Paths of ALL patterns. Resetting this list inside the pattern loop would
# leave only the last pattern's paths for the code that runs after the loop.
paths = []

try:
    # one-hop inference
    for pattern in eventuality_patterns:
        # Paths of the current pattern only; these are appended to paths.txt below
        pattern_paths = []
        logging.info(f"Processing pattern: {pattern}")
        start_pattern = pattern.split(', ')[0]
        start_eventualities = kg_conn.get_eventuality_by_pattern_and_freq(start_pattern, min_freq, EVENTUALITY_COLUMNS)

        # sort start eventualities from highest to lowest frequency
        start_eventualities = sorted(start_eventualities, key=lambda x: x['frequency'], reverse=True)

        logging.info(f"Retrieved {len(start_eventualities)} start eventualities for pattern: {pattern}")
        # number of paths generated so far per relation type (for this pattern)
        counter = dict.fromkeys(relations_list, 0)

        for e in start_eventualities:
            # Done with this pattern once every relation type has enough paths
            if all(counter[r] >= num_paths for r in relations_list):
                break

            start_eventuality = convert_row_to_eventuality(e)
            all_relations = kg_conn.get_relations_by_hid(start_eventuality.eid)
            logging.info(f"Processing start eventuality: {start_eventuality.eid}")
            for relation in [r for r in relations_list if counter[r] < num_paths]:
                relations = [r for r in all_relations if r[relation] > 0]

                if len(relations) >= num_options:
                    path_data = [{"eid_0": start_eventuality.eid, "probability": 1.0}]
                    # Normalise the weights of this relation type over all tails
                    total_weight = sum(r[relation] for r in relations)
                    relation_probabilities = [
                        {"relation": r, "probability": r[relation] / total_weight} for r in relations
                    ]

                    # Ascending order: the most probable relation is last
                    relation_probabilities = sorted(relation_probabilities, key=lambda d: d['probability'])
                    chosen_relation = relation_probabilities[-1]['relation']

                    path_data.append({
                        "rid_1": chosen_relation["_id"],
                        'relation_type': relation,
                        "relation_probabilities": [
                            {'rid': r['relation']['_id'], 'p': r['probability']} for r in relation_probabilities[-num_options:]
                        ]
                    })

                    # The tail eventuality is referenced by id only; its row is
                    # not needed here, so no lookup is issued for it.
                    next_eid = chosen_relation["tid"]
                    path_data.append({"eid_1": next_eid, "probability": relation_probabilities[-1]["probability"]})
                    pattern_paths.append(path_data)
                    counter[relation] += 1
                    logging.info(f"Path generated for relation {relation}: {path_data}")

        logging.info(f"Generated {len(pattern_paths)} paths for pattern: {pattern}")
        with open('paths.txt', 'a') as f:
            for path in pattern_paths:
                f.write(str(path) + '\n')
                logging.info(f"Path written to file: {path}")
        paths.extend(pattern_paths)
finally:
    # Release the database handle even when generation fails part-way
    kg_conn.close()

# process paths to match target structure
kg_path = "/media/corey/Second Drive/ASER KB/KG.db"
kg_conn = SqliteDBConnection(kg_path, CHUNKSIZE)

try:
    # Each path is [start entry, relation entry, end entry]; the entries are
    # enriched in place with the words of the eventualities they reference.
    for path in paths:
        # get start eventuality words
        start_eid = path[0]['eid_0']
        start_eventuality = kg_conn.select_row(EVENTUALITY_TABLE_NAME, start_eid, EVENTUALITY_COLUMNS)
        path[0]['words'] = start_eventuality['words']

        # get relation tid words
        for relation in path[1]['relation_probabilities']:
            tid = kg_conn.select_row(RELATION_TABLE_NAME, relation['rid'], RELATION_COLUMNS)['tid']
            tid_eventuality = kg_conn.select_row(EVENTUALITY_TABLE_NAME, tid, EVENTUALITY_COLUMNS)
            relation['tail_ev'] = {'tid': tid, 'words': tid_eventuality['words']}

        # get end eventuality words
        end_eid = path[2]['eid_1']
        end_eventuality = kg_conn.select_row(EVENTUALITY_TABLE_NAME, end_eid, EVENTUALITY_COLUMNS)
        path[2]['words'] = end_eventuality['words']
finally:
    # The connection opened above was never closed; release it in every case
    kg_conn.close()

# save paths as txt
with open('paths_5356_processed.txt', 'w') as f:
    for path in paths:
        f.write(str(path) + '\n')

