In [1]:
!cp "/content/drive/MyDrive/Ecole/Internship/data/aikg.zip" "/content/"
!unzip aikg.zip

Archive:  aikg.zip
  inflating: aikg.ttl                


In [2]:
# Retrieving relations from ttl file
!grep rdf:predicate aikg.ttl -A 1 -B 1 > aikg_relations.txt

In [3]:
!head -n 10 aikg_relations.txt

    rdf:object aikg:information_overload ;
    rdf:predicate aikg-ont:supportsOtherEntity ;
    rdf:subject aikg:recommender_system ;
--
    rdf:object aikg:km ;
    rdf:predicate aikg-ont:taskSupportedBy ;
    rdf:subject aikg:knowledge_sharing ;
--
    rdf:object aikg:boosting ;
    rdf:predicate aikg-ont:methodUsedBy ;


In [4]:
i = 0
objects = []
subjects = []
predicates = []

with open('aikg_relations.txt', 'r') as fp:
    for line in fp:
        if line.startswith('--'):
            pass
        elif i == 0:
            objects.append(line.split(':')[2][:-3])
            i += 1
        elif i == 1:
            predicates.append(line.split(':')[2][:-3])
            i += 1
        elif i == 2:
            subjects.append(line.split(':')[2][:-3])
            i = 0

In [5]:
len(objects), len(subjects), len(predicates)

(2235820, 2235820, 2235820)

In [6]:
import pandas as pd
import numpy as np

In [7]:
df = pd.DataFrame({'object': objects, 
                   'subject': subjects, 
                   'predicate': predicates})
df.head()

Unnamed: 0,object,subject,predicate
0,information_overload,recommender_system,supportsOtherEntity
1,km,knowledge_sharing,taskSupportedBy
2,boosting,adaboost,methodUsedBy
3,image_segmentation,object_recognition,otherEntityUsedBy
4,knowledge_representation,semantic_network,methodUsedBy


In [8]:
df['predicate'].unique()

array(['supportsOtherEntity', 'taskSupportedBy', 'methodUsedBy',
       'otherEntityUsedBy', 'metricImprovedBy',
       '//www.w3.org/2004/02/skos/core#narrower>', 'usesMethod',
       'usesOtherEntity', 'includesOtherEntity',
       '//www.w3.org/2004/02/skos/core#broader>', 'OtherEntityIncludedBy',
       'OtherEntityUsedBy', 'usesTask', 'taskUsedBy', 'usesMetric',
       'metricUsedBy', 'includesMethod', 'methodIncludedBy',
       'usesMaterial', 'materialUsedBy', 'includesMaterial',
       'materialIncludedBy', 'includesTask', 'taskIncludedBy',
       'evaluatesMethod', 'methodEvaluatedBy', 'supportsTask',
       'supportsMethod', 'methodSupportedBy', 'evaluatesOtherEntity',
       'OtherEntityEvaluatedBy', 'predictsOtherEntity',
       'OtherEntityPredictedBy', 'improvesMetric', 'requiresTask',
       'taskRequiredBy', 'OtherEntitySupportedBy', 'improvesMethod',
       'methodImprovedBy', 'evaluatesTask', 'taskEvaluatedBy',
       'predictsMetric', 'metricPredictedBy', 'evaluatesM

In [9]:
df['predicate'].value_counts()

methodUsedBy                                460724
usesMethod                                  460724
usesOtherEntity                             241004
includesOtherEntity                         144755
otherEntityUsedBy                           136310
OtherEntityIncludedBy                       113678
//www.w3.org/2004/02/skos/core#narrower>    107812
//www.w3.org/2004/02/skos/core#broader>     107812
OtherEntityUsedBy                           104694
materialUsedBy                               41075
usesMaterial                                 41075
includesMethod                               34616
methodIncludedBy                             30332
OtherEntityPredictedBy                       29382
taskUsedBy                                   22341
usesTask                                     22341
methodEvaluatedBy                            18622
evaluatesMethod                              17954
includesMaterial                             14317
materialIncludedBy             

In [10]:
def clean_word(word):
    word = word.replace('_', ' ')
    return word

In [11]:
df['clean_subject'] = df['subject'].apply(lambda x: clean_word(x))
df['clean_object'] = df['object'].apply(lambda x: clean_word(x))

df.head()

Unnamed: 0,object,subject,predicate,clean_subject,clean_object
0,information_overload,recommender_system,supportsOtherEntity,recommender system,information overload
1,km,knowledge_sharing,taskSupportedBy,knowledge sharing,km
2,boosting,adaboost,methodUsedBy,adaboost,boosting
3,image_segmentation,object_recognition,otherEntityUsedBy,object recognition,image segmentation
4,knowledge_representation,semantic_network,methodUsedBy,semantic network,knowledge representation


ConceptNet Numberbatch embeds

In [None]:
!wget https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz
!gunzip numberbatch-en-19.08.txt.gz

--2022-04-25 16:52:25--  https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz
Resolving conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)... 54.231.204.97
Connecting to conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)|54.231.204.97|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 325403502 (310M) [application/x-gzip]
Saving to: ‘numberbatch-en-19.08.txt.gz’

   numberbatch-en-1  19%[==>                 ]  61.46M  35.7MB/s               ^C
gzip: numberbatch-en-19.08.txt already exists; do you wish to overwrite (y or n)? ^C


In [None]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)

def apply_embed_model(model, line, dim=300):
    result = np.zeros((dim,))
    words = line.split(' ')

    for w in words:
        if w in model:
            result += model[w]
        else:
            return None
    
    return result / len(words)

In [None]:
conceptnet_model = load_embeddings("numberbatch-en-19.08.txt")

In [None]:
df['object_cn_embed'] = df['clean_object'].apply(lambda x: apply_embed_model(conceptnet_model, x))
df['subject_cn_embed'] = df['clean_subject'].apply(lambda x: apply_embed_model(conceptnet_model, x))

df.head()