In [None]:
# Copy the two .npy arrays, the pickled DataFrame and the WN18RR directory
# from the Drive folder into /content so later cells can read them locally.
# NOTE(review): no Drive-mount step is visible in this notebook — presumably
# /content/drive is mounted beforehand; confirm before Restart & Run All.
!cp "/content/drive/MyDrive/Ecole/Internship/data/subjects.npy" "/content/"
!cp "/content/drive/MyDrive/Ecole/Internship/data/objects.npy" "/content/"
!cp "/content/drive/MyDrive/Ecole/Internship/data/df_conceptnet.pkl" "/content/"
!cp -r "/content/drive/MyDrive/Ecole/Internship/data/WN18RR" "/content/"

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from collections import defaultdict

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

%load_ext tensorboard

In [None]:
def clean_word(word):
    """Turn a WordNet synset identifier into a plain, space-separated phrase.

    Synset identifiers look like ``<lemma>.<pos>.<sense>`` (for example
    ``land_reform.n.01``). Only the trailing ``.<pos>.<sense>`` part is
    removed, so a lemma that itself contains a dot is kept whole instead of
    being cut at its first dot.

    Args:
        word: Synset identifier, or any underscore-separated token.

    Returns:
        The lemma with underscores replaced by spaces.
    """
    # Split from the right: at most the last two dot-separated fields
    # (part of speech, sense number) are discarded.
    lemma = word.rsplit('.', 2)[0]
    return lemma.replace('_', ' ')

In [None]:
def _read_wn_triples(path):
    """Read one headerless, tab-separated WN18RR split and name its three columns."""
    triples = pd.read_csv(path, sep='\t', header=None)
    triples.columns = ['word1', 'relation', 'word2']
    return triples


# One frame per split; each row is a (word1, relation, word2) triple.
df_wn_train = _read_wn_triples('/content/WN18RR/text/train.txt')
df_wn_valid = _read_wn_triples('/content/WN18RR/text/valid.txt')
df_wn_test = _read_wn_triples('/content/WN18RR/text/test.txt')

df_wn_train.head()

Unnamed: 0,word1,relation,word2
0,land_reform.n.01,_hypernym,reform.n.01
1,cover.v.01,_derivationally_related_form,covering.n.02
2,botany.n.02,_derivationally_related_form,botanize.v.01
3,kamet.n.01,_instance_hypernym,mountain_peak.n.01
4,question.n.01,_derivationally_related_form,ask.v.01


In [None]:
# (rows, columns) of the train / validation / test splits.
df_wn_train.shape, df_wn_valid.shape, df_wn_test.shape

((86835, 3), (3034, 3), (3134, 3))

In [None]:
# Relations excluded from the experiment. Open question: also drop '_verb_group'?
bad_relations = ['_derivationally_related_form']

# Build the keep-masks first, then filter. Only the train and validation
# splits are filtered here; df_wn_test is left as loaded.
keep_train = ~df_wn_train['relation'].isin(bad_relations)
keep_valid = ~df_wn_valid['relation'].isin(bad_relations)
df_wn_train = df_wn_train[keep_train]
df_wn_valid = df_wn_valid[keep_valid]

In [None]:
# Pool the filtered train and validation splits into one frame.
# `axis` is passed by keyword: the positional form is deprecated in pandas 1.x
# and rejected by pandas 2.x.
# The original row labels of both splits are kept (no ignore_index), so index
# labels can repeat across the two halves.
df_full = pd.concat((df_wn_train, df_wn_valid), axis=0)
df_full.shape

  """Entry point for launching an IPython kernel.


(59076, 3)

In [None]:
# Collapse the relation column to a binary label: '_instance_hypernym' is
# folded into '_hypernym', and everything else becomes '_non_hypernym'.
is_hypernym = df_full['relation'].isin(['_hypernym', '_instance_hypernym'])
df_full.loc[is_hypernym, 'relation'] = '_hypernym'
df_full.loc[~is_hypernym, 'relation'] = '_non_hypernym'
df_full['relation'].value_counts()

_hypernym        38998
_non_hypernym    20078
Name: relation, dtype: int64

In [None]:
# Derive human-readable phrases from the synset identifiers on both sides of
# each triple (clean_word is passed directly; no wrapper lambda is needed).
df_full['clean_word1'] = df_full['word1'].apply(clean_word)
df_full['clean_word2'] = df_full['word2'].apply(clean_word)

df_full.head()

Unnamed: 0,word1,relation,word2,clean_word1,clean_word2
0,land_reform.n.01,_hypernym,reform.n.01,land reform,reform
3,kamet.n.01,_hypernym,mountain_peak.n.01,kamet,mountain peak
6,hoist.v.03,_hypernym,raise.v.02,hoist,raise
7,empathy.n.01,_hypernym,sympathy.n.02,empathy,sympathy
8,disengagement.n.02,_hypernym,retreat.n.01,disengagement,retreat


In [None]:
# Encode the string relation labels as integers: fit the encoder on the label
# column, then map the same column through it.
le = LabelEncoder()
le.fit(df_full['relation'].values)
df_full['relation_num'] = le.transform(df_full['relation'].values)

In [None]:
# Preview the frame after adding the integer label column.
df_full.head()

Unnamed: 0,word1,relation,word2,clean_word1,clean_word2,word1_cn_embed,word2_cn_embed,relation_num
0,land_reform.n.01,_hypernym,reform.n.01,land reform,reform,"[-0.023549998179078102, -0.05289999954402447, ...","[-0.08699999749660492, -0.0689999982714653, -0...",0
3,kamet.n.01,_hypernym,mountain_peak.n.01,kamet,mountain peak,"[0.05260000005364418, 0.019700000062584877, 0....","[0.005700001493096352, -0.016000000294297934, ...",0
6,hoist.v.03,_hypernym,raise.v.02,hoist,raise,"[-0.09549999982118607, -0.09019999951124191, 0...","[-0.14509999752044678, -0.1289999932050705, 0....",0
7,empathy.n.01,_hypernym,sympathy.n.02,empathy,sympathy,"[-0.13760000467300415, -0.04010000079870224, -...","[-0.10400000214576721, -0.12430000305175781, -...",0
8,disengagement.n.02,_hypernym,retreat.n.01,disengagement,retreat,"[-0.12960000336170197, -0.1014999970793724, -0...","[-0.039500001817941666, -0.14910000562667847, ...",0


In [None]:
!wget https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz

--2022-06-22 01:05:55--  https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz
Resolving conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)... 52.216.143.140
Connecting to conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)|52.216.143.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 325403502 (310M) [application/x-gzip]
Saving to: ‘numberbatch-en-19.08.txt.gz’


2022-06-22 01:06:03 (38.5 MB/s) - ‘numberbatch-en-19.08.txt.gz’ saved [325403502/325403502]



In [None]:
def get_coefs(word, *arr):
    """Pair a vocabulary token with its vector.

    Args:
        word: The token (first field of an embedding line).
        *arr: The remaining fields of the line, as numeric strings.

    Returns:
        ``(word, vector)`` where ``vector`` is a float32 NumPy array.
    """
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    """Load a space-separated text embedding file into a ``{token: vector}`` dict.

    Each data line is ``token v1 v2 ... vN``. A leading word2vec-style header
    line (``<vocab_size> <dim>``) is skipped so it does not end up in the
    vocabulary as a bogus one-dimensional "word". Blank lines are ignored.

    Args:
        path: Path to the decompressed embedding text file.

    Returns:
        dict mapping each token to a float32 NumPy vector.
    """
    embeddings = {}
    # Explicit encoding so non-ASCII tokens load the same way regardless of
    # the platform's default locale.
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            fields = line.strip().split(' ')
            # Header detection: first line, exactly two integer fields.
            if line_no == 0 and len(fields) == 2 and all(p.isdigit() for p in fields):
                continue
            # ''.split(' ') yields [''] for an empty line; skip it.
            if fields == ['']:
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings

def apply_embed_model(model, line, dim=300):
    """Embed a phrase as the mean of its tokens' vectors.

    Args:
        model: Mapping from token to vector (supports ``in`` and indexing).
        line: Phrase whose tokens are separated by single spaces.
        dim: Length of the embedding vectors.

    Returns:
        A ``(dim,)`` array holding the average token vector, or ``None`` as
        soon as a token is not found in ``model``.
    """
    tokens = line.split(' ')
    accumulator = np.zeros((dim,))

    for token in tokens:
        # A single out-of-vocabulary token invalidates the whole phrase.
        if token not in model:
            return None
        accumulator += model[token]

    return accumulator / len(tokens)

In [None]:
!gunzip numberbatch-en-19.08.txt.gz
conceptnet_model = load_embeddings("numberbatch-en-19.08.txt")

In [None]:
def _embed_phrase(phrase):
    """Look up the averaged ConceptNet vector for one cleaned phrase (None if any token is missing)."""
    return apply_embed_model(conceptnet_model, phrase)


# Embed both sides of every triple.
df_full['word1_cn_embed'] = df_full['clean_word1'].apply(_embed_phrase)
df_full['word2_cn_embed'] = df_full['clean_word2'].apply(_embed_phrase)

In [None]:
# Missing values per column; the embedding columns hold None wherever
# apply_embed_model could not find one of the phrase's tokens.
df_full.isna().sum()

word1                0
relation             0
word2                0
clean_word1          0
clean_word2          0
word1_cn_embed    1154
word2_cn_embed     809
relation_num         0
dtype: int64

In [None]:
# Keep only rows without any missing value (in any column), i.e. triples for
# which both embedding lookups succeeded.
# Note: despite the name, this frame derives from df_full, which pools the
# train and validation splits.
df_wn_train_conceptnet = df_full.dropna()

In [None]:
# Turn the per-row vectors into dense 2-D matrices (one row per triple).
word1_embeds = np.stack(df_wn_train_conceptnet['word1_cn_embed'].to_numpy())
word2_embeds = np.stack(df_wn_train_conceptnet['word2_cn_embed'].to_numpy())

# Two candidate feature sets: the element-wise sum of both vectors, and their
# side-by-side concatenation.
train_embeds = np.add(word1_embeds, word2_embeds)
train_embeds_concat = np.hstack((word1_embeds, word2_embeds))

# Integer class labels aligned row-for-row with the matrices above.
y_train = df_wn_train_conceptnet['relation_num'].to_numpy()

In [None]:
# sum embeds
# max_iter is raised above the library default because the solver previously
# stopped at the iteration limit before converging (see the warning that was
# emitted by this cell).
logreg = LogisticRegression(multi_class='multinomial', max_iter=1000)
logreg.fit(train_embeds, y_train)

# Accuracy on the data the model was fitted on (not a held-out estimate).
train_preds = logreg.predict(train_embeds)
print('Train score:', accuracy_score(y_train, train_preds))

Train score: 0.8046680897225041


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# AiKg validation

In [None]:
# Arrays copied into the working directory by the first cell.
# NOTE(review): presumably one embedding row per (object, subject) pair — confirm.
tmp_object = np.load('objects.npy')
tmp_subject = np.load('subjects.npy')

# Element-wise sum, matching how train_embeds is built from word1 + word2.
valid_embeds = np.add(tmp_object, tmp_subject)

# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
df_grouped = pd.read_pickle('df_conceptnet.pkl')

In [None]:
# Predicates that are all relabelled to the single 'non_hypernym' class.
non_hypernym_predicates = [
    'methodUsedBy',
    'otherEntityUsedBy',
    'materialUsedBy',
    'taskUsedBy',
    'OtherEntityIncludedBy',
    'materialIncludedBy',
    'methodIncludedBy',
    'methodEvaluatedBy',
    'OtherEntityPredictedBy',
]

# One membership test replaces nine separate equality assignments.
is_non_hypernym = df_grouped['predicate'].isin(non_hypernym_predicates)
df_grouped.loc[is_non_hypernym, 'predicate'] = 'non_hypernym'
df_grouped.head()

Unnamed: 0,object,subject,predicate,clean_subject,clean_object,predicate_num
0,image_segmentation,object_recognition,non_hypernym,object recognition,image segmentation,8
1,knowledge_representation,semantic_network,non_hypernym,semantic network,knowledge representation,7
2,decision_making_process,collaborative_bi,non_hypernym,collaborative bi,decision making process,7
3,model_oriented_formalism,safety_constraint,non_hypernym,safety constraint,model oriented formalism,8
4,model_oriented_formalism,satisfaction_of_refinement_obligation,non_hypernym,satisfaction of refinement obligation,model oriented formalism,8


In [None]:
# Class balance after collapsing the predicates.
df_grouped['predicate'].value_counts()

non_hypernym    721156
hypernym         80425
Name: predicate, dtype: int64

In [None]:
# Encode the validation labels. This rebinds `le`, replacing the encoder that
# was fitted on the WordNet relations.
le = LabelEncoder()
df_grouped['predicate_num'] = le.fit_transform(df_grouped['predicate'].values)

# The classifier was trained with '_hypernym' -> 0 and '_non_hypernym' -> 1.
# This second encoder only produces matching integers if it sees exactly these
# two classes in this order, so fail loudly instead of scoring against
# misaligned labels.
assert list(le.classes_) == ['hypernym', 'non_hypernym'], (
    f"Unexpected validation classes {list(le.classes_)}; "
    "label ids would not line up with the training encoding"
)

y = df_grouped['predicate_num'].values
df_grouped.head()

Unnamed: 0,object,subject,predicate,clean_subject,clean_object,predicate_num
0,image_segmentation,object_recognition,non_hypernym,object recognition,image segmentation,1
1,knowledge_representation,semantic_network,non_hypernym,semantic network,knowledge representation,1
2,decision_making_process,collaborative_bi,non_hypernym,collaborative bi,decision making process,1
3,model_oriented_formalism,safety_constraint,non_hypernym,safety constraint,model oriented formalism,1
4,model_oriented_formalism,satisfaction_of_refinement_obligation,non_hypernym,satisfaction of refinement obligation,model oriented formalism,1


In [None]:
# Labels and feature matrix should have the same number of rows.
y.shape, valid_embeds.shape

((801581,), (801581, 300))

In [None]:
# Score the classifier fitted on the summed WordNet embeddings against the
# summed validation embeddings.
# NOTE(review): the value_counts output above suggests the validation classes
# are heavily imbalanced, so plain accuracy may be misleading — consider also
# inspecting confusion_matrix (already imported).
valid_preds = logreg.predict(valid_embeds)
print('Validation score:', accuracy_score(y, valid_preds))

Validation score: 0.1362894080573267
