<a href="https://colab.research.google.com/github/flora0110/OpenCSR_RL/blob/main/F2FSpareS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import pandas as pd
from scipy.sparse import dok_matrix

In [4]:
# read GenericsKB to Pandas DataFrame
df = pd.read_csv('/content/drive/MyDrive/Research/GenericsKB-Best.tsv',sep='\t')

In [5]:
df[:10]

Unnamed: 0,SOURCE,TERM,QUANTIFIER,GENERIC SENTENCE,SCORE
0,Waterloo,aa battery,,AA batteries maintain the settings if the powe...,0.350923
1,ARC,aardvark female,,Aardvark females appear to come into season on...,0.570737
2,ARC,aardvark hole,,Aardvark holes are used by small buck as a res...,0.574909
3,Waterloo,aardvark skin,,Aardvark skin is thick and sparsely haired.,0.444273
4,WordNet3.0,aardvark,,Aardvark isa mammal.,1.0
5,ARC,aardvark,,Aardvarks also dig to get food.,0.590054
6,ARC,aardvark,,Aardvarks also eat locusts and a type of grass...,0.724097
7,ARC,aardvark,,"Aardvarks also require sandy soil, as opposed ...",0.708718
8,ARC,aardvark,,Aardvarks are a nocturnal creature.,0.712522
9,ARC,aardvark,,Aardvarks are about the size of a small pig.,0.620401


## make Sparse Fact-to-Fact Index S

In [6]:
# get unique concept
# use set becuase we only care if the element exsit, so we dont need index
# set not allow duplicate element
concepts = set(df["TERM"].unique())

In [7]:
concepts_list = list(concepts)
concepts_list[:10]

[nan,
 'andean region',
 'obsessive thought',
 'alcoholic patient',
 'direct modulation',
 'modern timpani',
 'garlic clone',
 'weak tree',
 'filamentous type',
 'forked trunk']

In [8]:
# create a concept2idx(dic)，let every concept has index
concept2idx = {c: i for i, c in enumerate(concepts)}

In [9]:
#---- tes code----

from itertools import islice

# print first 10 element
for key, value in islice(concept2idx.items(), 10):
    print(key, value)


nan 0
andean region 1
obsessive thought 2
alcoholic patient 3
direct modulation 4
modern timpani 5
garlic clone 6
weak tree 7
filamentous type 8
forked trunk 9


In [10]:
# create a spare matrix S，size=|F| × |F|
# S = dok_matrix((len(concepts), len(concepts)), dtype=bool)

### convert "GENERIC SENTENCE" to list of concept

In [11]:
how_many_sentence_we_use = 10000

In [12]:
import nltk
from nltk.stem import WordNetLemmatizer

In [13]:
# download necessary NLTK data (only need to do this once)
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("maxent_ne_chunker")
nltk.download("words")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
df.shape

(1020868, 5)

In [15]:
short_df = df.sample(n=how_many_sentence_we_use, random_state=42).reset_index(drop=True)
short_df.shape

(10000, 5)

In [16]:
facts_GENERICSENTENCE = short_df['GENERIC SENTENCE']

In [17]:
facts_GENERICSENTENCE[:10]

0    Hawks start breeding when they are one year old .
1                           Companies have candidates.
2                          Most male owls bring meals.
3    Colon cancer is now the leading cause of cance...
4    Some objects have sharp geometric features suc...
5    Citizenship is in many ways a difficult and pe...
6    Some lizards change color in response to their...
7                          Every computer has a clock.
8                          All glands receive vessels.
9    Alcohol affects the central nervous system of ...
Name: GENERIC SENTENCE, dtype: object

In [18]:
concepts_GENERICSENTENCE = []
# 定義lemmatizer
lemmatizer = WordNetLemmatizer()

for idx, row in short_df.iterrows():
  #print(str(idx)+" : "+row["GENERIC SENTENCE"])
  tokens = nltk.word_tokenize(row["GENERIC SENTENCE"])
  #print(tokens)
  # perform part-of-speech tagging to identify nouns and noun phrases
  pos_tags = nltk.pos_tag(tokens)
  #print(pos_tags)
  concepts = set()
  for i in range(len(pos_tags)):
    if pos_tags[i][1].startswith("NN"):  # check if the word is a noun
      # 使用lemmatizer將單數名詞還原為原形
      lemma = lemmatizer.lemmatize(pos_tags[i][0], pos=pos_tags[i][1][0].lower())
      # 加入到概念中
      concepts.add(lemma)
  #print(concepts)
  #print("\n")
  concepts_GENERICSENTENCE.append(concepts)
    

In [19]:
concepts_GENERICSENTENCE

[{'Hawks', 'year'},
 {'Companies', 'candidate'},
 {'bring', 'meal', 'owl'},
 {'Colon', 'cancer', 'cause', 'death', 'nonsmoker'},
 {'corner', 'crease', 'feature', 'object'},
 {'Citizenship', 'life', 'way'},
 {'color', 'lizard', 'mood', 'response'},
 {'clock', 'computer'},
 {'gland', 'vessel'},
 {'Alcohol', 'driver', 'system'},
 {'Silk', 'resistant', 'water'},
 {'energy', 'plant'},
 {'Slugs', 'marigold'},
 {'cache', 'space'},
 {'computer', 'graphic', 'picture'},
 {'centrifugation', 'measure', 'rate', 'sediment'},
 {'Protection', 'detection'},
 {'Hate', 'community', 'crime', 'fabric', 'society'},
 {'Communication', 'basis', 'decision'},
 {'Geomancies', 'divination'},
 {'baby', 'developing', 'nutrition'},
 {'Pressure', 'pressure'},
 {'birth', 'boar', 'dog', 'drainage', 'man', 'remain', 'system'},
 {'Packers', 'cell', 'part'},
 {'compound', 'room', 'substance', 'temperature'},
 {'Ear', 'microscopic', 'mite', 'organism', 'tick'},
 {'Ethics', 'practice', 'structure', 'theory'},
 {'Bats', 'day

In [20]:
type(concepts_GENERICSENTENCE)

list

In [21]:
facts_GENERICSENTENCE = facts_GENERICSENTENCE.tolist()

In [22]:
type(facts_GENERICSENTENCE)

list

#### 把concepts_GENERICSENTENCE先存起來

In [87]:
import pickle


# Open a file in binary mode
with open('concepts_GENERICSENTENCE_how_many_sentence_we_use.pkl', 'wb') as f:
    # Use pickle to dump the list to the file
    pickle.dump(concepts_GENERICSENTENCE, f)


In [88]:
"""
import pickle

# Open the file in binary mode
with open('concepts_GENERICSENTENCE_how_many_sentence_we_use.pkl', 'rb') as f:
    # Use pickle to load the list from the file
    concepts_GENERICSENTENCE = pickle.load(f)

# The list can now be used in the program
print(concepts_GENERICSENTENCE)
"""

"\nimport pickle\n\n# Open the file in binary mode\nwith open('concepts_GENERICSENTENCE_how_many_sentence_we_use.pkl', 'rb') as f:\n    # Use pickle to load the list from the file\n    concepts_GENERICSENTENCE = pickle.load(f)\n\n# The list can now be used in the program\nprint(concepts_GENERICSENTENCE)\n"

### 利用concepts建立fact to fact的關係

In [23]:
import numpy as np
from scipy.sparse import lil_matrix

In [24]:
# 創建一個空的稀疏矩陣
num_facts = len(facts_GENERICSENTENCE)
S = lil_matrix((num_facts, num_facts), dtype=bool)

In [25]:
print(len(facts_GENERICSENTENCE))
print(num_facts)
print(type(S))

10000
10000
<class 'scipy.sparse._lil.lil_matrix'>


**連接規則**
if we can create a link fi → fj (i.e.,
Sij = 1) as follows:
- i != j
- |I| >= 1 
  - where I is the set of concepts that are mentioned in both fi and fj . Note that we remove the most frequent 100 concepts (e.g.,
human) from I.
- |I| < |fi|
  - We do not create links when all
concepts in fi are mentioned in fj , which are
usually redundant.
- |fj | − |I| >= 2. 
  - We create links only when there are more than two unseen concepts in fj which are not in fi

#### create a list with most frequent 100 concepts

In [91]:
how_many_cocept_too_common = 50

In [92]:
# create a list with most frequent 100 concepts
from collections import defaultdict
from nltk.tokenize import word_tokenize

In [93]:
# Step 1: Create a dictionary to store the frequency of each concept
freq_dict = defaultdict(int)

# Step 2-3: Tokenize each sentence and count the frequency of each concept
for sentence in concepts_GENERICSENTENCE:
  for concept in sentence:
      freq_dict[concept] += 1


In [94]:
# test code
first_10_items = list(freq_dict.items())[:10]

print(first_10_items)

[('Hawks', 4), ('year', 76), ('Companies', 9), ('candidate', 2), ('bring', 1), ('meal', 15), ('owl', 4), ('cancer', 71), ('nonsmoker', 1), ('cause', 65)]


In [95]:
# Step 4: Sort the dictionary by value in descending order
sorted_freq_dict = sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)

In [98]:
# Step 5: Extract the top 100 concepts
most_common_concepts = [x[0] for x in sorted_freq_dict[:how_many_cocept_too_common]]

In [99]:
most_common_concepts

['part',
 'people',
 'water',
 'plant',
 'animal',
 'food',
 'life',
 'cell',
 'body',
 'child',
 'energy',
 'time',
 'disease',
 'system',
 'woman',
 'way',
 'form',
 'area',
 'tree',
 'world',
 'problem',
 'year',
 'growth',
 'cancer',
 'effect',
 'soil',
 'specie',
 'process',
 'bird',
 'cause',
 'source',
 'color',
 'health',
 'air',
 'insect',
 'material',
 'Water',
 'number',
 'group',
 'People',
 'family',
 'leaf',
 'development',
 'seed',
 'death',
 'environment',
 'thing',
 'blood',
 'organism',
 'light']

#### connect facts -> Sparse Fact-to-Fact Index S

In [100]:
# 定義連接規則
def connection_rules(concepts1, concepts2):
    common_concepts = concepts1.intersection(concepts2)
    common_concepts.difference_update(most_common_concepts)
    new_concepts = concepts2 - concepts1
    if(len(common_concepts) <= 0):
      return False
    elif(len(new_concepts) < 2):
      return False
    return len(common_concepts) > 0 and len(new_concepts) >= 2

In [117]:
# 對所有事實建立稀疏索引
for i in range(len(facts_GENERICSENTENCE)):
    for j in range(i+1, len(facts_GENERICSENTENCE)):
        if connection_rules(concepts_GENERICSENTENCE[i], concepts_GENERICSENTENCE[j]):
            S[i, j] = True
            S[j, i] = True  # 因為S是對稱矩陣，所以需要同時更新對稱元素

##### 存起來

In [119]:
from scipy.sparse import save_npz

# 存成 .npz 檔案
saving_usage_S = S.tocsr()
save_npz('S.npz', saving_usage_S)


In [None]:
#from scipy.sparse import load_npz

#load_out_S = load_npz('S.npz')


## sparse retrieval

#### 計算句子相似度

In [139]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [137]:
sentence1 = "The quick brown fox jumps over the lazy dog."
sentence2 = "A quick brown dog jumps on the log."


In [140]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

In [141]:
# 載入DistilBERT模型和tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [142]:
# 將句子轉換成模型需要的格式
inputs1 = tokenizer(sentence1, return_tensors='pt', padding=True, truncation=True)
inputs2 = tokenizer(sentence2, return_tensors='pt', padding=True, truncation=True)


In [143]:
# 使用Bi-encoder計算兩個句子之間的相似度
with torch.no_grad():
    # 獲取兩個句子的embedding
    embeddings1 = model(**inputs1).last_hidden_state[:, 0, :]
    embeddings2 = model(**inputs2).last_hidden_state[:, 0, :]
    
    # 計算兩個embedding之間的cosine similarity
    similarity = torch.cosine_similarity(embeddings1, embeddings2)
    
print(similarity.item())


0.9886928796768188


#### 計算句子相似度 function

In [145]:
def sim(sentence1,sentence2):
  # 將句子轉換成模型需要的格式
  inputs1 = tokenizer(sentence1, return_tensors='pt', padding=True, truncation=True)
  inputs2 = tokenizer(sentence2, return_tensors='pt', padding=True, truncation=True)
  # 使用Bi-encoder計算兩個句子之間的相似度
  with torch.no_grad():
    # 獲取兩個句子的embedding
    embeddings1 = model(**inputs1).last_hidden_state[:, 0, :]
    embeddings2 = model(**inputs2).last_hidden_state[:, 0, :]
    
    # 計算兩個embedding之間的cosine similarity
    similarity = torch.cosine_similarity(embeddings1, embeddings2)
    
  return similarity.item()


#### 一跳測試

In [148]:
import random

In [151]:
# 先做一跳
question = "what can help alleviate global warming?"
# 隨機找起點
indices = random.sample(range(0, len(facts_GENERICSENTENCE)), 10)

start_index = 0
max_sim = -1
for i in indices:
  temp = sim(facts_GENERICSENTENCE[i],question)
  print(temp)
  if (temp>max_sim):
    start_index = i
    max_sim = temp

0.8108698725700378
0.8012885451316833
0.8772512078285217
0.864356279373169
0.8401616215705872
0.8798949718475342
0.852785050868988
0.8323057889938354
0.8357231616973877
0.9053924679756165


In [152]:
print(start_index)
print(facts_GENERICSENTENCE[start_index])
print(max_sim)

3147
Addiction is a reversal of healthy aims.
0.9053924679756165


In [1]:
start_index_vec = np.zeros((1, S.shape[0]))
start_index_vec[0, start_index] = 1

NameError: ignored

In [None]:
# 將 start_index_vec 和 S 做矩陣乘法
connected_nodes = np.dot(start_index_vec, S)


In [122]:
def fact_to_concept(fact):
  tokens = nltk.word_tokenize(fact)
  #print(tokens)
  # perform part-of-speech tagging to identify nouns and noun phrases
  pos_tags = nltk.pos_tag(tokens)
  #print(pos_tags)
  concepts = set()
  for i in range(len(pos_tags)):
    if pos_tags[i][1].startswith("NN"):  # check if the word is a noun
      # 使用lemmatizer將單數名詞還原為原形
      lemma = lemmatizer.lemmatize(pos_tags[i][0], pos=pos_tags[i][1][0].lower())
      concepts.add(lemma)
  return concepts

In [None]:
def sparse_retrieval(question, kb, concept2idx, S):
    # 将问题分解为概念列表
    concepts = fact_to_concept(question)
    concept_idxs = [concept2idx[c] for c in concepts if c in concept2idx]
    
    # 对于每个包含至少一个概念的事实，计算与问题的相似度得分
    scores = np.zeros(len(kb))
    for i, row in kb.iterrows():
        # 跳过缺失的事实
        if pd.isnull(row['GENERIC SENTENCE']):
            continue
        # 计算事实和问题之间的共同概念数量
        fact_concepts = set(row['GENERIC SENTENCE'].split(' '))
        common_concepts = fact_concepts.intersection(set(concepts))
        # 如果事实中包含至少一个概念，则计算其与问题的相似度得分
        if len(common_concepts) > 0:
            j = concept_idxs[0]  # 取第一个概念的索引作为行索引
            k = concept2idx[row['TERM']]  # 取事实中的TERM概念的索引作为列索引
            # 在稀疏张量 S 中查找行索引 j 和列索引 k，如果存在则将得分加入相似度得分
            if j in S and k in S[j].indices:
                scores[i] = S[j][k]
    
    # 根据得分从高到低对事实进行排序并返回
    sorted_indices = np.argsort(scores)[::-1]
    return kb.iloc[sorted_indices]

In [None]:
print(df.columns)

Index(['SOURCE', 'TERM', 'QUANTIFIER', 'GENERIC SENTENCE', 'SCORE'], dtype='object')


In [None]:
sparse_retrieval("what can help alleviate global warming?",df,concept2idx,S)

ValueError: ignored

# RL

In [None]:
import random

In [None]:
def find_start(question):
  return 0

In [None]:
def step(state, action, grid, hop_num, done):
    """
    在环境中执行一个动作，更新状态并返回奖励、下一个状态以及完成标志。

    参数：
        state (tuple): 当前状态，格式为int, 表示第幾個fact index。
        action (str): 要执行的动作,走去相連的Fact。
        grid (spare matrix): 表示有相連的fact
        hop_num(int): 現在跳了幾步

    返回：
        tuple: 包含新的状态、奖励和完成标志的元组，格式为 (新状态，奖励，完成标志)。
    """
    # 根據action做1 hop
    """
    if(action == ?):
      hop to this fact (index)
    """

    # 檢查是否走到中止步數了
    """
    if(hop_num>=stop_hop_num):
      done = tuue
    """
    
    next_state = (row, col)
    
    if(next_state[0] == end[0] and next_state[1] == end[1]):
        print("arrive!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        return state, 100, True
    done = False
    if grid[next_state[0]][next_state[1]] == 'obstacle':
        print("obstacle!")
        reward = -10 
    else:
        print("normal path")
        reward = 0
    return next_state, reward, done