<a href="https://colab.research.google.com/github/deep1003/deep1003/blob/master/PatentSBERT_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

# Report whether a CUDA GPU is usable from PyTorch and, if so, which one.
# torch.cuda.is_available() returns True when a GPU can be used in the current
# environment and False otherwise.
if not torch.cuda.is_available():
    # No usable GPU: print the warning message.
    print("GPU is not available")
else:
    # A GPU is usable: fetch the name of the first device (index 0) and print it.
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU is available: {gpu_name}")


GPU is available: NVIDIA L4


In [2]:
# Query nvidia-smi for the GPU name. IPython's `!` capture returns the command's
# output lines as a list, so gpu_name[0] is the first reported GPU.
gpu_name = !nvidia-smi --query-gpu=gpu_name --format=csv,noheader
print(f"Assigned GPU: {gpu_name[0]}")


Assigned GPU: NVIDIA L4


In [3]:
# Step 1: Load and clean the data

# Mount Google Drive to access files.
# The Colab runtime cannot see the user's own files directly, so data stored
# on Google Drive must be mounted before it can be used.
# Once mounted, Drive files are reachable under the '/content/drive' directory.

from google.colab import drive
import pandas as pd

# Mount Google Drive
# Attach the Google Drive file system at '/content/drive'.
# After this, files on Google Drive can be read and written directly from Colab.
drive.mount('/content/drive')

# From here on, data files can be loaded with pandas by giving their path inside Google Drive.


Mounted at /content/drive


In [4]:
# Path of the CSV file on Google Drive.
# '/content/drive' is the root under which Google Drive is mounted; the user's
# own Drive paths are appended to it.
# This example reads 'patents.csv' from the 'data' folder under 'MyDrive'.
# Adjust the path to match your own Drive layout.
file_path = '/content/drive/MyDrive/data/patents.csv'

# Load the CSV file
# pd.read_csv() reads the CSV file into a DataFrame.
# sep=';' is the option to use when the CSV is semicolon-delimited.
df2 = pd.read_csv(file_path, sep=';')

# Check that the data loaded successfully:
# print the first 5 rows (the message below is Korean for "Data loaded successfully!").
print("데이터 로딩 성공!")
print(df2.head())


데이터 로딩 성공!
    appln_id  earliest_filing_year person_ctry_code cpc_class_symbol  \
0  380934210                  2012               US      G06F  30/30   
1  380934210                  2012               DE      G06F  30/30   
2  380934210                  2012               US      G06F2119/12   
3  380934210                  2012               DE      G06F2119/12   
4  404968208                  2012               US     G06F   8/443   

   nace2_code                                        appln_title  \
0        26.2                Soft-bounded hierarchical synthesis   
1        26.2                Soft-bounded hierarchical synthesis   
2        26.2                Soft-bounded hierarchical synthesis   
3        26.2                Soft-bounded hierarchical synthesis   
4        26.2  Optimized memory configuration deployed on exe...   

                                      appln_abstract  
0  A large block synthesis (LBS) process pre-opti...  
1  A large block synthesis (LBS) proc

In [5]:
# List the files in the data folder of the mounted Drive.
!ls /content/drive/MyDrive/data

AI		       DL_abstract		   patentAbstractsW2V_300_10_5.model
AI_abstract_2022_2023  DL_title_2022_2023	   patents.csv
ai_all		       Economic_Networks_PS.ipynb  patents_embeddings.csv
AI_title	       g_patent_abstract.tsv	   patents_processed_bigrams.csv
ALL		       mobility			   patents_processed_tokens.csv
DL		       pat			   TSNE.png


In [6]:
!pip install --upgrade gensim -q

In [7]:
# Show the installed NumPy version (the recorded run reported 1.26.4).
import numpy
numpy.version.version

'1.26.4'

In [8]:
# Print the installed gensim version (the recorded run reported 4.3.3).
import gensim
print(gensim.__version__)

4.3.3


In [9]:
import numpy as np
import pandas as pd
import os
from gensim.models import Word2Vec

import json
import gensim
from nltk import sent_tokenize
import nltk
# Download the NLTK 'punkt' tokenizer data and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
# English stop words as a set.
stopWords = set(stopwords.words('english'))
# NOTE(review): this rebinds `stopwords` from the corpus reader imported above to a
# plain list of English stop words, so a later `stopwords.words(...)` call would
# fail unless the import is re-executed first — confirm this is intended.
stopwords = nltk.corpus.stopwords.words('english')

import h5py
import tqdm

import itertools
import gensim, logging
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full

# Configure root logging at INFO level; each record shows timestamp, level name and message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import gc
from multiprocessing import Pool
# from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.parsing.preprocessing import remove_stopwords



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
def ele0(x):
    """Return the element at index 0 of the indexable ``x``."""
    first = x[0]
    return first

# Loads from JSON
def json_l(x):
    """Decode the JSON document ``x``; return ``[]`` when it cannot be decoded.

    Besides malformed JSON text (ValueError), this also covers inputs that are
    not text at all, such as the None/NaN values pandas uses for missing cells,
    for which json.loads raises TypeError.
    """
    try:
        return json.loads(x)
    except (ValueError, TypeError):
        return []

# Preprocessing (sentence tokenizer + basic Gensim - lowercases, tokenizes, de-accents (optional))
def prepro(x):
    """Split the text ``x`` into sentences and tokenize each one.

    Sentences come from ``sent_tokenize``; every sentence is passed through
    ``gensim.utils.simple_preprocess`` with ``min_len=1``. Returns a list with
    one token list per sentence.
    """
    return [
        gensim.utils.simple_preprocess(sentence, min_len=1)
        for sentence in sent_tokenize(x)
    ]

# Loaded phraser, cached per file version: (absolute path, mtime_ns, size) -> model.
_BIGRAM_CACHE = {}


def _load_bigram(path='bigram.m'):
    """Return the phraser stored at ``path``, re-reading it only when the file changes."""
    full_path = os.path.abspath(path)
    info = os.stat(full_path)
    key = (full_path, info.st_mtime_ns, info.st_size)
    if key not in _BIGRAM_CACHE:
        # Keep only the current version in memory.
        _BIGRAM_CACHE.clear()
        _BIGRAM_CACHE[key] = gensim.models.phrases.Phraser.load(full_path)
    return _BIGRAM_CACHE[key]


def bigrammer(x):
    """Apply the phrase model saved in 'bigram.m' to every sentence of ``x``.

    ``x`` is an iterable of sentences (token lists); the result is a list with
    the phrased version of each sentence, in the same order.

    The model used to be loaded from disk on every call, i.e. once per row when
    this function is mapped over a DataFrame column. It is now loaded once and
    reused until 'bigram.m' is rewritten (detected via modification time and size).
    """
    bigram = _load_bigram()
    return [bigram[sentence] for sentence in x]

# Loaded TF-IDF artifacts: absolute path -> ((mtime_ns, size), loaded object).
_TFIDF_CACHE = {}


def _load_tfidf_artifact(path, loader):
    """Return ``loader(path)``, cached until the file at ``path`` changes on disk."""
    full_path = os.path.abspath(path)
    info = os.stat(full_path)
    signature = (info.st_mtime_ns, info.st_size)
    cached = _TFIDF_CACHE.get(full_path)
    if cached is None or cached[0] != signature:
        cached = (signature, loader(full_path))
        _TFIDF_CACHE[full_path] = cached
    return cached[1]


def tfidfer(para):
    """Return the TF-IDF representation of one document.

    ``para`` is an iterable of sentences (token lists). The sentences are
    chained into one token stream, converted to a bag-of-words with the
    dictionary saved in 'docs_dict_11_2.d', and weighted with the model saved
    in 'model_tfidf_11_2.m'.

    Both artifacts used to be re-read from disk on every call (once per row
    under ``map``); they are now loaded once and reused until the files change.
    """
    model_tfidf = _load_tfidf_artifact('model_tfidf_11_2.m', gensim.models.TfidfModel.load)
    docs_dict = _load_tfidf_artifact('docs_dict_11_2.d', Dictionary.load)
    return model_tfidf[docs_dict.doc2bow(itertools.chain(*para))]

def preprocess_1(data):
    """Add the JSON-encoded tokenization of each abstract to ``data``.

    Runs ``prepro`` on every entry of ``data.appln_abstract_st_r`` and stores
    the result, serialized with ``json.dumps``, in the new column
    'appln_abstract_prepro'. The frame is modified in place and returned.
    """
    encoded = [json.dumps(prepro(text)) for text in data.appln_abstract_st_r]
    data['appln_abstract_prepro'] = encoded
    return data

def preprocess_2(data):
    """Add the phrase-merged (bigram) version of each tokenized abstract.

    Decodes 'appln_abstract_prepro' into the helper column 'no_json', applies
    ``bigrammer`` to each decoded document, and stores the JSON-encoded result
    in 'appln_abstract_prepro_bi'. The frame is modified in place and returned.
    """
    data['no_json'] = [json.loads(raw) for raw in data['appln_abstract_prepro']]
    phrased = [json.dumps(bigrammer(doc)) for doc in data['no_json']]
    data['appln_abstract_prepro_bi'] = phrased
    return data

def preprocess_3(data):
    """Add the TF-IDF representation of each phrased abstract.

    Decodes 'appln_abstract_prepro_bi' into the helper column 'no_json_tf',
    applies ``tfidfer`` to each decoded document, and stores the JSON-encoded
    result in 'tfidf'. The frame is modified in place and returned.
    """
    data['no_json_tf'] = [json.loads(raw) for raw in data['appln_abstract_prepro_bi']]
    weighted = [json.dumps(tfidfer(doc)) for doc in data['no_json_tf']]
    data['tfidf'] = weighted
    return data

def get_data_path(filename):
    """Return the path of ``filename`` inside the '/content' data directory.

    When the environment variable DEEP_QUANT_ROOT is set, the path is joined
    onto it as well. NOTE(review): because the base directory is absolute,
    os.path.join presumably discards that root on POSIX, leaving the result
    unchanged — confirm whether a relative base directory was intended.
    """
    base_dir = '/content'
    resolved = os.path.join(base_dir, filename)
    root = os.environ.get('DEEP_QUANT_ROOT')
    if base_dir == '.' or root is None:
        return resolved
    return os.path.join(root, resolved)

def abstract_generator_nonmult(df):
    """Yield one flat token list per abstract in ``df['appln_abstract_prepro_bi']``.

    Each cell is decoded with ``json_l`` and its sentences (token lists) are
    concatenated into a single list. Null cells (None/NaN) are skipped; cells
    that ``json_l`` cannot decode come back as ``[]`` and yield an empty list.

    The generator makes a single pass over the column and then ends. The
    previous implementation restarted from the first row forever when the
    column had no nulls, and raised AttributeError (``list`` has no
    ``isnull``) when it did; its ``isnull().sum() < 0`` branch could never run.
    """
    column = df['appln_abstract_prepro_bi']
    for raw in column.dropna():
        decoded = json_l(raw)
        yield list(itertools.chain(*decoded))


In [11]:
# Load the patent sample CSV (named "sample_1k") straight from the AI-Growth-Lab GitHub repository.
data = pd.read_csv('https://raw.githubusercontent.com/AI-Growth-Lab/Patent_p2p_similarity_w2v/main/patent_dtatset_sample_1k.csv')

In [12]:
# Strip stop words from every abstract with Gensim's remove_stopwords and keep
# the cleaned strings in the new column 'appln_abstract_st_r'.
tokens_without_sw = list(map(remove_stopwords, data.appln_abstract))
data['appln_abstract_st_r'] = tokens_without_sw

In [13]:
# Tokenize the abstracts. preprocess_1 adds its column to `data` in place and
# returns the same frame, so data_1 and data refer to one object.
data_1 = preprocess_1(data)
# Write the frame to 'data.csv' in the working directory.
data_1.to_csv('data.csv')

In [14]:
# Stream the saved CSV as a token corpus.
# NOTE(review): Text8Corpus is meant for the plain-text 'text8' corpus and
# presumably splits the file on whitespace, so CSV separators and JSON quoting
# would end up inside the tokens — confirm this is intended.
# NOTE(review): the file is presumably re-read on every pass, so later rewrites of
# data.csv change what `sentences` yields — verify.
sentences = Text8Corpus(get_data_path('data.csv'))

In [15]:
# Learn phrase (bigram) statistics from the corpus; min_count=1 and threshold=1
# are the count and score cut-offs passed to Phrases.
bigram = Phrases(sentences, min_count=1, threshold=1, connector_words=ENGLISH_CONNECTOR_WORDS)

In [16]:
# Wrap the trained Phrases model in a Phraser; this rebinds `bigram` to the wrapper.
bigram = gensim.models.phrases.Phraser(bigram)

In [17]:
# Persist the phraser; bigrammer() loads it back from this file name.
bigram.save('bigram.m')

In [18]:
# Add the phrased abstracts; preprocess_2 modifies its argument in place and returns it.
data_2 = preprocess_2 (data_1)



In [19]:
# Overwrite 'data.csv' with the frame that now includes the phrased abstracts.
data_2.to_csv('data.csv')

In [20]:
# Train Word2Vec on `sentences`: 300-dimensional vectors, window of 80,
# min_count=1 (no token is dropped for rarity), 14 worker threads.
# NOTE(review): no seed is passed, so the vectors presumably differ between runs.
model_with_phrases = Word2Vec(sentences=sentences, vector_size=300, window=80, min_count=1, workers=14)

In [21]:
# Persist the trained Word2Vec model to 'w2v.m'.
model_with_phrases.save('w2v.m')

In [22]:
# Vocabulary of the trained model (the keys of its word vectors).
voc = model_with_phrases.wv.index_to_key

In [23]:
# Build Gensim Dictionary
# The Word2Vec vocabulary is passed as a single document so that each of its
# tokens receives a dictionary id.
docs_dict = Dictionary([voc])
docs_dict.compactify()
# Saved under the file name that tfidfer() loads.
docs_dict.save('docs_dict_11_2.d')

In [24]:
# Fit TF-IDF on the bag-of-words of every item yielded by `sentences`.
model_tfidf = TfidfModel((docs_dict.doc2bow(x) for x in sentences), id2word=docs_dict)
# Saved under the file name that tfidfer() loads.
model_tfidf.save('model_tfidf_11_2.m')

In [25]:
# Add the per-document TF-IDF column (preprocess_3 modifies its argument in place and returns it).
data_3 = preprocess_3 (data_2)
# Overwrite 'data.csv' with the frame that now includes the TF-IDF column.
data_3.to_csv('data.csv')
# Generator over the flattened token lists of the phrased abstracts.
iterator = abstract_generator_nonmult(data_3)

In [26]:
# Document matrix TF-IDF weighted
# Lazy pipeline: token list -> bag-of-words -> TF-IDF weights -> dense vector
# with one entry per dictionary id.
docs_vecs = (
    sparse2full(model_tfidf[docs_dict.doc2bow(tokens)], len(docs_dict))
    for tokens in iterator
)

In [27]:
# Selected Word-Embeddings
# Row i holds the Word2Vec vector of the token with dictionary id i.
emb_vecs_selftrained = np.vstack([model_with_phrases.wv[docs_dict.get(i)] for i in range(len(docs_dict))])
n_abstracts = len(data_2)
h5f = h5py.File('docvecs_23_4_test.h5', 'a')
# The file is opened in append mode, so a dataset left over from an earlier run
# would make create_dataset() fail. Drop it first so the cell can be re-run.
if 'weighted_tfidf_docvecs' in h5f:
    del h5f['weighted_tfidf_docvecs']
# Take the vector width from the embedding matrix instead of hard-coding 300.
dataset = h5f.create_dataset('weighted_tfidf_docvecs', (n_abstracts, emb_vecs_selftrained.shape[1]))

In [29]:
# Generate the document vectors: each TF-IDF weighted bag-of-words row is
# multiplied by the embedding matrix and written to the HDF5 dataset.
# Rows are processed in batches so the dense (documents x vocabulary) matrix
# never has to be held in memory all at once.
DOCVEC_BATCH_SIZE = 256

pbar = tqdm.tqdm(total=n_abstracts)
start = 0

try:
    while start < n_abstracts:
        # islice stops quietly if the generator runs out before n_abstracts.
        wanted = min(DOCVEC_BATCH_SIZE, n_abstracts - start)
        vectors = list(itertools.islice(docs_vecs, wanted))
        if not vectors:
            break
        a = np.vstack(vectors)

        # Perform the matrix multiplication
        b = np.dot(a, emb_vecs_selftrained)

        # Store the results
        end = start + len(b)
        dataset[start:end] = b
        start = end

        pbar.update(len(b))

    if start < n_abstracts:
        logging.warning(
            "Only %d of %d document vectors were produced; the remaining rows keep their fill value",
            start, n_abstracts,
        )
finally:
    # Always release the progress bar and the HDF5 file, even when a batch fails.
    pbar.close()
    h5f.close()



  0%|          | 0/1000 [00:00<?, ?it/s][A
  0%|          | 0/1000 [00:59<?, ?it/s]
100%|██████████| 1000/1000 [00:01<00:00, 868.86it/s] 


In [30]:
# # Generates document-vectors
# pbar = tqdm.tqdm(total=n_abstracts)
# start = 0
# #while docs_vecs:
# for i in range(1):
#     a = np.vstack(next(docs_vecs) for _ in range(n_abstracts))
#     pbar.update(n_abstracts)
#     b = np.dot(a,emb_vecs_selftrained)
#     end = start + len(b)
#     dataset[start:end] = b
#     start = end
# pbar.close()
# h5f.close()

In [31]:
# Reopen the document-vector file read-only; this rebinds h5f and dataset.
h5f = h5py.File('docvecs_23_4_test.h5', 'r')
dataset = h5f['weighted_tfidf_docvecs']
# Display the stored vector at row index 1.
dataset[1]

array([-4.66187418e-01,  4.24385160e-01,  6.13570273e-01, -2.05526218e-01,
        5.40466905e-01, -1.07957959e+00, -7.51435935e-01,  8.77665997e-01,
       -1.97183281e-01, -5.08409917e-01,  1.74840117e+00,  7.11972356e-01,
       -3.21179330e-02,  3.90553445e-01, -3.13643813e-01, -4.39723909e-01,
        8.50277185e-01, -4.51629490e-01, -7.76157081e-01, -3.78596008e-01,
       -2.19097942e-01,  7.04338431e-01,  6.29782140e-01,  1.11158180e+00,
       -3.43680859e-01, -1.78028896e-01, -8.57983902e-03,  8.04007471e-01,
       -2.19281077e-01, -3.10893685e-01,  3.90577078e-01, -3.58160913e-01,
        4.31230277e-01, -5.62856555e-01,  2.29281396e-01, -8.71628761e-01,
       -1.71930432e-01, -1.27106500e+00,  5.38294315e-01, -1.00609004e-01,
       -3.87729645e-01, -8.01400185e-01, -4.24801081e-01, -4.45363879e-01,
       -2.35128462e-01,  4.06721771e-01,  7.21274465e-02,  1.30740061e-01,
       -7.62064457e-01, -1.97145771e-02, -9.67772827e-02,  2.99931169e-01,
       -6.39845908e-01,  

In [32]:
import scipy
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# make scipy.spatial.distance available on every SciPy version.
import scipy.spatial.distance

# Cosine distance between the document vectors stored at rows 9 and 8.
scipy.spatial.distance.cosine(dataset[9],dataset[8])

0.5386800140641987

In [33]:
# Annoy
!pip install annoy
from annoy import AnnoyIndex
t = AnnoyIndex(dataset.shape[1])  # Length of item vector that will be indexed

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m645.1/647.5 kB[0m [31m28.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=550691 sha256=7060e66844891e6e89196867cf4cbda192e997bd28b513112889bbe7f4009236
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


  t = AnnoyIndex(dataset.shape[1])  # Length of item vector that will be indexed


In [34]:
# Have Annoy build the index in the file 'vecs_build.annoy' (called before any items are added).
t.on_disk_build('vecs_build.annoy')

True

In [35]:
# Add every row of the document-vector dataset to the index, keyed by its row number.
for i in tqdm.tqdm(range(dataset.shape[0])):
    t.add_item(i, dataset[i])

100%|██████████| 1000/1000 [00:00<00:00, 20835.77it/s]


In [36]:
# Build the index with 20 trees.
t.build(20)

True

In [37]:
# Open the on-disk index through a second handle for querying.
# The dimension (300) and the metric must match the index that was built;
# naming 'angular' explicitly also avoids Annoy's FutureWarning for an omitted metric.
tt = AnnoyIndex(300, 'angular')
tt.load('vecs_build.annoy', prefault=False)

  tt = AnnoyIndex(300)


True

In [38]:
# Open the similarities HDF5 file in append mode (mode 'a').
simH5 = h5py.File('similarities_23_2.h5', 'a')

In [39]:
# simH5 is opened in append mode, so 'sims_50' may survive from an earlier run
# and would make create_dataset() fail. Remove the stale dataset first so this
# cell can be re-run; its rows are regenerated by the similarity loop.
if 'sims_50' in simH5:
    del simH5['sims_50']
# Rows are (item, neighbour, distance). The first axis is resizable (maxshape None).
# Note that this rebinds `dataset`, which previously referred to the document vectors.
dataset = simH5.create_dataset('sims_50',(10000000,3), maxshape=(None,3), compression="gzip")

In [40]:
# One tenth of the number of indexed items, rounded.
round(t.get_n_items()/10)

100

In [41]:
# For every indexed item, collect its nearest neighbours and append
# (item, neighbour, distance) rows to the 'sims_50' dataset in batches.
SIMS_FLUSH_EVERY = 1000  # number of query items buffered before each HDF5 write

n_items = tt.get_n_items()
pbar = tqdm.tqdm(total=n_items)

blocks = []
start = 0
end = 0
counter = 0
try:
    for i in range(n_items):
        # Ask for 100 neighbours, then 50 more per retry, until the farthest
        # returned neighbour is at distance >= 0.5 or the next request size
        # would reach 500.
        j = 100
        tresh = 0.0
        while tresh < 0.5:
            y = tt.get_nns_by_item(i, j, search_k=50, include_distances=True)
            tresh = y[1][-1]
            j += 50
            if j >= 500:
                break
        block = np.array(list(zip([i for _ in range(len(y[0]))],y[0],y[1])))
        end = end + len(block)
        blocks.append(block)
        counter += 1
        # Flush on a full batch and also after the last item; previously a final
        # partial batch (item count not a multiple of the batch size) was never written.
        if counter == SIMS_FLUSH_EVERY or i == n_items - 1:
            blocks_np = np.vstack(blocks)
            if end >= dataset.shape[0]:
                dataset.resize((end+(end - dataset.shape[0]),3))
            dataset[start:end] = blocks_np
            blocks = []
            gc.collect()
            start = end
            pbar.update(counter)
            counter = 0

    # Drop the unused pre-allocated rows so the dataset holds only real results;
    # an all-zero padding row cannot be told apart from a genuine (0, 0, 0.0) entry.
    dataset.resize((end, 3))
finally:
    # Always release the progress bar and the HDF5 file, even when a query fails.
    pbar.close()
    simH5.close()

100%|██████████| 1000/1000 [00:00<00:00, 1635.35it/s]

In [42]:
def lookup(x):
    """Return the neighbours of item ``x`` as an array of (x, neighbour, distance) rows.

    ``tree`` (the index queried) and ``j`` (the number of neighbours requested)
    are not parameters; they are read from the enclosing namespace when the
    function is called. NOTE(review): no assignment to ``tree`` appears in the
    cells shown here — confirm where it is defined.
    """
    neighbours, distances = tree.get_nns_by_item(x, j, search_k=5, include_distances=True)
    rows = [(x, neighbour, distance) for neighbour, distance in zip(neighbours, distances)]
    return np.array(rows)