In [1]:
from dask_jobqueue import SLURMCluster
from gensim.models import Word2Vec
import pandas as pd
import dask.dataframe as dd
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity
import numpy as np
from dask import delayed, compute

# Compose SLURM script
cluster = SLURMCluster(queue='caslake', cores=10, memory='60GB', 
                       processes=10, walltime='03:00:00', interface='ib0',
                       job_extra=['--account=macs30123']
                      )

# Request resources
cluster.scale(jobs=1)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 48307 instead
2024-05-24 07:36:11,916 - distributed.scheduler - ERROR - Couldn't gather keys {"('head-1-100000-reset_index-eb9f0719bb98ccd8ca931b5be243c5df', 0)": ['tcp://172.25.2.12:35453']} state: ['memory'] workers: ['tcp://172.25.2.12:35453']
NoneType: None
2024-05-24 07:36:15,175 - distributed.scheduler - ERROR - Shut down workers that don't have promised key: ['tcp://172.25.2.12:35453'], ('head-1-100000-reset_index-eb9f0719bb98ccd8ca931b5be243c5df', 0)
NoneType: None
2024-05-24 07:37:31,212 - distributed.scheduler - ERROR - Couldn't gather keys {"('head-1-100000-reset_index-eb9f0719bb98ccd8ca931b5be243c5df', 0)": ['tcp://172.25.2.12:39047']} state: ['memory'] workers: ['tcp://172.25.2.12:39047']
NoneType: None
2024-05-24 07:37:35,686 - distributed.scheduler - ERROR - Shut down workers that don't have promised key: ['tcp://172.25.2.12:39047'], ('head-1-100000-reset_index-eb9f0719bb98ccd8ca931b5be243c5df', 

In [2]:
from dask.distributed import Client

client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://172.25.0.65:48307/status,

0,1
Dashboard: http://172.25.0.65:48307/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://172.25.0.65:38197,Workers: 0
Dashboard: http://172.25.0.65:48307/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [19]:
import ast
import re

patent_vectors = pd.read_csv("patent_vectors.csv")

def process_abstract_vector(vector_str):
    # Remove [[ and ]], as well as extra spaces and newline characters
    clean_str = vector_str.replace('[', '').replace(']', '').replace('\n', ' ').strip()
    # Split by spaces and remove empty strings
    vector_list = [num for num in clean_str.split(' ') if num]
    return vector_list

# Apply to the abstract_vector column of the DataFrame
patent_vectors['abstract_vector'] = patent_vectors['abstract_vector'].apply(process_abstract_vector)
patent_vectors['network_embedding'] = patent_vectors['network_embedding'].apply(process_abstract_vector)

def convert_to_float_list(vector_list):
    # Convert scientific notation strings to regular float representation
    float_list = [float(num) for num in vector_list]
    # Convert to numpy ndarray
    vector_array = np.array(float_list)
    return vector_array

# Step 2: Convert elements in the list to floats and store them in numpy ndarray
patent_vectors['abstract_vector'] = patent_vectors['abstract_vector'].apply(convert_to_float_list)
patent_vectors['network_embedding'] = patent_vectors['network_embedding'].apply(convert_to_float_list)

# Print the type of each element in the network_embedding column
print("Type of elements in the network_embedding column:")
print(patent_vectors['network_embedding'].apply(type))

# Print the type of each element in the abstract_vector column
print("\nType of elements in the abstract_vector column:")
print(patent_vectors['abstract_vector'].apply(type))

patent_vectors.head()

Type of elements in the network_embedding column:
0        <class 'numpy.ndarray'>
1        <class 'numpy.ndarray'>
2        <class 'numpy.ndarray'>
3        <class 'numpy.ndarray'>
4        <class 'numpy.ndarray'>
                  ...           
81978    <class 'numpy.ndarray'>
81979    <class 'numpy.ndarray'>
81980    <class 'numpy.ndarray'>
81981    <class 'numpy.ndarray'>
81982    <class 'numpy.ndarray'>
Name: network_embedding, Length: 81983, dtype: object

Type of elements in the abstract_vector column:
0        <class 'numpy.ndarray'>
1        <class 'numpy.ndarray'>
2        <class 'numpy.ndarray'>
3        <class 'numpy.ndarray'>
4        <class 'numpy.ndarray'>
                  ...           
81978    <class 'numpy.ndarray'>
81979    <class 'numpy.ndarray'>
81980    <class 'numpy.ndarray'>
81981    <class 'numpy.ndarray'>
81982    <class 'numpy.ndarray'>
Name: abstract_vector, Length: 81983, dtype: object


Unnamed: 0,id,network_embedding,citedby,abstract_vector
0,US-2023206765-A1,"[-0.0048047528, 0.0072327768, -0.00047358777, ...",{},"[-0.463403672, 0.148732617, 0.322311848, 0.071..."
1,US-2023013887-A1,"[-0.0073038526, 0.0049484577, 0.0051397895, -0...",{},"[-0.5442608, -0.0457125828, 0.00946517009, 0.1..."
2,US-2022109956-A1,"[-0.0012319116, 0.0016017314, -0.0036226427, 0...",{'CN115241958A': 'æ·±å\x9c³å¸\x82ä»\x8aæ\x9c\x...,"[-0.162252143, -0.0185790751, 0.193540916, -0...."
3,US-2022358597-A1,"[0.0054357667, -0.0038316157, -0.0063883718, -...",{},"[-0.0662348121, 0.0612511486, 0.379705906, -0...."
4,US-11567474-B2,"[-0.435433, 0.25674063, 0.243856564, 0.4580893...",{},"[0.125050649, 0.264000684, 0.473446935, 0.0168..."


In [20]:
patent_vectors = patent_vectors[['id', 'network_embedding', 'abstract_vector']]
patent_vectors = dd.from_pandas(patent_vectors, npartitions=10)
citing_df = dd.read_parquet("citing_df.parquet")

# Define a function to combine vectors
def combine_vectors(abstract_vector, network_embedding):
    # ensure input is NumPy array
    if not isinstance(abstract_vector, np.ndarray):
        abstract_vector = np.array(abstract_vector)
    if not isinstance(network_embedding, np.ndarray):
        network_embedding = np.array(network_embedding)
    return np.concatenate((abstract_vector, network_embedding))

# Apply the function using Dask
patent_vectors['combined_vector'] = patent_vectors.apply(lambda row: combine_vectors(row['abstract_vector'], row['network_embedding']), axis=1, meta=('x', 'f8'))


In [21]:
# Define cosine similarity function
def cosine_similarity(vec1, vec2):
    # ensure input is NumPy array
    if not isinstance(vec1, np.ndarray):
        vec1 = np.array(vec1)
    if not isinstance(vec2, np.ndarray):
        vec2 = np.array(vec2)

    # Ensure vectors have the same dimension by padding with zeros
    max_len = max(len(vec1), len(vec2))
        
    if len(vec1) < max_len:
        vec1 = np.pad(vec1, (0, max_len - len(vec1)), 'constant')
    if len(vec2) < max_len:
        vec2 = np.pad(vec2, (0, max_len - len(vec2)), 'constant')
    
    return sk_cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
    
# Define KED function
def ked(citing_vector, cited_vectors):
    # deal with NaN 
    citing_vector = np.nan_to_num(citing_vector)
    cited_vectors = [np.nan_to_num(cited_vec) for cited_vec in cited_vectors]
    
    similarities = []
    for cited_vec in cited_vectors:
        try:
            similarity = 1 - cosine_similarity(citing_vector, cited_vec)
        except:
            similarity = 0
        similarities.append(similarity)
    return np.mean(similarities)

# Apply the KED calculation
def calculate_ked(citing_id, citing_vector, cited_vectors):
    ked_value = ked(citing_vector, cited_vectors)
    return pd.DataFrame({'citing_id': [citing_id], 'ked': [ked_value]})

# Use Dask delayed to calculate KED
joined_df = citing_df.merge(patent_vectors[['id', 'combined_vector']].rename(columns={'id': 'citing_id'}), on='citing_id', how='left') \
                     .merge(patent_vectors[['id', 'combined_vector']].rename(columns={'id': 'cited_id'}), on='cited_id', how='left')

grouped_df = joined_df.groupby('citing_id').agg(list).reset_index()

In [None]:
# From Pandas DataFrame to Dask DataFrame
grouped_df = dd.from_pandas(grouped_df_pd, npartitions=10)
# Calculate KED for each citing_id
ked_results = grouped_df.apply(lambda row: delayed(calculate_ked)(row['citing_id'], row['combined_vector_x'], row['combined_vector_y']), axis=1).compute()

# Convert results to Dask DataFrame
ked_df = dd.from_delayed(ked_results, meta=[('citing_id', 'str'), ('ked', 'float')])

# Calculate the average KED
average_ked = ked_df['ked'].mean().compute()
print("Average Knowledge Exploration Distance:", average_ked)

In [10]:
grouped_df2 = grouped_df.head(100000)
grouped_df2 = dd.from_pandas(grouped_df2, npartitions=10)
# Calculate KED for each citing_id
ked_results = grouped_df2.apply(lambda row: delayed(calculate_ked)(row['citing_id'], row['combined_vector_x'], row['combined_vector_y']), axis=1).compute()

# Convert results to Dask DataFrame
ked_df = dd.from_delayed(ked_results, meta=[('citing_id', 'str'), ('ked', 'float')])

# Calculate the average KED
average_ked = ked_df['ked'].mean().compute()
print("Average Knowledge Exploration Distance:", average_ked)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



Average Knowledge Exploration Distance: 0.7704132741051808


In [14]:
grouped_df2 = grouped_df.head(100000)
grouped_df2 = dd.from_pandas(grouped_df2, npartitions=10)
# Calculate KED for each citing_id
ked_results = grouped_df2.apply(lambda row: delayed(calculate_ked)(row['citing_id'], row['combined_vector_x'], row['combined_vector_y']), axis=1).compute()

# Convert results to Dask DataFrame
ked_df = dd.from_delayed(ked_results, meta=[('citing_id', 'str'), ('ked', 'float')])

# Calculate the average KED
average_ked = ked_df['ked'].mean().compute()
print("Average Knowledge Exploration Distance:", average_ked)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



Average Knowledge Exploration Distance: 0.04509103979633467


In [18]:
grouped_df2 = grouped_df.head(100000)
grouped_df2 = dd.from_pandas(grouped_df2, npartitions=10)
# Calculate KED for each citing_id
ked_results = grouped_df2.apply(lambda row: delayed(calculate_ked)(row['citing_id'], row['combined_vector_x'], row['combined_vector_y']), axis=1).compute()

# Convert results to Dask DataFrame
ked_df = dd.from_delayed(ked_results, meta=[('citing_id', 'str'), ('ked', 'float')])

# Calculate the average KED
average_ked = ked_df['ked'].mean().compute()
print("Average Knowledge Exploration Distance:", average_ked)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



Average Knowledge Exploration Distance: 4.938884785529252e-06


In [22]:
grouped_df2 = grouped_df.head(81983)
grouped_df2 = dd.from_pandas(grouped_df2, npartitions=10)
# Calculate KED for each citing_id
ked_results = grouped_df2.apply(lambda row: delayed(calculate_ked)(row['citing_id'], row['combined_vector_x'], row['combined_vector_y']), axis=1).compute()

# Convert results to Dask DataFrame
ked_df = dd.from_delayed(ked_results, meta=[('citing_id', 'str'), ('ked', 'float')])

# Calculate the average KED
average_ked = ked_df['ked'].mean().compute()
print("Average Knowledge Exploration Distance:", average_ked)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



Average Knowledge Exploration Distance: 2.4024924297984364e-06


In [23]:
# ked for each 4 part
values = [0.7704132741051808, 0.04509103979633467, 4.938884785529252e-06, 2.4024924297984364e-06]

# weights
weights = [100000, 100000, 100000, 81983]

# calculate final ked
final_ked = sum(v * w for v, w in zip(values, weights)) / sum(weights)

final_ked


0.21349411424635903