In [None]:
import sys
import os
import logging
import pandas as pd
import numpy as np
from pathlib import Path
import nest_asyncio
# Patch asyncio via nest_asyncio before any project code runs in this kernel.
nest_asyncio.apply()

# Directory that contains the project's ``main`` module.  It can be overridden
# with the EMBED_NORM_SRC environment variable so the notebook is not tied to
# one machine; the original hard-coded location remains the default.
# (``__file__`` is not defined inside a notebook, so it cannot be used here.)
utils_path = os.path.abspath(
    os.environ.get("EMBED_NORM_SRC") or '/home/wadmin/embed_norm/apps/embed_norm/src'
)
if utils_path not in sys.path:
    sys.path.append(utils_path)
from main import Environment, CacheManager, Config, Pipeline

Environment.configure_logging()
Environment.setup_environment(utils_path=utils_path)

# The project root is taken to be two levels above the current working directory.
project_path = Path.cwd().parents[1]

# Root of the on-disk cache, created together with its per-artifact subfolders.
cache_dir = project_path.joinpath("apps", "embed_norm", "cached_datasets")
cache_dir.mkdir(parents=True, exist_ok=True)
for subdir in ("embeddings", "datasets", "final_data"):
    cache_dir.joinpath(subdir).mkdir(parents=True, exist_ok=True)

# Seeds passed to Config for the positive and negative samples.
pos_seed = 54_321
neg_seed = 67_890

# Catalog dataset identifiers.
dataset_name = "rtx_kg2.int"
nodes_dataset_name = "integration.int.rtx.nodes"
edges_dataset_name = "integration.int.rtx.edges"

# Category filter and the embedding models to evaluate.
categories = ["All Categories"]
model_names = [
    "OpenAI",
    "PubMedBERT",
    "BioBERT",
    "BlueBERT",
    "SapBERT",
]
# The API key is read from the environment and never hard-coded.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    # Passing a string to sys.exit prints it to stderr and still exits with
    # status 1, so the reader learns why the notebook stopped.
    sys.exit("OPENAI_API_KEY environment variable is not set; export it before running this notebook.")

# Overall sample budget and the fraction of it reserved for positive examples.
total_sample_size = 1_000
positive_ratio = 0.2

# Positive count is truncated towards zero; negatives take the remainder.
positive_n = int(positive_ratio * total_sample_size)
negative_n = total_sample_size - positive_n

# Suffix embedded in cache file names so different sample splits do not collide.
cache_suffix = f"_pos_{positive_n}_neg_{negative_n}"

# Bundle every experiment parameter defined above into a single Config object.
config = Config(
    cache_dir=cache_dir,
    pos_seed=pos_seed,
    neg_seed=neg_seed,
    dataset_name=dataset_name,
    nodes_dataset_name=nodes_dataset_name,
    edges_dataset_name=edges_dataset_name,
    categories=categories,
    model_names=model_names,
    total_sample_size=total_sample_size,
    positive_ratio=positive_ratio,
    positive_n=positive_n,
    negative_n=negative_n,
    cache_suffix=cache_suffix,
)

# The cache manager and the pipeline share the same cache directory / project root.
cache_manager = CacheManager(cache_dir=cache_dir)
pipeline = Pipeline(
    config=config,
    cache_manager=cache_manager,
    package_name="matrix",
    project_path=project_path,
)

In [None]:
# Unpack the four values returned by the pipeline's loader.
# Note: this rebinds `categories`, replacing the list defined in the config cell.
(
    categories,
    positive_datasets,
    negative_datasets,
    nodes_df,
) = pipeline.load_data()

### Load Sample Edges DataFrame

In [None]:
# Location of the cached edges DataFrame; the suffix encodes the sample split.
edges_cache_name = f"edges_df{cache_suffix}.pkl"
edges_cache_file = cache_manager.cache_dir / "datasets" / edges_cache_name

In [None]:
def load_edges():
    """Load the edges dataset from the pipeline catalog as a pandas DataFrame.

    Objects exposing a callable ``toPandas`` (as Spark DataFrames do) are
    converted with it; anything else is returned unchanged.  The check is
    duck-typed because no Spark DataFrame class is imported in this notebook;
    the previous ``isinstance(..., SparkDataFrame)`` test raised ``NameError``,
    which the broad ``except`` below turned into an empty result.

    Returns:
        The loaded edges, or an empty ``pd.DataFrame`` if loading failed
        (the failure is logged with its traceback).
    """
    try:
        loaded = pipeline.catalog.load(config.edges_dataset_name)
        to_pandas = getattr(loaded, "toPandas", None)
        if callable(to_pandas):
            loaded = to_pandas()
        return loaded
    except Exception as e:
        # Best-effort fallback kept from the original; the traceback is logged
        # so the root cause is not hidden.
        # NOTE(review): if get_or_compute persists whatever this returns, an
        # empty frame may end up cached -- confirm and clear the cache file
        # after a failed load.
        logging.exception(f"Error loading edges: {e}")
        return pd.DataFrame()


# Try the cache first; only hit the catalog when nothing is cached yet.
edges_df = cache_manager.load_cached_data(edges_cache_file)
if edges_df is None:
    edges_df = cache_manager.get_or_compute(edges_cache_file, load_edges)

In [None]:
# Quick structural overview: column names, dtypes / non-null counts, first rows.
edge_columns = edges_df.columns.tolist()
print("Columns in edges_df:", edge_columns)
edges_df.info()
edges_df.head()

In [None]:
# Print the first five edges row by row.  The original loop iterated over the
# undefined name `edge_dfs` (NameError); the DataFrame is called `edges_df`.
# `head(5)` replaces the manual enumerate/break counter.
for row_label, edge_row in edges_df.head(5).iterrows():
    print(row_label)
    print(edge_row)

# for ds in positive_datasets:
#     print(ds)
#     print(positive_datasets[ds].shape)
#     print(positive_datasets[ds].columns)
#     print(positive_datasets[ds].head())
#     print(positive_datasets[ds].info())
#     print(positive_datasets[ds].describe())
#     print(positive_datasets[ds].isnull().sum())
#     break
    # print(positive_datasets[ds]["label"].value_counts())
    # print(positive_datasets[ds]["label"].value_counts(normalize=True))

# print(positive_datasets)

In [None]:
# Column names of the edges table.
edge_columns = edges_df.columns.tolist()
print("Columns in edges_df:", edge_columns)

In [None]:
# Dtypes, non-null counts and memory usage of the edges table.
edges_df.info()

In [None]:
# First rows of the edges table (rendered as the cell's output).
edges_df.head()

In [None]:
# Distinct values of the 'subject' column.
unique_subjects = edges_df['subject'].unique()
print("Unique subjects:", unique_subjects)

In [None]:
# Distinct values of the 'predicate' column.
unique_predicates = edges_df['predicate'].unique()
print("Unique predicates:", unique_predicates)

In [None]:
# Distinct values of the 'object' column.
unique_objects = edges_df['object'].unique()
print("Unique objects:", unique_objects)

In [None]:
# Display the raw 'edge_attributes' column before it is expanded below.
edges_df['edge_attributes']

In [None]:
# Turn each 'edge_attributes' entry into its own row of columns
# (presumably the entries are dict-like -- verify against the data).
edge_attributes_expanded = edges_df['edge_attributes'].apply(pd.Series)

# Put the expanded columns next to the remaining original columns; the nested
# source column is dropped from the result, while `edges_df` itself is untouched.
edges_without_attributes = edges_df.drop(columns=['edge_attributes'])
edges_expanded_df = pd.concat(
    [edges_without_attributes, edge_attributes_expanded],
    axis=1,
)

edges_expanded_df

In [None]:
# Number of missing values per column.
edges_df.isnull().sum()

In [None]:
# Create a graph using NetworkX
import networkx as nx
import matplotlib.pyplot as plt

# Fixed seed for the force-directed layout so the figure is identical on every
# run; an unseeded spring_layout starts from random positions.
layout_seed = 42

# One directed edge per (subject, object) pair, carrying the remaining columns
# as edge attributes.
# NOTE(review): a DiGraph keeps a single edge per ordered node pair, so rows that
# share subject and object (e.g. different predicates) overwrite each other --
# confirm this is acceptable for the visualisation.
G = nx.from_pandas_edgelist(edges_df, 'subject', 'object', edge_attr=True, create_using=nx.DiGraph())

plt.figure(figsize=(8,6))
pos = nx.spring_layout(G, seed=layout_seed)
nx.draw(G, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=2000, arrowsize=20)
plt.title('Graph Representation of Edges')
plt.show()