In [None]:
# Load the datasets registered under the `integration.int.rtx.*` catalog keys.
# NOTE(review): `catalog` is never defined in this notebook; presumably it is
# injected by the session that launches the kernel — confirm before relying on
# Restart & Run All.
nodes = catalog.load("integration.int.rtx.nodes")
edges = catalog.load("integration.int.rtx.edges")

In [None]:
import polars as pl

# Load the TSV file into a Polars DataFrame
# NOTE(review): this is an absolute, machine-specific path; the cell fails on
# any machine where the file is not present at exactly this location.
df = pl.read_csv("/home/wadmin/edges_c.tsv", separator='\t')




In [None]:
# Show the column names first, then a five-row preview of the frame
for preview in (df.columns, df.head(5)):
    print(preview)

In [None]:
# Dump the first five rows field by field, with a banner line printed before
# and after each row so the records are easy to tell apart.
print(df.columns)
for index, row in enumerate(df.iter_rows(named=True)):
    if index == 5:
        break
    banner = f"{index} **************** {index}"
    print(banner)
    # named=True yields one dict per row, keyed by column name
    for column, value in row.items():
        print(f"{column}: {value}")
    print(banner)

In [None]:
import json
from tqdm import tqdm

# Profile every column of `df`: collect its distinct values and count its nulls
unique_entries = {}
empty_counts = {}

for column in tqdm(df.columns, desc="Processing columns"):
    series = df[column]
    unique_entries[column] = series.unique().to_list()
    unique_count = series.n_unique()
    # "Empty" here means null entries, as reported by is_null()
    empty_count = series.is_null().sum()
    empty_counts[column] = empty_count
    # tqdm.write prints the report lines without breaking the progress bar
    tqdm.write(f"Column: {column}")
    tqdm.write(f"Unique Entries ({unique_count}): {unique_entries[column]}")
    tqdm.write(f"Empty Count: {empty_count}\n")

# Persist both summaries together as a single JSON document
output = {
    "unique_entries": unique_entries,
    "empty_counts": empty_counts
}
with open("output.json", "w") as f:
    json.dump(output, f, indent=4)

In [None]:
import sys
import os
import logging
import pandas as pd
import numpy as np
from pathlib import Path
import nest_asyncio
# nest_asyncio patches asyncio so that an event loop can be re-entered.
nest_asyncio.apply()
# Put the embed_norm sources on sys.path so that `main` can be imported below.
# NOTE(review): absolute, machine-specific path.
utils_path = os.path.abspath('/home/wadmin/embed_norm/apps/embed_norm/src')
if utils_path not in sys.path:
    sys.path.append(utils_path)
# This import only resolves after the sys.path change above, so it must stay here.
from main import Environment, CacheManager, Config, Pipeline

# Project-specific bootstrap calls; their effects are defined in `main`.
Environment.configure_logging()
# utils_path = Path(__file__).parent.resolve()
Environment.setup_environment(utils_path=utils_path)

# The project path is taken three levels above the current working directory;
# the cache directory and its two sub-directories are created if missing.
project_path = Path.cwd().parents[2]
cache_dir = project_path.joinpath("apps", "embed_norm", "cached_datasets")
cache_dir.mkdir(parents=True, exist_ok=True)
for subdir in ("embeddings", "datasets"):
    cache_dir.joinpath(subdir).mkdir(parents=True, exist_ok=True)

# Run parameters: seeds, dataset identifiers and model list used to build the
# Config object further down in this cell.
pos_seed = 54321
neg_seed = 67890
dataset_name = "rtx_kg2.int"
nodes_dataset_name = "integration.int.rtx.nodes"
edges_dataset_name = "integration.int.rtx.edges"
categories = ["All Categories"]
model_names = ["OpenAI", "PubMedBERT", "BioBERT", "BlueBERT", "SapBERT"]

# The key is read from the environment, never hard-coded.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    # Passing a string still exits with status 1, but also tells the reader
    # why the cell stopped instead of halting silently.
    sys.exit("OPENAI_API_KEY environment variable is not set")

# Sample sizes: a fixed total split into positives and negatives by ratio.
total_sample_size = 1000
positive_ratio = 0.2
positive_n = int(total_sample_size * positive_ratio)
negative_n = total_sample_size - positive_n
# Suffix that encodes the split sizes.
cache_suffix = f"_pos_{positive_n}_neg_{negative_n}"

# Bundle every run parameter defined above into the project's Config object.
config = Config(
    cache_dir=cache_dir,
    pos_seed=pos_seed,
    neg_seed=neg_seed,
    dataset_name=dataset_name,
    nodes_dataset_name=nodes_dataset_name,
    edges_dataset_name=edges_dataset_name,
    categories=categories,
    model_names=model_names,
    total_sample_size=total_sample_size,
    positive_ratio=positive_ratio,
    positive_n=positive_n,
    negative_n=negative_n,
    cache_suffix=cache_suffix,
)

# Build the pipeline and load the data.
# Note that `categories` is rebound here to whatever load_data() returns,
# replacing the list that was passed into Config above.
# NOTE(review): presumably positive_datasets / negative_datasets map a category
# name to a sampled frame and nodes_df is the full nodes table — confirm
# against Pipeline.load_data.
cache_manager = CacheManager(cache_dir=cache_dir)
pipeline = Pipeline(config=config, cache_manager=cache_manager, package_name="matrix", project_path=project_path)
categories, positive_datasets, negative_datasets, nodes_df = pipeline.load_data()


In [None]:
# Structural overview of the nodes table: columns, sample rows, null counts
# and the category breakdown.
print("Nodes DataFrame Columns:", list(nodes_df.columns))
print("\nSample of Nodes DataFrame:", nodes_df.head(), sep="\n")

print("\nMissing values per column in Nodes DataFrame:", nodes_df.isnull().sum(), sep="\n")

print("\nUnique Categories:", nodes_df['category'].unique(), sep="\n")

# dropna=False keeps rows whose category is missing in the tally
rows_per_category = nodes_df['category'].value_counts(dropna=False)
print("\nNumber of rows per category:", rows_per_category, sep="\n")

In [None]:
# Report the shape of every sampled dataset, positives first.
# The loop variables are deliberately not named `df`: that name holds the
# Polars edge table loaded at the top of this notebook, and rebinding it here
# would make the earlier cells operate on the wrong object when re-run.
print("\nPositive Datasets Categories:")
for category_name, dataset in positive_datasets.items():
    print(f"Category: {category_name}, Shape: {dataset.shape}")

print("\nNegative Datasets Categories:")
for category_name, dataset in negative_datasets.items():
    print(f"Category: {category_name}, Shape: {dataset.shape}")

In [None]:
def display_columns(datasets):
    """Print the name and the column list of every dataset in a mapping.

    Args:
        datasets: Mapping of dataset name to an object exposing ``.columns``.
    """
    for label, frame in datasets.items():
        column_names = list(frame.columns)
        print(f"\nDataset: {label}")
        print(f"Columns: {column_names}")

# List the columns of every dataset, positives first and negatives second
for polarity, datasets in (("Positive", positive_datasets), ("Negative", negative_datasets)):
    print(f"\n{polarity} Datasets Columns:")
    display_columns(datasets)

In [None]:
def display_labels_sample(datasets, sample_size=5):
    """Print the leading ``labels`` values of each dataset that has that column.

    Datasets without a ``labels`` column are skipped without any output.

    Args:
        datasets: Mapping of dataset name to a frame with ``.columns``.
        sample_size: Number of leading label values to print per dataset.
    """
    for label, frame in datasets.items():
        if "labels" not in frame.columns:
            continue
        leading_labels = frame['labels'].head(sample_size).to_list()
        print(f"\nDataset: {label}")
        print("Labels Sample:", leading_labels)

# Show a few label values per dataset, positives first and negatives second
for polarity, datasets in (("Positive", positive_datasets), ("Negative", negative_datasets)):
    print(f"\n{polarity} Datasets Labels Sample:")
    display_labels_sample(datasets)

In [None]:
import pandas as pd
import numpy as np

print(nodes_df.head())

# 1. Null count for every column
missing_counts = nodes_df.isnull().sum()
print("\nMissing values per column:", missing_counts, sep="\n")

# 2. Identify columns with unhashable types
def is_column_unhashable(col):
    """Report whether the first non-null value of a column cannot be hashed.

    Only the first non-null entry is inspected; later values are not checked.

    Args:
        col: A pandas Series.

    Returns:
        True if hashing the first non-null value raises TypeError, otherwise
        False (including when the column has no non-null values).
    """
    try:
        hash(col.dropna().iloc[0])
    except TypeError:
        # e.g. list or ndarray values
        return True
    except IndexError:
        # Nothing left after dropna(): the column is entirely null
        return False
    else:
        return False

# Collect the columns whose first non-null value is unhashable
unhashable_columns = []
for column_name in nodes_df.columns:
    if is_column_unhashable(nodes_df[column_name]):
        unhashable_columns.append(column_name)

print("\nColumns with unhashable types:", unhashable_columns)

# 3. Replace list / ndarray values with tuples so the rows become hashable.
#    This rewrites the affected columns of nodes_df in place.
for col in unhashable_columns:
    nodes_df[col] = nodes_df[col].apply(
        lambda value: tuple(value) if isinstance(value, (list, np.ndarray)) else value
    )

print("\nConverted unhashable columns to tuples.")

# 4. Count the rows that remain after dropping exact duplicates
unique_rows_count = len(nodes_df.drop_duplicates())
print(f"\nTotal number of unique rows: {unique_rows_count}")

# 5. Rows per category, keeping rows whose category is missing
rows_per_category = nodes_df['category'].value_counts(dropna=False)
print("\nNumber of rows per category:", rows_per_category, sep="\n")

# 6. Missing values per column for each category
# The boolean null flags are summed per category, which gives the same counts
# as groupby().apply(lambda x: x.isnull().sum()) without a Python-level apply.
# dropna=False keeps the rows whose category is itself missing, so this table
# covers the same groups as the value_counts(dropna=False) breakdown in step 5.
missing_counts_per_category = (
    nodes_df.isnull().groupby(nodes_df['category'], dropna=False).sum()
)

print("\nMissing values per column for each category:")
print(missing_counts_per_category)

# (Optional) Improved readability
# for category, group in nodes_df.groupby('category'):
#     print(f"\nCategory: {category}")
#     missing = group.isnull().sum()
#     print(missing)

In [None]:
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style globally to the figures drawn below.
sns.set(style="whitegrid")

# Header of one serialized edge: "has_edge_to:<object>_relation:<predicate>_props:".
# The lookahead requires the properties object to open right after the header
# without consuming it, so the JSON decoder can start exactly on the "{".
_EDGE_HEADER_RE = re.compile(r'has_edge_to:(.*?)_relation:(.*?)_props:(?=\{)')


# Function to parse edge_text into a list of edge dictionaries
def parse_edge_text(edge_text):
    """Parse a serialized edge string into a list of edge dictionaries.

    Each edge is expected in the form
    ``has_edge_to:<object>_relation:<predicate>_props:<JSON object>``;
    several edges may follow one another in the same string.

    Args:
        edge_text: The serialized edges. Anything that is not a non-empty
            string (``None``, float NaN from a missing value, ``""``) is
            treated as "no edges".

    Returns:
        A list of dicts with keys ``'object'``, ``'predicate'`` and
        ``'properties'``. ``'properties'`` is the decoded JSON object, or an
        empty dict when the properties text is not valid JSON.
    """
    # NaN is truthy, so a bare truthiness check would pass it on to the regex
    # and raise TypeError; test for a real string instead.
    if not isinstance(edge_text, str) or not edge_text:
        return []

    decoder = json.JSONDecoder()
    edges = []
    pos = 0
    while True:
        header = _EDGE_HEADER_RE.search(edge_text, pos)
        if header is None:
            break
        obj, predicate = header.groups()
        props_start = header.end()
        try:
            # raw_decode reads exactly one JSON value, so nested objects and
            # "}" characters inside strings no longer truncate the properties.
            props, pos = decoder.raw_decode(edge_text, props_start)
        except json.JSONDecodeError:
            # Malformed properties: keep the edge with empty properties and
            # resume after the first closing brace. Without any closing brace
            # there is no complete edge left to read.
            closing = edge_text.find('}', props_start)
            if closing == -1:
                break
            props = {}
            pos = closing + 1
        edges.append({
            'object': obj,
            'predicate': predicate,
            'properties': props
        })
    return edges

In [None]:
import json
# Parse edge_text for all nodes
# Adds a 'parsed_edges' column to nodes_df in place; each cell holds the list
# returned by parse_edge_text for that row's 'edge_text' value.
nodes_df['parsed_edges'] = nodes_df['edge_text'].apply(parse_edge_text)

# Function to extract all edges from the DataFrame
def extract_all_edges(df):
    """Flatten the per-node ``parsed_edges`` lists into one edge table.

    Args:
        df: pandas DataFrame with an ``'id'`` column and a ``'parsed_edges'``
            column whose cells are lists of dicts with keys ``'object'``,
            ``'predicate'`` and ``'properties'``.

    Returns:
        A DataFrame with columns ``subject``, ``object``, ``predicate`` and
        ``properties``, one row per edge. The columns are present even when
        there are no edges, so column lookups on the result never fail.
    """
    edge_columns = ['subject', 'object', 'predicate', 'properties']
    # Iterating the two columns directly avoids building a Series per row,
    # which is what iterrows() does.
    records = [
        {
            'subject': node_id,
            'object': edge['object'],
            'predicate': edge['predicate'],
            'properties': edge['properties']
        }
        for node_id, node_edges in zip(df['id'], df['parsed_edges'])
        for edge in node_edges
    ]
    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=edge_columns)

# Extract edges from positive and negative datasets
def get_edges_from_datasets(datasets, all_nodes_df):
    """Gather the edges of every node referenced by a group of datasets.

    Args:
        datasets: Mapping of category to a frame with an ``'id'`` column.
        all_nodes_df: Nodes frame with ``'id'`` and ``'parsed_edges'`` columns.

    Returns:
        One DataFrame concatenating the edges extracted for each dataset, or
        an empty frame with the four edge columns when ``datasets`` is empty.
    """
    frames = [
        # Keep only the nodes whose id appears in this dataset
        extract_all_edges(all_nodes_df[all_nodes_df['id'].isin(sample['id'])])
        for sample in datasets.values()
    ]
    if not frames:
        return pd.DataFrame(columns=['subject', 'object', 'predicate', 'properties'])
    return pd.concat(frames, ignore_index=True)

# Build one edge table per polarity from the parsed node edges
positive_edges = get_edges_from_datasets(datasets=positive_datasets, all_nodes_df=nodes_df)
negative_edges = get_edges_from_datasets(datasets=negative_datasets, all_nodes_df=nodes_df)

print(f"Total positive edges: {len(positive_edges)}")
print(f"Total negative edges: {len(negative_edges)}")

In [None]:
# Preview the first rows of the positive edge table
print("Positive Edges Sample:", positive_edges.head(), sep="\n")

# Preview the first rows of the negative edge table
print("Negative Edges Sample:", negative_edges.head(), sep="\n")

In [None]:
# Frequency of each predicate among the positive edges
positive_predicate_counts = positive_edges['predicate'].value_counts()
print("\nPredicate counts in positive edges:", positive_predicate_counts, sep="\n")

# Frequency of each predicate among the negative edges
negative_predicate_counts = negative_edges['predicate'].value_counts()
print("\nPredicate counts in negative edges:", negative_predicate_counts, sep="\n")

In [None]:
# Bar chart of the most frequent predicates among the positive edges
top_n = 10
top_positive_predicates = positive_predicate_counts.head(top_n)

plt.figure(figsize=(10, 6))
sns.barplot(
    x=top_positive_predicates.values,
    y=top_positive_predicates.index,
    palette="viridis",
)
plt.title(f"Top {top_n} Predicates in Positive Edges")
plt.xlabel("Count")
plt.ylabel("Predicate")
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the most frequent predicates among the negative edges
top_negative_predicates = negative_predicate_counts.head(top_n)

plt.figure(figsize=(10, 6))
sns.barplot(
    x=top_negative_predicates.values,
    y=top_negative_predicates.index,
    palette="magma",
)
plt.title(f"Top {top_n} Predicates in Negative Edges")
plt.xlabel("Count")
plt.ylabel("Predicate")
plt.tight_layout()
plt.show()

In [None]:
# Edge count per subject node, for each polarity
positive_edges_per_node = positive_edges['subject'].value_counts()
negative_edges_per_node = negative_edges['subject'].value_counts()

# Only nodes that appear as a subject of at least one edge enter the averages
positive_mean_edges = positive_edges_per_node.mean()
negative_mean_edges = negative_edges_per_node.mean()

print(f"\nAverage number of edges per node in positive datasets: {positive_mean_edges:.2f}")
print(f"Average number of edges per node in negative datasets: {negative_mean_edges:.2f}")

In [None]:
# Histogram of edges per node, one figure per polarity (positive, then negative)
for edges_per_node, bar_color, polarity in (
    (positive_edges_per_node, 'blue', "Positive"),
    (negative_edges_per_node, 'red', "Negative"),
):
    plt.figure(figsize=(10, 6))
    sns.histplot(edges_per_node, bins=30, kde=False, color=bar_color)
    plt.title(f"Distribution of Number of Edges per Node ({polarity} Datasets)")
    plt.xlabel("Number of Edges")
    plt.ylabel("Number of Nodes")
    plt.tight_layout()
    plt.show()

In [None]:
# Analyze properties in positive edges
def extract_property_counts(edges_df, property_key):
    """Count how often each value of one edge property occurs.

    Args:
        edges_df: DataFrame with a ``'properties'`` column of dicts.
        property_key: Key to look up in every properties dict.

    Returns:
        A pandas Series of value counts. Edges without the key are ignored,
        and a list-valued property contributes each of its items separately.
    """
    present = edges_df['properties'].apply(lambda props: props.get(property_key, None)).dropna()
    values = []
    for entry in present:
        if isinstance(entry, list):
            # A list-valued property counts once per item
            values.extend(entry)
        else:
            values.append(entry)
    return pd.Series(values).value_counts()

# Example: Analyze a specific property, e.g., 'strength'
# Replace 'strength' with actual property keys present in your data
property_key = 'strength'
positive_property_counts = extract_property_counts(positive_edges, property_key)
negative_property_counts = extract_property_counts(negative_edges, property_key)

# Show the ten most frequent values for each polarity
print(f"\nProperty '{property_key}' counts in positive edges:", positive_property_counts.head(10), sep="\n")

print(f"\nProperty '{property_key}' counts in negative edges:", negative_property_counts.head(10), sep="\n")

In [None]:
# Plot the most frequent property values for each polarity, skipping a
# polarity entirely when no edge carries the property
for property_counts, polarity, palette_name in (
    (positive_property_counts, "Positive", "coolwarm"),
    (negative_property_counts, "Negative", "inferno"),
):
    if property_counts.empty:
        continue
    top_property_values = property_counts.head(top_n)
    plt.figure(figsize=(10, 6))
    sns.barplot(
        x=top_property_values.values,
        y=top_property_values.index,
        palette=palette_name,
    )
    plt.title(f"Top {top_n} '{property_key}' Properties in {polarity} Edges")
    plt.xlabel("Count")
    plt.ylabel(property_key.capitalize())
    plt.tight_layout()
    plt.show()