In [37]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict


In [38]:
# Load the dataset
final_df = pd.read_csv("")  # insert path to filtered OBIS file

# Keep only the first 500 records. head() hands back a slice of the parsed
# frame, so take an explicit copy before adding a column to it; assigning a
# column onto the slice itself is what triggers SettingWithCopyWarning.
final_df = final_df.head(500).copy()


def _combine_matched_terms(row):
    """Join the keyword and description matches of one record with ', '.

    Missing values (NaN/None) are skipped, so a record without any match
    yields an empty string.
    """
    parts = (row['matched_terms_keywords'], row['matched_terms_desc'])
    return ', '.join(str(part) for part in parts if pd.notna(part))


# Merge both match columns into a single comma-separated 'matched_terms' column
final_df['matched_terms'] = final_df.apply(_combine_matched_terms, axis=1)

# Display the DataFrame to check the new column
print(final_df[['s', 'name', 'matched_terms']].head())


                                                   s  \
0  <https://obis.org/dataset/a595a9a0-642a-473f-8...   
1  <https://obis.org/dataset/a595a9a0-642a-473f-8...   
2  <https://obis.org/dataset/0abb8cc1-8651-4213-a...   
3  <https://obis.org/dataset/0abb8cc1-8651-4213-a...   
4  <https://obis.org/dataset/0abb8cc1-8651-4213-a...   

                                                name  \
0  Electronic Atlas of Ichthyoplankton on the Sco...   
1  Electronic Atlas of Ichthyoplankton on the Sco...   
2  Colección de Gusanos Cinta (Nemertea) de la re...   
3  Colección de Gusanos Cinta (Nemertea) de la re...   
4  Colección de Gusanos Cinta (Nemertea) de la re...   

               matched_terms  
0  plankton, ichthyoplankton  
1  plankton, ichthyoplankton  
2                       coli  
3                       coli  
4                       coli  


In [39]:
# Directed graph that will hold one node per distinct 's' value in final_df
G = nx.DiGraph()

# Register every record as a node keyed by its 's' value; the record's name,
# the 's' value itself (as 'uri') and its combined matched terms are stored
# as node attributes. Repeated 's' values update the same node.
for _, record in final_df.iterrows():
    dataset_uri = record['s']
    node_attrs = {
        'name': record['name'],
        'uri': dataset_uri,
        'matched_terms': record['matched_terms'],
    }
    G.add_node(dataset_uri, **node_attrs)

# Build edges from the co-occurrence of shared matched terms; edge weights are later derived from how many terms a pair of datasets shares
def build_edges_based_on_shared_terms(df):
    """Derive co-occurrence edges between datasets that share a matched term.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``'s'`` column (dataset identifier) and a
        ``'matched_terms'`` column holding terms joined by ``', '``.

    Returns
    -------
    list of tuple
        One ``(dataset_a, dataset_b, term)`` triple per unordered pair of
        datasets and per term they share. Within each triple the pair is in
        sorted order (``dataset_a < dataset_b``), so the same two datasets
        always appear in the same direction regardless of the term.
    """
    from itertools import combinations

    term_to_datasets = defaultdict(set)
    for dataset, raw_terms in zip(df['s'], df['matched_terms']):
        # Rows without usable terms (NaN/None) contribute nothing.
        if not isinstance(raw_terms, str):
            continue
        for term in {piece.strip() for piece in raw_terms.split(', ')}:
            # An empty 'matched_terms' string splits into [''], which must not
            # act as a pseudo-term linking every term-less dataset together.
            if term:
                term_to_datasets[term].add(dataset)

    edges = []
    for term, datasets in term_to_datasets.items():
        # Sorting fixes the orientation of each pair; iterating the raw set
        # could emit (a, b) for one term and (b, a) for another.
        for first, second in combinations(sorted(datasets), 2):
            edges.append((first, second, term))
    return edges

# Build the (dataset, dataset, shared term) triples
edges = build_edges_based_on_shared_terms(final_df)

# Add weighted edges to the graph: one edge per dataset pair, whose weight is
# the number of terms the pair shares.
for source, target, term in edges:
    # G is directed, so (a, b) and (b, a) are distinct edges. If this pair is
    # already connected in the opposite direction, reuse that edge instead of
    # splitting the pair's weight across two antiparallel edges.
    if not G.has_edge(source, target) and G.has_edge(target, source):
        source, target = target, source

    if G.has_edge(source, target):
        edge_attrs = G[source][target]
        edge_attrs['shared_terms'].append(term)
        edge_attrs['weight'] += 1
    else:
        G.add_edge(source, target, shared_terms=[term], weight=1)

# Serialize the shared_terms attribute to a string before export
# (it is accumulated as a list above).
for _, _, edge_attrs in G.edges(data=True):
    edge_attrs['shared_terms'] = ', '.join(edge_attrs['shared_terms'])

# Export the graph, with its node and edge attributes, to GEXF
nx.write_gexf(G, "OBIS_graph_500.gexf")