In [None]:
import pandas as pd
import spacy

# 1. Load data
# ---------------------------------------------------------
# Missing "text" cells are replaced with "" and every value is cast to str,
# so each row can be handed to spaCy as a plain string.
df = pd.read_csv("document.csv").assign(
    text=lambda frame: frame["text"].fillna("").astype(str)
)

# 2. Load spaCy model
# ---------------------------------------------------------
nlp = spacy.load("en_core_web_sm")

# 3. Process Text (Run Pipeline)
# ---------------------------------------------------------
# A single pass over the texts yields Doc objects that carry both the
# named entities and the dependency parse used further down.
texts = df["text"].to_list()
docs = list(nlp.pipe(texts, batch_size=32))

# 4. Define Extraction Logic
# ---------------------------------------------------------
# For every document, collect:
#   * entities_<TYPE>  - texts of the entities of each type in NER_TYPES,
#                        in order of appearance (duplicates kept here)
#   * entity_context   - one "<entity> (<TYPE>) -> is <dep> of -> <head>"
#                        string per extracted entity
NER_TYPES = ["ORG", "PERSON"]

# Results are accumulated in plain Python lists (one entry per document) and
# attached to the frame once at the end. Writing cell-by-cell with
# df.at[i, ...] would treat the enumerate position as an index *label*, which
# is only correct while df has a default RangeIndex, and it is much slower.
entities_by_type = {ner: [] for ner in NER_TYPES}
context_rows = []

for doc in docs:
    row_entities = {ner: [] for ner in NER_TYPES}
    row_context = []

    for ent in doc.ents:
        if ent.label_ not in row_entities:
            continue

        # A. Standard NER Extraction
        row_entities[ent.label_].append(ent.text)

        # B. Dependency Parsing (Building up on the entity)
        # ent.root.dep_  = The grammatical relationship (e.g., nsubj = Subject)
        # ent.root.head.text = The word the entity modifies (e.g., the main verb)
        row_context.append(
            f"{ent.text} ({ent.label_}) -> is {ent.root.dep_} of -> {ent.root.head.text}"
        )

    for ner, found in row_entities.items():
        entities_by_type[ner].append(found)
    context_rows.append(row_context)

# Attach the columns positionally: reusing df.index keeps row i of the results
# on row i of the frame whatever the index labels are, and a length mismatch
# between docs and df raises instead of silently misaligning.
for ner in NER_TYPES:
    df[f"entities_{ner}"] = pd.Series(
        entities_by_type[ner], index=df.index, dtype=object
    )

# New column for dependency context
df["entity_context"] = pd.Series(context_rows, index=df.index, dtype=object)

# 5. Cleanup and Save
# ---------------------------------------------------------
def _unique_in_order(items):
    """Return the items with repeats dropped, keeping first-seen order."""
    return list(dict.fromkeys(items))


# Only the per-type entity columns are deduplicated; entity_context is
# written out exactly as it was collected.
for ner in NER_TYPES:
    column = f"entities_{ner}"
    df[column] = df[column].apply(_unique_in_order)

# Save
df.to_csv("document_with_deps.csv", index=False)

print("Processing complete. Sample of extracted dependencies:")
preview = df[["text", "entity_context"]].head()
print(preview)

In [None]:
import pandas as pd
import spacy
import networkx as nx
import matplotlib.pyplot as plt

# 1. Setup Data and Model (Assuming df is loaded from previous step)
# The model is loaded and the texts are parsed again here so this cell only
# needs `df` (with a string "text" column) to exist.
nlp = spacy.load("en_core_web_sm")
docs = list(nlp.pipe(df["text"].tolist(), batch_size=32))
NER_TYPES = ["ORG", "PERSON"]

# 2. Build the Graph
# Nodes are keyed by their text. Each kept entity contributes one edge
# entity -> head word, labelled with the entity root's dependency relation.
# G is a DiGraph, so a repeated (entity, head) pair keeps only the relation
# seen last.
G = nx.DiGraph()  # Directed Graph

for doc in docs:
    for ent in doc.ents:
        if ent.label_ not in NER_TYPES:
            continue

        source = ent.text
        target = ent.root.head.text
        relationship = ent.root.dep_

        # We differentiate entities by color later, so let's track the type.
        # Tagging the entity unconditionally also upgrades a node that was
        # first seen as a plain head word.
        G.add_node(source, type="entity", label=ent.label_)

        # Only create the head-word node if that text is not in the graph yet.
        # Re-adding it unconditionally would reset type to "head_word" on a
        # node that is already an entity (another entity with the same text,
        # or a single-token entity that is its own head), so the node's type
        # would depend on iteration order.
        if target not in G:
            G.add_node(target, type="head_word")

        G.add_edge(source, target, relation=relationship)

# 3. Visualization Configuration
# Draws G with entity nodes in light blue, head-word nodes in light grey, and
# each edge labelled with its dependency relation.
plt.figure(figsize=(12, 8))

ENTITY_NODE_SIZE = 2000
HEAD_WORD_NODE_SIZE = 1500

# Layout: 'spring_layout' pushes nodes apart so they don't overlap.
# A fixed seed makes the picture reproducible between runs.
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

# Separate nodes by type for coloring
entity_nodes = [n for n, attr in G.nodes(data=True) if attr.get("type") == "entity"]
word_nodes = [n for n, attr in G.nodes(data=True) if attr.get("type") == "head_word"]

# Draw Nodes
nx.draw_networkx_nodes(G, pos, nodelist=entity_nodes, node_color='lightblue', node_size=ENTITY_NODE_SIZE, label="Entities")
nx.draw_networkx_nodes(G, pos, nodelist=word_nodes, node_color='lightgrey', node_size=HEAD_WORD_NODE_SIZE, label="Head Words")

# Draw Labels (Text inside circles)
nx.draw_networkx_labels(G, pos, font_size=10, font_weight="bold")

# Draw Edges
# The edge drawer needs the real node sizes to stop each arrow at the node's
# border; with its default size the arrowheads end up hidden under the
# (much larger) circles drawn above.
ordered_nodes = list(G.nodes())
ordered_sizes = [
    ENTITY_NODE_SIZE if G.nodes[n].get("type") == "entity" else HEAD_WORD_NODE_SIZE
    for n in ordered_nodes
]
nx.draw_networkx_edges(
    G,
    pos,
    edge_color='gray',
    arrows=True,
    arrowstyle='->',
    arrowsize=20,
    nodelist=ordered_nodes,
    node_size=ordered_sizes,
)

# Draw Edge Labels (The dependency relationship)
edge_labels = nx.get_edge_attributes(G, 'relation')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red')

# Legend for the two node layers labelled above; markers are scaled down
# because the nodes themselves are drawn very large.
plt.legend(scatterpoints=1, markerscale=0.3, labelspacing=1.2)

# 4. Show Plot
plt.title("Entity Dependency Network")
plt.axis('off')  # Turn off the x/y axis
plt.show()