# 00_environment_setup

**Purpose:**  
Verify that all team members can:
1. Activate the Python environment  
2. Import the core libraries  
3. Read in our config file  
4. Load a sample of the MGTAB dataset  
5. Connect to our Neo4j instance  


In [1]:
import sys, os
# -- ensure imports see src/ package --
sys.path.insert(0, os.path.abspath('..'))
print("Python:", sys.version.split()[0])

# Core libraries
import pandas as pd, numpy as np, networkx as nx
import sklearn, matplotlib, neo4j, py2neo
import nltk

# PyTorch / PyG
import torch
import torch_geometric

print("pandas:", pd.__version__)
print("numpy:", np.__version__)
print("networkx:", nx.__version__)
print("scikit-learn:", sklearn.__version__)
print("matplotlib:", matplotlib.__version__)
print("neo4j driver:", neo4j.__version__)
print("py2neo:", py2neo.__version__)
print("nltk:", nltk.__version__)
print("torch:", torch.__version__)
print("torch_geometric:", torch_geometric.__version__)


Python: 3.13.2
pandas: 2.2.3
numpy: 2.2.5
networkx: 3.4.2
scikit-learn: 1.6.1
matplotlib: 3.10.1
neo4j driver: 5.28.1
py2neo: 2021.2.4
nltk: 3.9.1
torch: 2.6.0
torch_geometric: 2.6.1


In [2]:
import yaml

config_path = os.path.join("..", "config", "config.yaml")
with open(config_path) as f:
    cfg = yaml.safe_load(f)

# Expect keys: mgtab_root, neo4j.uri, neo4j.user, neo4j.password
cfg


{'mgtab_root': 'data',
 'neo4j': {'uri': 'bolt://localhost:7687',
  'user': 'neo4j',
  'password': 'neo4jpass'}}

In [4]:
from pathlib import Path
import sys, os
from src.data.mgtab_dataset import MGTAB

# 1) Locate repo root
repo_root   = Path().resolve().parent
data_root   = repo_root / "data"
raw_dir     = data_root / "raw"
processed_dir = data_root / "processed"

print("Repo root:       ", repo_root)
print("Looking in raw:  ", raw_dir)
print("Files in raw:    ", list(raw_dir.glob("*.pt")))

# 2) Ensure our src/ package is importable
sys.path.insert(0, str(repo_root))

# 3) Remove old processed file so `process()` will run
processed_file = processed_dir / "data.pt"
if processed_file.exists():
    print("Removing old processed file…")
    processed_file.unlink()

# 4) Instantiate dataset (triggers process() if needed)
dataset = MGTAB(root=str(data_root))

# 5) Verify processed file appears
assert processed_file.exists(), f"Processed file not found at {processed_file}"

# 6) Inspect the loaded data
print("Dataset length:", len(dataset))
data = dataset[0]
print(data)  # summary
print(f" • #nodes: {data.num_nodes}")
print(f" • #edges: {data.num_edges}")
print(f" • x.shape: {tuple(data.x.shape)}")
print(f" • y_bot labels: {data.y_bot.unique().tolist()}")
print(" • mask sizes:", 
      data.train_mask.sum().item(), 
      data.val_mask.sum().item(), 
      data.test_mask.sum().item())




Repo root:        /Users/dennisberger/Library/Mobile Documents/com~apple~CloudDocs/Uni/FS_2025/Social Media Analytics/project/social-botnet-analytics
Looking in raw:   /Users/dennisberger/Library/Mobile Documents/com~apple~CloudDocs/Uni/FS_2025/Social Media Analytics/project/social-botnet-analytics/data/raw
Files in raw:     [PosixPath('/Users/dennisberger/Library/Mobile Documents/com~apple~CloudDocs/Uni/FS_2025/Social Media Analytics/project/social-botnet-analytics/data/raw/edge_type.pt'), PosixPath('/Users/dennisberger/Library/Mobile Documents/com~apple~CloudDocs/Uni/FS_2025/Social Media Analytics/project/social-botnet-analytics/data/raw/labels_stance.pt'), PosixPath('/Users/dennisberger/Library/Mobile Documents/com~apple~CloudDocs/Uni/FS_2025/Social Media Analytics/project/social-botnet-analytics/data/raw/labels_bot.pt'), PosixPath('/Users/dennisberger/Library/Mobile Documents/com~apple~CloudDocs/Uni/FS_2025/Social Media Analytics/project/social-botnet-analytics/data/raw/edge_weight

Processing...
Done!


In [5]:
from py2neo import Graph

neo_cfg = cfg["neo4j"]
graph = Graph(neo_cfg["uri"], auth=(neo_cfg["user"], neo_cfg["password"]))

# simple test query
result = graph.run("RETURN 1 AS test").data()
print("Neo4j connection test:", result)


Neo4j connection test: [{'test': 1}]
