# 00_environment_setup

**Purpose:**  
Verify that all team members can:
1. Activate the Python environment  
2. Import the core libraries  
3. Read in our config file  
4. Load a sample of the MGTAB dataset  
5. Connect to our Neo4j instance  

If any of these steps fails, fix it now before moving on.


In [1]:
import sys, os
# -- ensure imports see src/ package --
sys.path.insert(0, os.path.abspath('..'))
print("Python:", sys.version.split()[0])

# Core libraries
import pandas as pd, numpy as np, networkx as nx
import sklearn, matplotlib, neo4j, py2neo
import nltk

# PyTorch / PyG
import torch
import torch_geometric

print("pandas:", pd.__version__)
print("numpy:", np.__version__)
print("networkx:", nx.__version__)
print("scikit-learn:", sklearn.__version__)
print("matplotlib:", matplotlib.__version__)
print("neo4j driver:", neo4j.__version__)
print("py2neo:", py2neo.__version__)
print("nltk:", nltk.__version__)
print("torch:", torch.__version__)
print("torch_geometric:", torch_geometric.__version__)


Python: 3.13.2
pandas: 2.2.3
numpy: 2.2.5
networkx: 3.4.2
scikit-learn: 1.6.1
matplotlib: 3.10.1
neo4j driver: 5.28.1
py2neo: 2021.2.4
nltk: 3.9.1
torch: 2.6.0
torch_geometric: 2.6.1


In [2]:
import yaml

config_path = os.path.join("..", "config", "config.yaml")
with open(config_path) as f:
    cfg = yaml.safe_load(f)

# Expect keys: mgtab_root, neo4j.uri, neo4j.user, neo4j.password
cfg


{'mgtab_root': 'data/raw',
 'neo4j': {'uri': 'bolt://localhost:7687',
  'user': 'neo4j',
  'password': 'neo4jpass'}}

In [3]:
import shutil, os
from src.data.mgtab_dataset import MGTAB
print("Notebook cwd:", os.getcwd())
print("data/raw contents:", os.listdir(os.path.join(os.getcwd(), 'data', 'raw')))
repo_root = os.path.abspath('..')  
data_root = os.path.join(repo_root, 'data')


# 1) Remove any previously processed file to force `process()` to run:
processed_file = os.path.join(data_root, 'processed', 'data.pt')
if os.path.exists(processed_file):
    print("Removing old processed file…")
    os.remove(processed_file)

# 2) Instantiate the dataset (this will call `process()` if needed):
dataset = MGTAB(root=data_root)

# 3) Check that the processed file was created:
assert os.path.exists(processed_file), "Processed file was not saved!"

# 4) Inspect the dataset:
print("Dataset length:", len(dataset))              # should be 1
data = dataset[0]
print(data)                                        # PyG Data summary
print("  • #nodes:", data.num_nodes)
print("  • #edges:", data.num_edges)
print("  • x.shape:", data.x.shape)
print("  • y_bot unique labels:", data.y_bot.unique().tolist())
print("  • mask sizes:", 
      data.train_mask.sum().item(), 
      data.val_mask.sum().item(), 
      data.test_mask.sum().item())



Notebook cwd: /Users/dennisberger/Library/Mobile Documents/com~apple~CloudDocs/Uni/FS_2025/Social Media Analytics/project/social-botnet-analytics/notebooks
data/raw contents: []
Removing old processed file…
Dataset length: 1
Data(x=[10199, 788], edge_index=[2, 1700108], edge_type=[1700108], edge_weight=[1700108], y_stance=[10199], y_bot=[10199], train_mask=[10199], val_mask=[10199], test_mask=[10199])
  • #nodes: 10199
  • #edges: 1700108
  • x.shape: torch.Size([10199, 788])
  • y_bot unique labels: [0, 1]
  • mask sizes: 7139 2040 1020


Processing...
Done!


In [4]:
from py2neo import Graph

neo_cfg = cfg["neo4j"]
graph = Graph(neo_cfg["uri"], auth=(neo_cfg["user"], neo_cfg["password"]))

# simple test query
result = graph.run("RETURN 1 AS test").data()
print("Neo4j connection test:", result)


Neo4j connection test: [{'test': 1}]
