# 0_setup — Local environment & project paths

This notebook sets up paths, checks required Python packages and external tools, and validates input files.

**Inputs expected:**
- `query.algn.fa` — gapped MSA
- `ASC-tree.newick` — RAxML-generated tree 
- `ASC_target.fa` — ungapped FASTA of *target* species only



In [None]:
import os, shutil
from pathlib import Path
import sys
import subprocess

#  Project directories 
PROJ = Path.cwd().resolve().parent if Path.cwd().name == 'notebooks' else Path.cwd()
DATA = PROJ / "data"
OUT  = PROJ / "results"
CLADES = OUT / "clades"
MOTIFS = OUT / "motifs"
REPORTS = OUT / "reports"

for d in (DATA, OUT, CLADES, MOTIFS, REPORTS):
    d.mkdir(parents=True, exist_ok=True)


# --- Input files ---
IN_MSA = DATA / "query.algn.fa"       # gapped
IN_MSA_TRIMMED = DATA / "query.algn.trimmed.fa"
IN_TREE = DATA / "ASC-tree.newick"
IN_TARGETS = DATA / "ASC_targets.fasta"   # ungapped

print("Project:", PROJ)
print("Data dir:", DATA)
print("Results dir:", OUT)
print("\nExpected inputs:")
for p in (IN_MSA, IN_TREE, IN_TARGETS):
    print(" -", p, "exists:", p.exists())

Project: /Users/gorkemdurmaz/Desktop/asc_project_10
Data dir: /Users/gorkemdurmaz/Desktop/asc_project_10/data
Results dir: /Users/gorkemdurmaz/Desktop/asc_project_10/results

Expected inputs:
 - /Users/gorkemdurmaz/Desktop/asc_project_10/data/query.algn.fa exists: True
 - /Users/gorkemdurmaz/Desktop/asc_project_10/data/ASC-tree.newick exists: True
 - /Users/gorkemdurmaz/Desktop/asc_project_10/data/ASC_targets.fasta exists: True


### Python package checks

In [None]:
#  dependency check 
import sys, importlib, subprocess

# label -> module-to-import
mods = {
    "pandas": "pandas",
    "numpy": "numpy",
    "matplotlib": "matplotlib",
    "scipy": "scipy",
    "biopython": "Bio",   
}

missing = []
for label, mod in mods.items():
    try:
        importlib.import_module(mod)
        print(f"OK  - {label} (import '{mod}')")
    except Exception as e:
        print(f"MISS- {label} (import '{mod}'): {e}")
        missing.append(label)

print("\nMissing (install with conda or pip):", missing)

#  to debug environment issues
print("\nPython executable:", sys.executable)
print("Python version:", sys.version)
print("pip in this kernel:")
subprocess.run([sys.executable, "-m", "pip", "--version"])
print("\nBiopython install (if any):")
subprocess.run([sys.executable, "-m", "pip", "show", "biopython"])


OK  - pandas (import 'pandas')
OK  - numpy (import 'numpy')
OK  - matplotlib (import 'matplotlib')
OK  - scipy (import 'scipy')
OK  - biopython (import 'Bio')

Missing (install with conda or pip): []

Python executable: /Users/gorkemdurmaz/miniconda3/envs/asc/bin/python
Python version: 3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:46:25) [Clang 18.1.8 ]
pip in this kernel:
pip 25.2 from /Users/gorkemdurmaz/miniconda3/envs/asc/lib/python3.10/site-packages/pip (python 3.10)

Biopython install (if any):
Name: biopython
Version: 1.85
Summary: Freely available tools for computational molecular biology.
Home-page: https://biopython.org/
Author: The Biopython Contributors
Author-email: biopython@biopython.org
License: 
Location: /Users/gorkemdurmaz/miniconda3/envs/asc/lib/python3.10/site-packages
Requires: numpy
Required-by: 


CompletedProcess(args=['/Users/gorkemdurmaz/miniconda3/envs/asc/bin/python', '-m', 'pip', 'show', 'biopython'], returncode=0)

### External tool checks (optional; skip if you will run only Python parts)

In [3]:
import shutil
def which(tool):
    return shutil.which(tool) or "NOT FOUND"

tools = {
    "mafft": which("mafft"),
    "iqtree2": which("iqtree2"),
    "raxml-ng": which("raxml-ng"),
    "meme": which("meme"),
    "streme": which("streme"),
    "tomtom": which("tomtom"),
    "hmmbuild": which("hmmbuild"),
    "hmmsearch": which("hmmsearch"),
    "interproscan.sh": which("interproscan.sh"),
}
tools

{'mafft': '/Users/gorkemdurmaz/miniconda3/envs/asc/bin/mafft',
 'iqtree2': 'NOT FOUND',
 'raxml-ng': '/Users/gorkemdurmaz/miniconda3/envs/asc/bin/raxml-ng',
 'meme': '/Users/gorkemdurmaz/miniconda3/envs/asc/bin/meme',
 'streme': '/Users/gorkemdurmaz/miniconda3/envs/asc/bin/streme',
 'tomtom': '/Users/gorkemdurmaz/miniconda3/envs/asc/bin/tomtom',
 'hmmbuild': '/Users/gorkemdurmaz/miniconda3/envs/asc/bin/hmmbuild',
 'hmmsearch': '/Users/gorkemdurmaz/miniconda3/envs/asc/bin/hmmsearch',
 'interproscan.sh': 'NOT FOUND'}

In [None]:
docker run --rm \
  --platform linux/amd64 \
  -v "$PWD/data:/input" \
  -v "$PWD/results:/output" \
  interpro/interproscan:5.75-106.0 \
  /opt/interproscan/interproscan.sh \
    -i /input/ASC_targets.fasta \
    -f tsv -dp \
    -o /output/reports/interproscan.tsv \
    -appl Pfam,SMART,TIGRFAM,CDD,ProSiteProfiles,ProSitePatterns,Coils,SuperFamily,Gene3D,PANTHER,PRINTS

In [None]:

docker run --rm \
  --platform linux/amd64 \
  interpro/interproscan:5.75-106.0 \
  /opt/interproscan/interproscan.sh --list-applications


SyntaxError: invalid syntax (3175822852.py, line 2)

### Next step
If your inputs are detected above, proceed to **1_gene_tree.ipynb** to load and inspect the provided tree and map your target sequences onto clades.