In [1]:
import glob
import os
import re
import shutil
import tarfile

import pandas as pd  # (not strictly needed here; kept since you had it)
import scanpy as sc  # pip install scanpy

In [3]:
# -------------------------------
# Step 0: Define paths
# -------------------------------
# Input archive plus the two working directories used by the later steps.
tar_file = "GSE159977_RAW.tar"      # downloaded tar archive
extract_folder = "GSE159977_RAW"    # archive contents are unpacked here
output_folder = "GSE159977_h5ad"    # per-sample h5ad files are written here

# Create both directories up front; exist_ok makes re-running this cell harmless.
for _workdir in (extract_folder, output_folder):
    os.makedirs(_workdir, exist_ok=True)

In [4]:
# -------------------------------
# Step 1: Extract the tar file
# -------------------------------
# The archive is a downloaded file, so it is unpacked with tarfile's "data"
# extraction filter whenever the running Python provides one. That filter
# refuses members that would be written outside `extract_folder` (absolute
# paths, ".." components, links pointing outside) as well as device files.
print("Extracting tar file...")
with tarfile.open(tar_file, "r") as tar:
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path=extract_folder, filter="data")
    else:
        # NOTE(review): this interpreter predates extraction filters, so the
        # members are written exactly as named inside the archive.
        tar.extractall(path=extract_folder)
print("Extraction done.")

Extracting tar file...
Extraction done.


In [6]:
# -------------------------------
# Step 2: Map GSM to PT labels
# -------------------------------
# (GSM accession, PT label) pairs, one per sample. The dict built from them
# is keyed by GSM accession and yields the PT label.
_GSM_PT_PAIRS = (
    ("GSM4851987", "PT-5"),
    ("GSM4851988", "PT-8"),
    ("GSM4851989", "PT-9"),
    ("GSM4851990", "PT-11"),
    ("GSM4851991", "PT-12"),
    ("GSM4851992", "PT-16"),
    ("GSM4851993", "PT-17"),
    ("GSM4851994", "PT-19"),
    ("GSM4851995", "PT-20"),
    ("GSM4851996", "PT-21"),
)
gsm_to_pt = dict(_GSM_PT_PAIRS)

In [7]:
# -------------------------------
# Step 3: File grouping (GSM..._PT-... prefix)
#           Move the 10x files into per-sample folders
#           and rename to 10x standard names (keeping .gz)
# -------------------------------
# Extracted file names look like:
#   GSM4851987_PT-5-barcodes.tsv.gz
#   GSM4851987_PT-5-features.tsv.gz
#   GSM4851987_PT-5-matrix.mtx.gz
# Group 1 of the pattern is the per-sample prefix (e.g. GSM4851987_PT-5).
pat = re.compile(r"^(GSM\d+_PT-\d+)-(barcodes|features|matrix)\.(tsv|mtx)\.gz$")

# Suffix contained in the extracted name -> file name used inside the sample folder.
tenx_names = {
    "-barcodes.tsv.gz": "barcodes.tsv.gz",
    "-features.tsv.gz": "features.tsv.gz",
    "-matrix.mtx.gz": "matrix.mtx.gz",
}

# Collect matching file names under their sample prefix; every other entry
# of the extraction folder is ignored.
groups = {}
for entry in os.listdir(extract_folder):
    hit = pat.match(entry)
    if hit is not None:
        groups.setdefault(hit.group(1), []).append(entry)

print("Detected samples:", len(groups))
print("Example sample keys:", list(groups)[:3])

# One sub-folder per sample, holding that sample's files under the short names.
for sample_key, members in groups.items():
    sample_dir = os.path.join(extract_folder, sample_key)
    os.makedirs(sample_dir, exist_ok=True)

    for member in members:
        # The first suffix found in the name decides the target name; a name
        # containing none of them (e.g. "barcodes" paired with ".mtx") is
        # left where it is.
        dst_name = next(
            (name for suffix, name in tenx_names.items() if suffix in member),
            None,
        )
        if dst_name is None:
            continue

        shutil.move(
            os.path.join(extract_folder, member),
            os.path.join(sample_dir, dst_name),
        )

Detected samples: 10
Example sample keys: ['GSM4851987_PT-5', 'GSM4851988_PT-8', 'GSM4851989_PT-9']


In [8]:
# -------------------------------
# Step 4: Identify sample folders
# -------------------------------
# Paths of all sub-directories of the extraction folder. os.listdir returns
# entries in arbitrary order, so the names are sorted first: the later
# loading loop then visits the samples in the same order on every run.
sample_folders = [
    os.path.join(extract_folder, d)
    for d in sorted(os.listdir(extract_folder))
    if os.path.isdir(os.path.join(extract_folder, d))
]

print(f"Found {len(sample_folders)} sample folders.")

Found 10 sample folders.


In [9]:
# -------------------------------
# Sanity check: one known folder should have 10x files
# -------------------------------
# Prints the directory listing of one sample folder for a visual check,
# or a warning when that folder does not exist.
example_dir = os.path.join(extract_folder, "GSM4851987_PT-5")
if not os.path.isdir(example_dir):
    print("WARNING: Expected sample dir not found:", example_dir)
else:
    print("Sanity check contents of GSM4851987_PT-5:")
    print(os.listdir(example_dir))

Sanity check contents of GSM4851987_PT-5:
['barcodes.tsv.gz', 'features.tsv.gz', 'matrix.mtx.gz']


In [10]:
# -------------------------------
# Step 5: Load each sample and save per-sample h5ad
#         IMPORTANT CHANGE: cache=False (avoid cache collisions)
# -------------------------------
# Folder names that count as sample directories, e.g. GSM4851987_PT-5.
sample_dir_pat = re.compile(r"^GSM\d+_PT-\d+$")

# NOTE(review): every loaded AnnData stays referenced from adata_list, while
# this cell itself only uses the list's length -- confirm whether later code
# needs the objects in memory.
adata_list = []
for folder in sample_folders:
    sample_id = os.path.basename(folder)

    # Skip directories that are not named like a sample.
    if sample_dir_pat.match(sample_id) is None:
        continue

    # The GSM accession is the part before the underscore; when it has no
    # entry in gsm_to_pt, the whole folder name serves as the label.
    gsm_id = sample_id.partition("_")[0]
    pt_label = gsm_to_pt.get(gsm_id, sample_id)

    print(f"Loading sample {sample_id} -> {pt_label}...")

    # IMPORTANT: cache=False
    ad = sc.read_10x_mtx(folder, var_names="gene_symbols", cache=False)

    # Metadata columns: the same value on every row of this sample.
    for column, value in (("sample", pt_label), ("dataset", "GSE159977_RAW")):
        ad.obs[column] = value

    # One h5ad file per sample, named after the label.
    per_sample_file = os.path.join(output_folder, f"GSE159977_{pt_label}.h5ad")
    ad.write(per_sample_file)

    print(f"Saved {per_sample_file} | shape={ad.shape} | n_cells={ad.n_obs}")

    adata_list.append(ad)

print("Done. Wrote", len(adata_list), "per-sample h5ad files to:", output_folder)

Loading sample GSM4851987_PT-5 -> PT-5...
Saved GSE159977_h5ad/GSE159977_PT-5.h5ad | shape=(6794880, 33694) | n_cells=6794880
Loading sample GSM4851988_PT-8 -> PT-8...
Saved GSE159977_h5ad/GSE159977_PT-8.h5ad | shape=(6794880, 33694) | n_cells=6794880
Loading sample GSM4851989_PT-9 -> PT-9...
Saved GSE159977_h5ad/GSE159977_PT-9.h5ad | shape=(6794880, 33694) | n_cells=6794880
Loading sample GSM4851990_PT-11 -> PT-11...
Saved GSE159977_h5ad/GSE159977_PT-11.h5ad | shape=(6794880, 33694) | n_cells=6794880
Loading sample GSM4851991_PT-12 -> PT-12...
Saved GSE159977_h5ad/GSE159977_PT-12.h5ad | shape=(6794880, 33694) | n_cells=6794880
Loading sample GSM4851992_PT-16 -> PT-16...
Saved GSE159977_h5ad/GSE159977_PT-16.h5ad | shape=(6794880, 33694) | n_cells=6794880
Loading sample GSM4851993_PT-17 -> PT-17...
Saved GSE159977_h5ad/GSE159977_PT-17.h5ad | shape=(6794880, 33694) | n_cells=6794880
Loading sample GSM4851994_PT-19 -> PT-19...
Saved GSE159977_h5ad/GSE159977_PT-19.h5ad | shape=(6794880, 33

In [11]:
# -------------------------------
# Step 6: Filtered h5ad files
# -------------------------------
# For every per-patient h5ad file: compute QC metrics, keep the barcodes that
# meet both thresholds below, and write the result to a separate folder.
# os, glob and scanpy (sc) come from the import cell at the top of the notebook.

min_counts = 200  # a barcode is kept only if total_counts >= min_counts
min_genes = 10    # ... and n_genes_by_counts >= min_genes

in_dir = "GSE159977_h5ad"
out_dir = "GSE159977_h5ad_filtered"
os.makedirs(out_dir, exist_ok=True)

# Process every per-patient h5ad file in the input directory
for in_path in sorted(glob.glob(os.path.join(in_dir, "GSE159977_PT-*.h5ad"))):
    base = os.path.basename(in_path)  # e.g., GSE159977_PT-5.h5ad
    pt = base.replace("GSE159977_", "").replace(".h5ad", "")  # e.g., PT-5

    print(f"Reading {base}...")
    ad = sc.read_h5ad(in_path)

    # QC metrics; provides the total_counts and n_genes_by_counts columns
    # of ad.obs that the filter below reads.
    sc.pp.calculate_qc_metrics(ad, inplace=True)

    # Filter: both conditions are evaluated on the same per-barcode metrics,
    # so one combined mask selects the same rows as two successive subsets
    # while copying the data only once.
    keep = (ad.obs["total_counts"] >= min_counts) & (
        ad.obs["n_genes_by_counts"] >= min_genes
    )
    ad_f = ad[keep, :].copy()

    # Write filtered output
    out_path = os.path.join(out_dir, f"GSE159977_{pt}_filtered.h5ad")
    ad_f.write(out_path)

    print(f"{pt}: raw barcodes={ad.n_obs} -> filtered cells={ad_f.n_obs} | saved {out_path}")


Reading GSE159977_PT-11.h5ad...
PT-11: raw barcodes=6794880 -> filtered cells=9554 | saved GSE159977_h5ad_filtered/GSE159977_PT-11_filtered.h5ad
Reading GSE159977_PT-12.h5ad...
PT-12: raw barcodes=6794880 -> filtered cells=8359 | saved GSE159977_h5ad_filtered/GSE159977_PT-12_filtered.h5ad
Reading GSE159977_PT-16.h5ad...
PT-16: raw barcodes=6794880 -> filtered cells=9083 | saved GSE159977_h5ad_filtered/GSE159977_PT-16_filtered.h5ad
Reading GSE159977_PT-17.h5ad...
PT-17: raw barcodes=6794880 -> filtered cells=9913 | saved GSE159977_h5ad_filtered/GSE159977_PT-17_filtered.h5ad
Reading GSE159977_PT-19.h5ad...
PT-19: raw barcodes=6794880 -> filtered cells=9550 | saved GSE159977_h5ad_filtered/GSE159977_PT-19_filtered.h5ad
Reading GSE159977_PT-20.h5ad...
PT-20: raw barcodes=6794880 -> filtered cells=9497 | saved GSE159977_h5ad_filtered/GSE159977_PT-20_filtered.h5ad
Reading GSE159977_PT-21.h5ad...
PT-21: raw barcodes=6794880 -> filtered cells=1128 | saved GSE159977_h5ad_filtered/GSE159977_PT-21

In [12]:
# -------------------------------
# Step 7: Copy + rename filtered files into "Liver Samples"
# -------------------------------
# Each filtered file is copied (the source stays in place) under a shorter
# name. os, glob and shutil come from the import cell at the top of the
# notebook.

src_dir = "GSE159977_h5ad_filtered"
dst_dir = "Liver Samples"
os.makedirs(dst_dir, exist_ok=True)

# Example: GSE159977_PT-11_filtered.h5ad -> Liver_PT-11.h5ad
pattern = os.path.join(src_dir, "GSE159977_PT-*_filtered.h5ad")

for src_path in sorted(glob.glob(pattern)):
    base = os.path.basename(src_path)  # e.g., GSE159977_PT-11_filtered.h5ad

    # Strip the fixed prefix and suffix to get the label, e.g. "PT-11"
    pt = base.replace("GSE159977_", "").replace("_filtered.h5ad", "")

    dst_name = f"Liver_{pt}.h5ad"      # e.g., Liver_PT-11.h5ad
    dst_path = os.path.join(dst_dir, dst_name)

    # copy2 also carries over file metadata such as timestamps; an existing
    # file at dst_path is overwritten.
    shutil.copy2(src_path, dst_path)

    print(f"Copied+renamed: {base} -> {dst_name}")


Copied+renamed: GSE159977_PT-11_filtered.h5ad -> Liver_PT-11.h5ad
Copied+renamed: GSE159977_PT-12_filtered.h5ad -> Liver_PT-12.h5ad
Copied+renamed: GSE159977_PT-16_filtered.h5ad -> Liver_PT-16.h5ad
Copied+renamed: GSE159977_PT-17_filtered.h5ad -> Liver_PT-17.h5ad
Copied+renamed: GSE159977_PT-19_filtered.h5ad -> Liver_PT-19.h5ad
Copied+renamed: GSE159977_PT-20_filtered.h5ad -> Liver_PT-20.h5ad
Copied+renamed: GSE159977_PT-21_filtered.h5ad -> Liver_PT-21.h5ad
Copied+renamed: GSE159977_PT-5_filtered.h5ad -> Liver_PT-5.h5ad
Copied+renamed: GSE159977_PT-8_filtered.h5ad -> Liver_PT-8.h5ad
Copied+renamed: GSE159977_PT-9_filtered.h5ad -> Liver_PT-9.h5ad
