In [None]:
# Make the project's `src/` directory importable regardless of whether the
# notebook is started from the repository root or from `docs/`.
import sys
import os

cwd = os.getcwd()
if os.path.basename(cwd) == 'docs':
    project_root = os.path.dirname(cwd)
else:
    project_root = cwd

src_path = os.path.join(project_root, 'src')

if src_path not in sys.path:
    sys.path.append(src_path)
    print(f"Added {src_path} to sys.path")

# Example usage of the new SCENICPLUS wrapper
from methods import SCENICPLUS
from datastruct import Dataset
import numpy as np

from analyze.Data import Data
dataset = Data.from_json_url(
    'https://bitbucket.org/sonnhammergrni/gs-datasets/raw/d2047430263f5ffe473525c74b4318f723c23b0e/N50/Tjarnberg-ID252384-D20151111-N50-E150-SNR100000-IDY252384.json'
)

# Resolve the workflow directory from project_root rather than the current
# working directory: a bare 'src/methods/scenic_workflow/' only exists when the
# kernel is started in the repository root, not in docs/. The trailing empty
# component keeps the trailing path separator of the original argument.
scenic_workflow_dir = os.path.join(project_root, 'src', 'methods', 'scenic_workflow', '')

# Run SCENIC+ (Standard)
# NOTE(review): `dataset` is loaded above but is not passed to this call, unlike
# the commented nested-bootstrap example below — confirm this is intended.
adj = SCENICPLUS(scenic_workflow_dir=scenic_workflow_dir)

# Run SCENIC+ with Nested Bootstrap
# results = SCENICPLUS(
#     dataset, 
#     cisTopic_obj_fname="/path/to/cistopic.pkl",
#     nested_boot=True,
#     nest_runs=10,
#     boot_runs=10
# )

## Modify and run the workflow

#### Add this code to the top of `scenicplus/src/scenicplus/snakemake/Snakefile`

In [None]:
 Get run_id from command line, default to '1' if not specified
run_id = config.get('run_id', '1')

# Update the output directory with the run_id
config['params_general']['output_dir'] = config['params_general']['output_dir'].format(run_id=run_id)

# Update all output paths with the output directory
for key in config['output_data']:
    config['output_data'][key] = config['output_data'][key].format(output_dir=config['params_general']['output_dir'])

# Create output directory if it doesn't exist
shell("mkdir -p {config[params_general][output_dir]}")

configfile: "config/config.yaml"


#### Add a rule for sampling the data, placed directly after the code above at the top of the Snakefile. Note the commented-out randomization part, which is used to build the null for Nested Bootstrapping.

In [None]:
# Write a per-run copy of the gene-expression AnnData into the run's output
# directory. With the sampling/randomization lines below left commented out the
# data is written unchanged; enable them to subsample cells and shuffle the
# numeric `obs` columns (the null for Nested Bootstrapping).
rule sample_data:
    input:
        GEX_anndata_fname=config["input_data"]["GEX_anndata_fname"]
    output:
        GEX_anndata_sampled_fname=config["params_general"]["output_dir"] + "/GEX_anndata_sampled.h5ad"
    # params:
        # random=config['params_general']['random']
    run:
        import scanpy as sc
        import os
        import numpy as np

        # Create output directory if it doesn't exist
        os.makedirs(os.path.dirname(output.GEX_anndata_sampled_fname), exist_ok=True)

        # Generate a random seed
        # NOTE(review): the seed is not passed to anything yet, including the
        # disabled sampling call below — wire it in when sampling is enabled.
        random_seed = np.random.randint(0, 10000000)

        # Read the AnnData object
        adata = sc.read_h5ad(input.GEX_anndata_fname)

        # Numeric obs columns; only needed by the disabled randomization below
        numeric_columns = adata.obs.select_dtypes(include=['number']).columns.tolist()

        # Subsample 95% of the cells without replacement and permute the
        # numeric obs columns (disabled by default)
        # if random==True:
        # sc.pp.sample(adata, fraction=0.95, replace=False)
        # for col in numeric_columns:
        #    adata.obs[col] = np.random.permutation(adata.obs[col].values)

        # Save the sampled AnnData object
        # adata.obs.index=adata.obs.index.str.split('-').str[0]+'-1'
        adata.write(output.GEX_anndata_sampled_fname)

In [None]:
# Launch five consecutive workflow runs (run_id 1 through 5), one core each,
# all targeting the `all` rule.
for run_id in $(seq 1 5); do
  snakemake --config run_id="${run_id}" -j 1 all
done

## Post-process the results

In [None]:
import pandas as pd
import os

# Specify the base directory to search for files
base_directory = 'snakemake/results' ## or normal_results

# One DataFrame per eRegulon file found below base_directory
dataframes = []

# Loop through the directory and its subdirectories. Directory and file names
# are sorted so the row order of the combined table is reproducible.
for root, dirs, files in os.walk(base_directory):
    dirs.sort()
    for file in sorted(files):
        if 'eRegulon' in file:  # e.g. eRegulons_extended.tsv
            file_path = os.path.join(root, file)  # Get the full file path
            df = pd.read_csv(file_path, sep='\t')  # Files are tab-separated
            df['source_file'] = file_path  # Keep provenance of every row
            # Label rows with the first directory below base_directory (the
            # per-run folder). Computing it relative to base_directory keeps
            # this correct for any base path depth and path separator.
            df['init'] = os.path.relpath(file_path, base_directory).split(os.sep)[0]
            dataframes.append(df)

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not dataframes:
    raise FileNotFoundError(f"No eRegulon files found under {base_directory!r}")

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df = combined_df[['Gene', 'TF', 'importance_x_rho', 'init']]

# Write the combined DataFrame to a gzip-compressed CSV
combined_df.to_csv('cat_scenic.gz', compression='gzip')

In [53]:
pwd

'/Users/apple/Developer/dcolinmorgan/pyNB/docs'

In [None]:
import pickle

# Single source of truth for the file being loaded (relative to docs/)
RUN_PKL_PATH = '../src/methods/scenic_workflow/results/run_1.pkl'

# Defined up front so later cells see None instead of a NameError when both
# loaders fail.
aa = None

try:
    # Imported inside the try so a missing joblib installation also falls
    # through to the plain-pickle fallback.
    import joblib
    aa = joblib.load(RUN_PKL_PATH)
    print("Successfully loaded with joblib")
except Exception as e:  # deliberately broad: best-effort load with fallback
    print(f"joblib load failed: {e}")
    # Fallback or debugging
    # SECURITY: unpickling executes arbitrary code; only load trusted files.
    try:
        with open(RUN_PKL_PATH, 'rb') as f:
            aa = pickle.load(f)
        print("Successfully loaded with pickle")
    except Exception as e2:
        print(f"pickle load failed: {e2}")


UnpicklingError: invalid load key, '\x0f'.