In [None]:
import gc
import os
import time
import warnings

import anndict as adt
import scanpy as sc
warnings.filterwarnings('ignore')

In [None]:
# Configuration for the LLM providers.
# The API key is read from the ANTHROPIC_API_KEY environment variable so that a real
# secret never has to be typed into (and saved with) the notebook. The original
# placeholder string is kept as the fallback when the variable is not set.
provider_config = {
    'anthropic': {
        'provider': 'anthropic',
        'api_key': os.environ.get('ANTHROPIC_API_KEY', 'your-anthropic-api-key-here'),
        'requests_per_minute': 500,  # this is usage tier 2, max 1000
    },
}


# Model identifiers grouped by provider.
# Kept as a reference list; none of the cells shown here read from this dict.
provider_endpoint_dict = {
    'bedrock': [
        'meta.llama3-1-8b-instruct-v1:0',
        'meta.llama3-1-70b-instruct-v1:0',
        'meta.llama3-1-405b-instruct-v1:0',
        'cohere.command-r-plus-v1:0',
        'mistral.mistral-large-2407-v1:0',
    ],
    'google': [
        'gemini-1.5-pro',
        'gemini-1.5-flash',
    ],
    'openai': [
        'gpt-4',
        'gpt-4o',
        'gpt-4o-mini',
    ],
    'anthropic': [
        'claude-3-5-sonnet-20240620',
        'claude-3-opus-20240229',
        'claude-3-haiku-20240307',
    ],
}

In [None]:
# Point anndict at the Anthropic backend.
# Provider name, API key and rate limit all come from provider_config, so each value
# is defined in exactly one place (the rate limit used to be repeated here as a literal).
adt.configure_llm_backend(
    provider=provider_config['anthropic']['provider'],
    model='claude-3-5-sonnet-20240620',
    api_key=provider_config['anthropic']['api_key'],
    requests_per_minute=provider_config['anthropic']['requests_per_minute'],
)

In [None]:
# Read the input data.
# Replace the placeholder below with the path to your own .h5ad file.
adata_path = 'path-to-your-adata.h5ad'
adata = sc.read_h5ad(adata_path)

In [None]:
# Set X to be raw counts.
# The values are copied from the 'raw_counts' layer, so that layer must exist in adata.layers.
adata.X = adata.layers['raw_counts'].copy()

In [None]:
# Optionally restrict the object to protein-coding genes.
#
# Load/define your collection of protein-coding gene names here (list, set, array, ...).
# If it is left as None, annotation will be based on all genes in the object.
protein_coding = None

# Explicit None/length check instead of bare truthiness: `if protein_coding:` raises
# ValueError for NumPy arrays and pandas objects holding more than one element.
if protein_coding is not None and len(protein_coding) > 0:
    # Vectorised membership test of every gene name against the protein-coding set
    adata.var['protein_coding'] = adata.var_names.isin(set(protein_coding))
    # Subset to keep only protein-coding genes
    adata = adata[:, adata.var['protein_coding']].copy()

In [None]:
# Build adata_dict from adata using the 'tissue' column.
# NOTE(review): presumably yields one AnnData per distinct tissue value — confirm against the anndict docs.
adata_dict = adt.build_adata_dict(adata, ['tissue'])

In [None]:
# Strip a standard list of uninformative genes out of adata_dict.
abundant_rnas = [
    "MALAT1", "NEAT1", "XIST", "KCNQ1OT1", "RPPH1",
    "RN7SL1", "RMRP", "SNHG1", "MIAT", "H19",
]

adt.remove_genes_adata_dict(adata_dict, abundant_rnas)

In [None]:
# Run a garbage-collection pass to release unreferenced objects.
# (gc is imported together with the other imports in the first cell.)
gc.collect()

In [None]:
# Preprocess each AnnData in adata_dict independently, ahead of Leiden clustering.
# adata.X was set to raw counts earlier, so the standard preprocessing steps are run here.
# NOTE(review): none of these calls capture a return value, so they presumably modify
# adata_dict in place — confirm against the anndict docs.

# Normalize each AnnData in the dictionary
adt.normalize_adata_dict(adata_dict)

# Log transform each AnnData in the dictionary
adt.log_transform_adata_dict(adata_dict)

# Select the top 2000 high-variance genes. With subset=False the data is presumably only
# flagged (the 'highly_variable' mask used by PCA below) rather than subset — TODO confirm.
adt.set_high_variance_genes_adata_dict(adata_dict, n_top_genes=2000, subset=False)

# Scale each AnnData in the dictionary
adt.scale_adata_dict(adata_dict)

# Perform PCA (50 components) on each AnnData in the dictionary, masked to 'highly_variable'
adt.pca_adata_dict(adata_dict, n_comps=50, mask_var='highly_variable')

# Calculate the neighborhood graph
adt.neighbors_adata_dict(adata_dict)

# Calculate the UMAP
adt.calculate_umap_adata_dict(adata_dict)

# Leiden clustering is left disabled here; it is run in a later cell with
# per-tissue resolutions instead of a single default call.
# adt.leiden_adata_dict(adata_dict)

In [None]:
#Determine appropriate cluster resolutions using AI
#This will leave the final column as 'leiden' in the .obs of each anndata
# appropriate_resolution_dict = adt.ai_determine_leiden_resolution_adata_dict(adata_dict, initial_resolution=0.5)

In [None]:
# Per-tissue Leiden resolutions saved from a previous run
appropriate_resolution_dict = {
    'Bladder': 0.05,
    'Blood': 0.5,
    'Bone_Marrow': 0.5,
    'Ear': 0.5,
    'Eye': 0.05,
    'Fat': 0.05,
    'Heart': 0.05,
    'Kidney': 0.05,
    'Large_Intestine': 0.05,
    'Liver': 0.2,
    'Lung': 0.2,
    'Lymph_Node': 0.5,
    'Mammary': 0.05,
    'Muscle': 0.2,
    'Ovary': 0.05,
    'Pancreas': 0.2,
    'Prostate': 0.05,
    'Salivary_Gland': 0.05,
    'Skin': 0.35,
    'Small_Intestine': 0.05,
    'Spleen': 0.5,
    'Stomach': 0.5,
    'Testis': 0.35,
    'Thymus': 0.5,
    'Tongue': 0.5,
    'Trachea': 0.35,
    'Uterus': 0.2,
    'Vasculature': 0.05,
}

# Fix some that are too low: these tissues are raised to a resolution of 0.5.
tissues_to_raise = (
    'Bladder',
    'Eye',
    'Fat',
    'Heart',
    'Kidney',
    'Large_Intestine',
    'Mammary',
    'Ovary',
    'Prostate',
    'Salivary_Gland',
    'Small_Intestine',
    'Vasculature',
)

# Build a new dict so the saved values in appropriate_resolution_dict stay untouched.
# Each tissue gets its own entry: indexing the dict with the whole tuple
# (d[('Bladder', 'Eye', ...)] = 0.5) would only add a single tuple key and leave
# every per-tissue resolution at its old value.
updated_resolution_dict = {
    **appropriate_resolution_dict,
    **dict.fromkeys(tissues_to_raise, 0.5),
}

In [None]:
# Recluster, because the AI-chosen resolutions (attributed to ChatGPT) seem to have been set too low.
# Get Leiden clusters, passing the per-tissue resolutions held in updated_resolution_dict.
adt.leiden_adata_dict(adata_dict, resolution=updated_resolution_dict)

In [None]:
# Run differential expression analysis, grouping by the 'leiden' cluster labels
adt.rank_genes_groups_adata_dict(adata_dict, groupby='leiden')

In [None]:
# Ask the configured LLM to label each Leiden cluster with a cell type.
# The name of the active model is embedded in the output column names.
model = adt.get_llm_config()['model']
label_results = adt.ai_annotate_cell_type_adata_dict(
    adata_dict,
    groupby='leiden',
    n_top_genes=10,
    label_column=f'{model}_ai_cell_type',
    tissue_of_origin_col='tissue',
)

# The raw labels seem to contain some redundancy, so merge them with AI
ai_label_column = f'{model}_simplified_ai_cell_type'
simplified_mappings = adt.simplify_obs_column_adata_dict(
    adata_dict,
    f'{model}_ai_cell_type',
    ai_label_column,
    simplification_level='redundancy-removed',
)

In [None]:
# Merge the adata_dict.
# Note: this rebinds `adata`, replacing the object that was loaded at the start of the notebook.
adata = adt.concatenate_adata_dict(adata_dict)

In [None]:
# Unify the labels from the different adata in the adata_dict.
# Operates on the merged adata, reading the labels from ai_label_column.
# NOTE(review): new_col_prefix='unified' presumably names the output column(s) — confirm against the anndict docs.
label_map_with_manual = adt.ensure_label_consistency_adata(adata, ai_label_column, simplification_level='unified', new_col_prefix='unified')

In [None]:
# Write the adata to disk.
# Replace the placeholder below with your desired output .h5ad path.
path_to_write_adata = 'your-path-here.h5ad'
adata.write(path_to_write_adata)