## Select the top 200 Most Occurring Unknown genes and the 200 Least Occurring Unknown genes

We only count the number of cells in which each gene is detected (expression greater than zero); the level of expression is not taken into account.

In [None]:
# Load worm cat data and select the Unassigned

import anndata as ad
import pandas as pd
from scipy.sparse import csr_matrix

# Note: if the CSV file is not in the working directory, look in the 'backup-runs' directory.
worm_cat_df = pd.read_csv('whole_genome_v2_nov-11-2021.csv')

# Keep only the genes whose Category 1 is 'Unassigned'. reset_index() returns a
# new frame numbered 0..n-1 instead of mutating the filtered slice in place, so
# columns added to unassigned_df later are not written into a slice of worm_cat_df.
unassigned_df = worm_cat_df[worm_cat_df['Category 1'] == 'Unassigned'].reset_index(drop=True)
print(f"We have {len(unassigned_df):,} Unassigned genes in Category 1")


In [None]:
# Load the h5ad file and set common variables
# read_h5ad is the explicit reader; the bare `ad.read` alias is deprecated in anndata.
ad_worm_aging = ad.read_h5ad("ad_worm_aging.h5ad")

# Dense copy of the expression matrix. Rows and columns get default 0..n-1 labels,
# so a column label doubles as the gene's position in var_df below.
x_df = pd.DataFrame(data=ad_worm_aging.X.toarray())

# reset_index(drop=True) returns re-numbered copies, so the obs/var tables that
# live inside ad_worm_aging keep their original labels.
obs_df = ad_worm_aging.obs.reset_index(drop=True)

var_df = ad_worm_aging.var.reset_index(drop=True)


In [None]:
# Get the distinct values of the 'annotate_name' observation column (one per cell type)
# NOTE(review): a later cell defines a function that is also called `cell_types`;
# once that cell has run, this name refers to the function instead of this array.
cell_types = obs_df['annotate_name'].unique()
print(f"We have {len(cell_types)} unique Cell types")

In [None]:
# Find the position of each Wormbase ID in var_df, which is also its column position in x_df.
# Each column in x_df is a gene, so the mapped position lets us index straight to the associated gene.
import warnings
# Silence pandas' SettingWithCopyWarning for the rest of the session; the cell
# below adds a column to unassigned_df, which was produced by filtering worm_cat_df.
warnings.simplefilter(action="ignore", category=pd.errors.SettingWithCopyWarning)

def find_index(row):
    """Return the first var_df row label whose 'gene_ids' equals row['Wormbase ID'].

    `row` is anything indexable by 'Wormbase ID' (here, one row of the WormCat
    table). Returns None when var_df has no matching entry.
    """
    matching_labels = var_df.index[var_df['gene_ids'] == row['Wormbase ID']].tolist()
    return matching_labels[0] if matching_labels else None

# assign() returns a new frame with the extra column instead of writing the
# column into unassigned_df itself, which was produced by filtering worm_cat_df.
unassigned_df = unassigned_df.assign(gene_index=unassigned_df.apply(find_index, axis=1))

# Genes without a match have a missing gene_index.
missing_count = unassigned_df['gene_index'].isna().sum()
print(f"We have {missing_count} Unassigned genes that are not represented in the dataset")
print(f"That is %{round(missing_count/len(unassigned_df)*100,2)} of Unassigned genes")

# unassigned_in_ds_df holds the unassigned genes that are also in the dataset.
# copy() makes it independent of unassigned_df.
unassigned_in_ds_df = unassigned_df[unassigned_df['gene_index'].notna()].copy()
print(f"After removing the not represented rows we have {len(unassigned_in_ds_df['gene_index']):,} Unassigned genes")

In [None]:
# For each unknown gene, count the cells/observations in which its entry is non-zero.
# The result is a dictionary with gene index as key and cell count as value.

import time

results_dict = {}
start = time.time()
for index in unassigned_in_ds_df['gene_index']:
    # Only this gene's column is needed. Masking the whole of x_df and then
    # counting every column, once per gene, is what made this cell slow.
    results_dict[index] = (x_df[index] > 0).sum()

end = time.time()
m, s = divmod(round(end - start), 60)
h, m = divmod(m, 60)
print(f"Time for call hours {h:d} minutes {m:02d} seconds {s:02d} ")
# Re-insert the entries in ascending order of cell count (dicts keep insertion order).
results_dict_sorted = dict(sorted(results_dict.items(), key=lambda item: item[1]))


In [None]:
# Select the 200 least occurring genes and the 200 most occurring genes

# results_dict_sorted is ordered by ascending cell count, so its keys are as well:
# the front of the list is the least occurring, the back the most occurring.
results_lst_sorted = [gene_index for gene_index in results_dict_sorted]
first_200 = results_lst_sorted[:200]
last_200 = results_lst_sorted[-200:]

def find_wormbase_id(index):
    """Return the 'gene_ids' value stored in var_df at row label `index`."""
    # Read the cell by (row label, column label) directly. The previous form
    # built a one-element Series labelled 'gene_ids' and read it with [0];
    # treating an integer key as a position on a non-integer index is
    # deprecated in pandas.
    return var_df.loc[index, 'gene_ids']

# Translate the gene positions back to Wormbase IDs and export one CSV per group.
first_200_lst = [find_wormbase_id(gene_index) for gene_index in first_200]
first_200_df = pd.DataFrame(first_200_lst, columns=['wormbase_id'])
first_200_df.to_csv("least_occurring_unknown.csv", index=False)

last_200_lst = [find_wormbase_id(gene_index) for gene_index in last_200]
last_200_df = pd.DataFrame(last_200_lst, columns=['wormbase_id'])
last_200_df.to_csv("most_occurring_unknown.csv", index=False)

## Select all Unassigned: membrane spanning domain genes from wormcat

In [None]:
import random

worm_cat_df = pd.read_csv('whole_genome_v2_nov-11-2021.csv')

# Filter on Category 3, then re-number the rows in a new frame rather than
# resetting the index of the filtered slice in place.
is_membrane_spanning = worm_cat_df['Category 3'] == 'Unassigned: membrane spanning domain'
unassigned_msd_df = worm_cat_df[is_membrane_spanning].reset_index(drop=True)
unassigned_msd_series = unassigned_msd_df['Wormbase ID']

# One-column frame of IDs, exported under the 'wormbase_id' header.
unassigned_msd_ids_df = unassigned_msd_series.to_frame(name='wormbase_id')
unassigned_msd_ids_df.to_csv('membrane_unassigned.csv', index=False)

## Select 200 Random Unassigned Genes from Wormcat Category 1

In [None]:
# The genes below are Worm Cat Category 1 Unassigned
import random

worm_cat_df = pd.read_csv('whole_genome_v2_nov-11-2021.csv')

# Category 1 'Unassigned' genes, re-numbered 0..n-1 in a new frame (no in-place
# mutation of the filtered slice).
# NOTE(review): this rebinds unassigned_df to a freshly loaded frame, so the
# 'gene_index' column added in an earlier cell is not present on it.
unassigned_df = worm_cat_df[worm_cat_df['Category 1'] == 'Unassigned'].reset_index(drop=True)


def select_random(unassigned_df, max_genes=200, seed=None):
    """Pick up to `max_genes` distinct rows at random and return their Wormbase IDs.

    Parameters
    ----------
    unassigned_df : pandas.DataFrame
        Frame with a 'Wormbase ID' column. Rows are addressed by position, so
        the frame's index labels do not matter.
    max_genes : int, default 200
        Number of genes to draw. Capped at the number of rows available;
        values below zero are treated as zero.
    seed : int, optional
        When given, the draw uses its own random.Random(seed) generator and is
        reproducible. When omitted, the module-level `random` generator is used.

    Returns
    -------
    pandas.DataFrame
        A single 'wormbase_id' column with one row per selected gene.
    """
    row_count = unassigned_df.shape[0]
    # Never ask for more rows than exist; the earlier retry loop could not
    # terminate in that case.
    sample_size = max(0, min(max_genes, row_count))

    rng = random if seed is None else random.Random(seed)
    # sample() draws without replacement from 0..row_count-1. The earlier
    # randint(0, row_count) call has an inclusive upper bound, so it could
    # return row_count itself, which is one past the last row.
    positions = rng.sample(range(row_count), sample_size)

    selected_ids = unassigned_df['Wormbase ID'].iloc[positions].tolist()
    return pd.DataFrame({'wormbase_id': selected_ids})
    
# Draw the random genes using select_random's default max_genes
random_unassigned_df = select_random(unassigned_df)

# Write the selected IDs to CSV (without the index column) and show them
random_unassigned_df.to_csv('random_unassigned.csv',index=False)
print(random_unassigned_df)

## Hypothesis Cell Types

* Discover unknown genes that are isolated to a specific cell type

* Discover unknown genes that are pervasive across cell types

* Identify cell types with no unknown genes


In [None]:
# Distinct values of the 'annotate_name' observation column (same expression as in the earlier cell)
cell_types = obs_df['annotate_name'].unique()

In [None]:
# Map every cell type to the list of obs_df row labels (observations) annotated with it

cell_type_dict = {
    cell_type: obs_df.index[obs_df['annotate_name'] == cell_type].tolist()
    for cell_type in cell_types
}

In [None]:
import numpy as np


# Take a single observation and count the unknown genes that are present
def unknown_gene_count(cell_attribs, unknown_genes_array, unassigned_in_ds_df):
    """Add one to the counter of every unknown gene expressed in this observation.

    Parameters
    ----------
    cell_attribs : pandas.Series or numpy.ndarray
        Expression values of one observation, addressed by position.
    unknown_genes_array : numpy.ndarray
        Running counters, one per row of `unassigned_in_ds_df` and in the same
        order. Updated in place.
    unassigned_in_ds_df : pandas.DataFrame
        Frame whose 'gene_index' column holds each gene's position in
        `cell_attribs`. The values are truncated to integers.

    Returns
    -------
    numpy.ndarray
        The same `unknown_genes_array` object, after the update.
    """
    gene_positions = unassigned_in_ds_df['gene_index'].to_numpy().astype(int)
    # A gene counts as present when it is expressed at any level above zero.
    # One vectorised lookup replaces converting the whole observation to a
    # Python list and testing the genes one by one.
    is_expressed = np.asarray(cell_attribs)[gene_positions] > 0
    unknown_genes_array[is_expressed] += 1
    return unknown_genes_array
        

# Used for testing    
# cell_attribs = x_df.loc[0]
# print(len(cell_attribs))
# unknown_genes_array = np.zeros(len(unassigned_df['gene_index']), dtype = int)
# ret_val = unknown_gene_count(cell_attribs,unknown_genes_array)
# print(len(ret_val))
# print(ret_val.sum())

In [None]:
import time
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

start = time.time()
# Gene-level columns that are kept next to the per-cell-type counts
gene_info_df = unassigned_in_ds_df.drop(
    columns=['Category 1', 'Category 2', 'Category 3', 'Automated Description']
)

# Boolean observations x unknown-genes table: True where the gene is expressed
# at any level. Its columns follow the row order of unassigned_in_ds_df.
gene_positions = unassigned_in_ds_df['gene_index'].to_numpy().astype(int)
expressed_df = x_df.iloc[:, gene_positions] > 0

# For every cell type, count per gene the observations of that type in which
# the gene is expressed. Summing the boolean rows replaces the
# observation-by-observation Python loop.
counts_by_cell_type = {
    key: expressed_df.loc[observations].to_numpy().sum(axis=0)
    for key, observations in cell_type_dict.items()
}

# Attach all cell-type columns in a single concat instead of inserting them one
# at a time, which fragments the frame.
unassigned_mapped_to_cell_type_df = pd.concat(
    [gene_info_df, pd.DataFrame(counts_by_cell_type, index=gene_info_df.index)],
    axis=1,
)

end = time.time()
m, s = divmod(round(end - start), 60)
h, m = divmod(m, 60)
print(f"Time for call hours {h:d} minutes {m:02d} seconds {s:02d} ")
unassigned_mapped_to_cell_type_df.to_csv("unassigned_mapped_to_cell_type_df.csv", index=False)

In [None]:
# Identify cell types with no unknown genes

# All cell types have some level of interaction with unknown genes.
# '3_3:germline, sperm' has the lowest level with 320 counts; this makes sense since only 4 observations make up this cell type [2003, 15156, 17008, 40736].
# '0_0:seam cell' has the highest with 321,367; this may also make sense since 2,202 observations contribute to this cell type.

# Total of each cell type's column in the mapped frame
cell_types_gene_count_dict = {
    key: unassigned_mapped_to_cell_type_df[key].sum() for key in cell_type_dict
}

# Report the cell types from the smallest total to the largest
cell_types_gene_count_dict_sorted = dict(
    sorted(cell_types_gene_count_dict.items(), key=lambda item: item[1])
)
for key, total in cell_types_gene_count_dict_sorted.items():
    print(f"{key:<60} {total:>5,}")
    


In [None]:
# Discover unknown genes that are isolated to a specific cell type

# Start with a one-column frame holding just the Wormbase IDs of the unknowns
temp_df = unassigned_mapped_to_cell_type_df[['Wormbase ID']].copy()
print(temp_df.shape[0])

# Helper that totals the cell-type columns of one row
def sum_category_counts(row):
    """Return the sum of every value in `row` from the fourth position onward.

    The first three positions are skipped.
    NOTE(review): presumably these are the 'Sequence ID', 'Wormbase ID' and
    'gene_index' columns - confirm against the frame's column order.
    """
    cell_type_values = row.iloc[3:]
    return cell_type_values.sum()
    
# Add a 'counts' column with the total over all individual cell types
temp_series = unassigned_mapped_to_cell_type_df.apply(sum_category_counts, axis=1)
temp_df['counts'] = temp_series

# Order the genes by ascending 'counts' and write them to a csv file
temp_df = temp_df.sort_values(by='counts')
temp_df.to_csv('isolated2cellType.csv', index=False)
temp_df

In [None]:
# Given a row with a wormbase ID look up the same 'Wormbase ID' in the unassigned_mapped_to_cell_type_df dataframe
# This dataframe has the counts of the number of times that the 'Wormbase ID' was expressed in any observation for this cell type
# It does not include the level of expression only if the gene was on or not

# Cell type names in the order of cell_type_dict's keys
cell_type_list = list(cell_type_dict)

# NOTE(review): this function takes over the name `cell_types`, which earlier
# cells bind to the array of unique cell type names.
def cell_types(row_a):
    """Return the cell types in which the gene of `row_a` has a positive count.

    The gene is looked up by row_a['Wormbase ID'] in
    unassigned_mapped_to_cell_type_df and the first matching row is used. The
    first three values of that row are skipped; the value at position 3 + i is
    paired with cell_type_list[i]. The resulting list is printed and returned.
    """
    same_gene = unassigned_mapped_to_cell_type_df['Wormbase ID'] == row_a['Wormbase ID']
    row = unassigned_mapped_to_cell_type_df.loc[same_gene].iloc[0]

    cell_type_lst = [
        cell_type_list[offset]
        for offset, attrib in enumerate(row.iloc[3:])
        if attrib > 0
    ]

    print(cell_type_lst)
    return cell_type_lst


   

In [None]:
# Here we get down to the details of unknown genes that are isolated to a specific cell type
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Keep the genes whose total count over all cell types is below 3. copy() makes
# this an independent frame, so the new column is not assigned into a slice of temp_df.
temp_df_20 = temp_df.loc[temp_df['counts'] < 3].copy()
temp_df_20['cell_types'] = temp_df_20.apply(cell_types, axis=1)
# NOTE(review): this is the same path the full temp_df export was written to
# earlier, so that file is replaced - confirm this is intended.
temp_df_20.to_csv('isolated2cellType.csv', index=False)

## Select all Category 1 'Muscle function' genes from wormcat

In [None]:
# Load worm cat data and select the Category 1 'Muscle function' genes

import anndata as ad
import pandas as pd
from scipy.sparse import csr_matrix

worm_cat_df = pd.read_csv('whole_genome_v2_nov-11-2021.csv')

# Build the export frame in one chain so that no step mutates a filtered slice
# of worm_cat_df in place: filter on Category 1, drop the descriptive columns,
# rename the ID column and re-number the rows.
muscle_function_df = (
    worm_cat_df[worm_cat_df['Category 1'] == 'Muscle function']
    .drop(columns=['Sequence ID', 'Category 1', 'Category 2', 'Category 3', 'Automated Description'])
    .rename(columns={'Wormbase ID': 'wormbase_id'})
    .reset_index(drop=True)
)
muscle_function_df.to_csv('muscle_function.csv', index=False)
print(f"Muscle function = {len(muscle_function_df):,}")
print(muscle_function_df)