In [6]:
import os
%run python_functions.py

In [7]:
# Directories (relative to this notebook) that are scanned below for the
# per-gene result files of each parasite: 3D7 and FUP.
ring_3d7_results_dir =  '../results/3D7'
ring_fup_results_dir =  '../results/FUP'

In [8]:
# Collect the 3D7 result files (names starting with 'kt').
# Number of files equals the number of genes.
# os.listdir returns names in arbitrary order, so they are sorted here to keep
# the order in which the files are processed reproducible across machines.
ring_3d7_files = sorted(
    fname for fname in os.listdir(ring_3d7_results_dir) if fname.startswith('kt')
)
print('# of files (genes): {}'.format(len(ring_3d7_files)))

# of files (genes): 4842


In [9]:
# Collect the FUP result files (names starting with 'kt').
# Number of files equals the number of genes.
# os.listdir returns names in arbitrary order, so they are sorted here to keep
# the order in which the files are processed reproducible across machines.
ring_fup_files = sorted(
    fname for fname in os.listdir(ring_fup_results_dir) if fname.startswith('kt')
)
print('# of files (genes): {}'.format(len(ring_fup_files)))

# of files (genes): 4657


______________________________________________
The following two cells combine all the datasets for each parasite into one dataframe.
Each row represents a TAKT analysis of a single gene between two datasets.
Additional columns specify which two datasets (pat1, pat2) were analyzed.

In [10]:

# Load every 3D7 result file through setup_df, then concatenate the per-file
# dataframes into a single dataframe.
ring_3d7_df_list = [
    setup_df(os.path.join(ring_3d7_results_dir, fname))
    for fname in ring_3d7_files
]
ring_3d7_df = pd.concat(ring_3d7_df_list)

In [12]:
# Load every FUP result file through setup_df, then concatenate the per-file
# dataframes into a single dataframe.
ring_fup_df_list = [
    setup_df(os.path.join(ring_fup_results_dir, fname))
    for fname in ring_fup_files
]
ring_fup_df = pd.concat(ring_fup_df_list)

In [15]:
# Preview the first rows of the combined FUP dataframe as a sanity check.
ring_fup_df.head()

Unnamed: 0,shift_0,shift_1,shift_2,shift_3,shift_4,df_1,df_2,kt_diss_range,kt_diss,kt_diss_pval,est_delay,adj_shift,pat1,para1,pat2,para2
PF3D7_0915700,0.266667,0.6,1.0,0.333333,0.0,AS19_FUP,AS18_FUP,1.0,0.0,0.0,12.0,12.0,AS19,FUP,AS18,FUP
PF3D7_0915700,0.2,0.5,0.666667,0.0,0.0,AS19_FUP,AA17_FUP,0.666667,0.0,0.0,9.0,9.0,AS19,FUP,AA17,FUP
PF3D7_0915700,0.466667,0.3,0.5,1.0,1.0,AS19_FUP,AA13_FUP,0.7,0.3,7.5e-05,3.0,3.0,AS19,FUP,AA13,FUP
PF3D7_0915700,0.2,0.6,1.0,0.333333,0.0,AS18_FUP,AA17_FUP,1.0,0.0,0.0,12.0,12.0,AS18,FUP,AA17,FUP
PF3D7_0915700,0.733333,0.0,0.5,0.666667,1.0,AS18_FUP,AA13_FUP,1.0,0.0,0.0,3.0,3.0,AS18,FUP,AA13,FUP


__________________________________________________
The two cells below query the dataframes generated above to identify genes that:
1. have similar dynamics across all β-globin types within each parasite (TAKT p-value ≤ 0.05)
2. have their two HbAA replicates phased within +/- 3, 6, or 9 hours of each other
3. have both HbAS replicates phased more than +/- 3, 6, or 9 hours from each HbAA replicate

The output is a dictionary for each parasite that contains lists of genes for each shift

In [16]:
# Phase-shift cutoffs and TAKT p-value thresholds to screen.
shift_list = [3, 6, 9]
pval_list = [0.05]

# shift_fup_dict[shift][pval] -> list of genes that pass all three checks below.
shift_fup_dict = {sh: {p: [] for p in pval_list} for sh in shift_list}

# groupby(level=0) hands over all rows of one gene at a time in a single pass
# (instead of re-filtering the whole dataframe for every gene) and yields the
# genes in sorted order, so the resulting gene lists are the same on every run.
for gene, gene_df in ring_fup_df.groupby(level=0, sort=True):
    for single_shift in shift_list:
        for single_pval in pval_list:
            # Parasite-specific comparisons.
            # NOTE(review): single_gene_analysis is defined outside this notebook;
            # its result is indexed here by parasite label ('FUP') - confirm what
            # the returned values count before changing the thresholds below.
            # genes between AA samples have similar shape and no shift greater than 'single_shift'
            AA_same = single_gene_analysis(gene_df, 'AA', 'AA', single_shift, single_pval, no_shift=True)
            # genes between AS samples have similar shape and no shift greater than 'single_shift'
            AS_same = single_gene_analysis(gene_df, 'AS', 'AS', single_shift, single_pval, no_shift=True)
            # genes between AS and AA samples have similar shape and shifts greater than 'single_shift'
            AAAS_diff = single_gene_analysis(gene_df, 'AA', 'AS', single_shift, single_pval, no_shift=False)

            if AA_same['FUP'] == 1 and AS_same['FUP'] == 1 and AAAS_diff['FUP'] >= 2:
                shift_fup_dict[single_shift][single_pval].append(gene)

In [17]:
# Phase-shift cutoffs and TAKT p-value thresholds to screen.
shift_list = [3, 6, 9]
pval_list = [0.05]

# shift_3d7_dict[shift][pval] -> list of genes that pass all three checks below.
shift_3d7_dict = {sh: {p: [] for p in pval_list} for sh in shift_list}

# groupby(level=0) hands over all rows of one gene at a time in a single pass
# (instead of re-filtering the whole dataframe for every gene) and yields the
# genes in sorted order, so the resulting gene lists are the same on every run.
for gene, gene_df in ring_3d7_df.groupby(level=0, sort=True):
    for single_shift in shift_list:
        for single_pval in pval_list:
            # Parasite-specific comparisons.
            # NOTE(review): single_gene_analysis is defined outside this notebook;
            # its result is indexed here by parasite label ('3D7') - confirm what
            # the returned values count before changing the thresholds below.
            # genes between AA samples have similar shape and no shift greater than 'single_shift'
            AA_same = single_gene_analysis(gene_df, 'AA', 'AA', single_shift, single_pval, no_shift=True)
            # genes between AS samples have similar shape and no shift greater than 'single_shift'
            AS_same = single_gene_analysis(gene_df, 'AS', 'AS', single_shift, single_pval, no_shift=True)
            # genes between AS and AA samples have similar shape and shifts greater than 'single_shift'
            AAAS_diff = single_gene_analysis(gene_df, 'AA', 'AS', single_shift, single_pval, no_shift=False)

            if AA_same['3D7'] == 1 and AS_same['3D7'] == 1 and AAAS_diff['3D7'] >= 2:
                shift_3d7_dict[single_shift][single_pval].append(gene)

In [39]:
# Combine the gene lists from above into one dataframe and save it as CSV.
# Each column ('3D7_<shift>' / 'FUP_<shift>') holds the genes selected for that
# parasite and shift. The lists differ in length, so pd.concat(axis=1) pads the
# shorter columns with NaN, which to_csv writes as empty cells.
pval = 0.05  # the single p-value threshold used as key in both dictionaries
shift_df_list = []
for shift in shift_list:
    shift_df_list.append(pd.DataFrame(shift_3d7_dict[shift][pval], columns=[f'3D7_{shift}']))
    shift_df_list.append(pd.DataFrame(shift_fup_dict[shift][pval], columns=[f'FUP_{shift}']))
shift_df = pd.concat(shift_df_list, axis=1)
shift_df.to_csv('../results/TAKT_shifts_gene_list.csv', index=False)