In [88]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter
from bioinfokit import analys, visuz
from tqdm import tqdm

In [89]:
results_path = os.path.join('computed_stats', 'kruskal.csv')
# Presumably the significance level for the corrected p-values; it is not
# referenced anywhere in the visible part of the notebook -- TODO confirm.
alpha = 1e-9

# Load the Kruskal-Wallis results, discard the index column written by a
# previous to_csv, and drop rows containing NaN (STRs where all populations
# had the same repeat count).
results = (
    pd.read_csv(results_path)
    .drop(columns=['Unnamed: 0'])
    .dropna()
    .reset_index(drop=True)
)

# Bonferroni correction: multiply by the number of remaining tests and cap at 1.
# The cap is applied with clip() and assigned back in a single statement; the
# earlier `results[col].where(..., inplace=True)` form modifies a temporary
# column object and leaves `results` untouched when pandas Copy-on-Write is on.
results['kruskal_pval_bonf'] = (results['kruskal_pval'] * len(results)).clip(upper=1)

In [90]:
# Chromosome name -> de-duplicated per-chromosome STR table, filled lazily by
# the functions below so each CSV is read from disk at most once per kernel.
str_data_cache = dict()

In [91]:
data_dir_path = 'preprocessed_data'

# chrom -> (table, {position: first row number}). The table object is stored
# next to its lookup so a replaced str_data_cache entry is detected and re-indexed.
_str_row_lookup_cache = {}


def _get_locus_row(chrom, pos):
    """Return the first row of chromosome `chrom` whose position equals `pos`, or None."""
    if chrom not in str_data_cache:
        print(chrom)
        str_data = pd.read_csv(os.path.join(data_dir_path, f'{chrom}.csv'))
        str_data_cache[chrom] = str_data.drop_duplicates()
    str_data = str_data_cache[chrom]

    cached = _str_row_lookup_cache.get(chrom)
    if cached is None or cached[0] is not str_data:
        # Built once per chromosome so each locus is a dict hit instead of a
        # boolean scan over the whole position column.
        first_row_at = {}
        for row_number, position in enumerate(str_data['position'].tolist()):
            # setdefault keeps the FIRST row per position, matching `.iloc[0]`
            # on a filtered frame.
            first_row_at.setdefault(position, row_number)
        cached = (str_data, first_row_at)
        _str_row_lookup_cache[chrom] = cached

    row_number = cached[1].get(pos)
    if row_number is None:
        return None
    return str_data.iloc[row_number]


def find_pop_expansion(chrom, pos):
    """Record expansion statistics for one STR locus in the module-level `stats` list.

    The per-population `diffs_<POP>` strings of the locus are parsed into copy
    numbers (base-pair difference divided by `motif_len`, truncated toward
    zero). If the largest copy number over all populations is at least 10, one
    row is appended to `stats`:

        [chrom, pos, motif_len, max_cn, max_pop, second_cn,
         counts_5, counts_10, counts_15, counts_mid]

    where `second_cn` is the second-largest per-population maximum and each
    `counts_*` is the number of values in the top population that are
    >= max_cn - 5 / - 10 / - 15, or >= the midpoint of max_cn and second_cn.

    Args:
        chrom: chromosome name; key of `str_data_cache` and stem of the CSV
            file under `data_dir_path`.
        pos: value matched against the `position` column.

    Returns:
        np.nan when the locus is not in the table or any population has an
        empty diffs list; None otherwise (whether or not a row was appended).

    Side effects:
        Appends to the global `stats`, which must exist before the call.
        Prints the chromosome name the first time its CSV is loaded.
    """
    example_data = _get_locus_row(chrom, pos)
    if example_data is None:
        # Previously an opaque IndexError from `.iloc[0]`; treated as "no data".
        return np.nan

    # Get diffs for each population
    motif_len = example_data.motif_len
    maximums = []
    diffs_dict = {}
    for pop in ['AMR', 'AFR', 'EAS', 'EUR', 'SAS']:
        diffs = example_data[f'diffs_{pop}'].strip(' []').split(',')
        if diffs == ['']:
            return np.nan
        # int(a / b) truncates toward zero (unlike //, which floors negatives).
        diffs = [int(int(d.strip()) / motif_len) for d in diffs]
        diffs_dict[pop] = diffs
        maximums.append((max(diffs), pop))

    # Largest per-population maximum first; ties are ordered by population name.
    maximums.sort(reverse=True)
    max_cn, max_pop = maximums[0]
    second_cn = maximums[1][0]
    if max_cn < 10:
        return

    top_pop_diffs = diffs_dict[max_pop]

    def count_at_least(threshold):
        return sum(1 for x in top_pop_diffs if x >= threshold)

    counts_5 = count_at_least(max_cn - 5)
    counts_10 = count_at_least(max_cn - 10)
    counts_15 = count_at_least(max_cn - 15)
    counts_mid = count_at_least((max_cn + second_cn) / 2)

    stats.append([chrom, pos, motif_len, max_cn,
                  max_pop, second_cn, counts_5, counts_10, counts_15, counts_mid])

In [92]:
# Start from an empty accumulator so re-running this cell does not append
# duplicates; find_pop_expansion() adds one row per qualifying locus.
# The recorded run processed 1,273,311 loci in roughly 1h40m.
stats = []
loci = list(zip(results.chr, results.position))
for chrom, pos in tqdm(loci):
    find_pop_expansion(chrom, pos)

  0%|          | 0/1273311 [00:00<?, ?it/s]

chr1


  9%|▊         | 110018/1273311 [08:32<1:27:05, 222.63it/s]

chr2


 17%|█▋        | 217775/1273311 [16:55<1:17:30, 226.96it/s] 

chr3


 24%|██▍       | 306770/1273311 [23:53<1:10:37, 228.10it/s]

chr4


 30%|███       | 387521/1273311 [30:18<1:06:38, 221.51it/s] 

chr5


 37%|███▋      | 465591/1273311 [36:30<1:00:12, 223.59it/s]

chr6


 43%|████▎     | 543167/1273311 [42:40<54:16, 224.20it/s]   

chr7


 48%|████▊     | 613902/1273311 [48:14<47:46, 230.03it/s]  

chr8


 53%|█████▎    | 678724/1273311 [53:19<44:36, 222.17it/s]  

chr9


 57%|█████▋    | 731774/1273311 [57:29<39:28, 228.67it/s]  

chr10


 62%|██████▏   | 793340/1273311 [1:02:17<34:44, 230.25it/s]

chr11


 67%|██████▋   | 852020/1273311 [1:06:54<31:17, 224.43it/s]  

chr12


 72%|███████▏  | 919092/1273311 [1:12:08<26:01, 226.89it/s]  

chr13


 75%|███████▌  | 960666/1273311 [1:15:23<22:21, 233.10it/s]  

chr14


 79%|███████▉  | 1003541/1273311 [1:18:43<18:51, 238.37it/s] 

chr15


 82%|████████▏ | 1042321/1273311 [1:21:45<16:47, 229.37it/s]  

chr16


 85%|████████▌ | 1084977/1273311 [1:25:04<14:04, 223.13it/s]  

chr17


 89%|████████▉ | 1132157/1273311 [1:28:41<10:11, 230.97it/s] 

chr18


 92%|█████████▏| 1165620/1273311 [1:31:19<07:47, 230.46it/s] 

chr19


 95%|█████████▍| 1207707/1273311 [1:34:31<04:44, 230.70it/s] 

chr20


 97%|█████████▋| 1238260/1273311 [1:36:55<02:29, 234.49it/s] 

chr21


 98%|█████████▊| 1253727/1273311 [1:38:05<01:24, 231.08it/s]

chr22


100%|██████████| 1273311/1273311 [1:39:35<00:00, 213.10it/s]


In [115]:
# One row per locus collected in `stats`; `dif_cn` is the gap between the
# largest and the second-largest per-population maximum copy number.
stats_df = pd.DataFrame(
    stats,
    columns=[
        'chr', 'pos', 'motif',
        'max_cn', 'max_pop', 'second_cn',
        'more_5', 'more_10', 'more15', 'more_mid',
    ],
).assign(dif_cn=lambda frame: frame['max_cn'] - frame['second_cn'])

# Persist the full table as tab-separated text.
stats_df.to_csv("expansion_stats.csv", sep="\t", index=False)

# Ascending list of copy-number gaps.
diff = stats_df['dif_cn'].sort_values(ignore_index=True).tolist()



In [164]:
# Writes stats_df to the same tab-separated file as the cell that builds it.
stats_df.to_csv(path_or_buf="expansion_stats.csv", sep="\t", index=False)


In [131]:
# Examine found expansions

def plot(chrom, pos):
    """Show KDE curves of copy-number difference from the reference, one per super population.

    The chromosome table is taken from `str_data_cache` (read from
    `data_dir_path` and cached on first use, printing the chromosome name).
    The first row whose `position` equals `pos` supplies the `diffs_<POP>`
    strings and `motif_len`.

    Returns -1, without plotting, as soon as a population with an empty diffs
    list is encountered; otherwise displays the figure and returns None.
    """
    # Fetch (or lazily load) the table for this chromosome
    if chrom in str_data_cache:
        chrom_table = str_data_cache[chrom]
    else:
        print(chrom)
        raw_table = pd.read_csv(os.path.join(data_dir_path, f'{chrom}.csv'))
        str_data_cache[chrom] = raw_table.drop_duplicates()
        chrom_table = str_data_cache[chrom]

    locus = chrom_table[chrom_table.position == pos].iloc[0]

    # Long-format records: one per (population, base-pair difference)
    records = []
    for super_pop in ('AMR', 'AFR', 'EAS', 'EUR', 'SAS'):
        tokens = locus[f'diffs_{super_pop}'].strip(' []').split(',')
        if tokens == ['']:
            return -1
        parsed = [int(token.strip()) for token in tokens]
        for n_bases in parsed:
            records.append({'Super Population': super_pop, 'diff_n_bases': n_bases})

    diff_data = pd.DataFrame(records)
    # Base-pair difference -> (possibly fractional) number of repeat units
    diff_data['Copy Num. From Ref.'] = diff_data.diff_n_bases / locus.motif_len

    # Each population's density is normalised on its own (common_norm=False)
    kde_options = dict(
        data=diff_data,
        x='Copy Num. From Ref.',
        hue='Super Population',
        kind="kde",
        common_norm=False,
        common_grid=True,
        bw_adjust=1.5,
    )
    sns.displot(**kde_options)
    plt.suptitle(f"STR Copy Number Distribution for {chrom}:{pos}")
    plt.tight_layout()
    plt.show()

def examine(chrom, pos, threshold):
    """Count, per super population, the copy numbers strictly above `threshold`.

    Looks up the first row at `pos` in the cached table for `chrom` (loading
    and caching the CSV on first use), prints `chrom pos motif_len`, and
    converts each base-pair difference to a copy number by dividing by
    `motif_len` and truncating toward zero.

    Returns:
        dict mapping population code -> number of values > threshold, or
        np.nan as soon as a population with an empty diffs list is encountered.
    """
    # Fetch (or lazily load) the table for this chromosome
    if chrom in str_data_cache:
        chrom_table = str_data_cache[chrom]
    else:
        print(chrom)
        raw_table = pd.read_csv(os.path.join(data_dir_path, f'{chrom}.csv'))
        str_data_cache[chrom] = raw_table.drop_duplicates()
        chrom_table = str_data_cache[chrom]
    locus = chrom_table[chrom_table.position == pos].iloc[0]

    repeat_unit_len = locus.motif_len
    print(chrom, pos, repeat_unit_len)

    above_threshold = {}
    for super_pop in ('AMR', 'AFR', 'EAS', 'EUR', 'SAS'):
        listing = locus[f'diffs_{super_pop}'].strip(' []')
        if not listing:
            # An empty listing means this population has no values
            return np.nan
        copy_numbers = [int(int(token.strip()) / repeat_unit_len)
                        for token in listing.split(',')]
        above_threshold[super_pop] = sum(1 for cn in copy_numbers if cn > threshold)
    return above_threshold

# Walk the selected expansions: print the per-population counts above
# 15 copies, then draw the distribution for the same locus.
# NOTE(review): `expansions` is built in the "Final table" cell further down,
# so that cell has to be executed before this loop.
for chrom, pos in zip(expansions['chr'], expansions['pos']):
    print(examine(chrom, pos, 15))
    plot(chrom, pos)
    


In [165]:
#Extract gene information

# Headerless, tab-separated annotation table; lines starting with '#' are skipped.
# Column 0 is read as text and the file is parsed in one pass: otherwise pandas
# infers types chunk by chunk, which yields a mixed int/str column and a
# DtypeWarning when sequence names are partly numeric -- presumably the source
# of the stray warning this cell used to emit; verify on the next run.
genes = pd.read_csv(
    "Homo_sapiens.GRCh38.108.gtf",
    sep="\t",
    comment="#",
    header=None,
    dtype={0: str},
    low_memory=False,
)
# Prefix with "chr" so the names are comparable with the `chr` column used by find_genes().
genes[0] = "chr" + genes[0].astype(str)


def find_genes(row):
    """Return the names of all annotated features overlapping a locus.

    A record of the module-level `genes` table overlaps when column 0 equals
    `row['chr']` and `row['pos']` lies within [column 3, column 4] inclusive.
    The `gene_name` attribute is taken from column 8 of every overlapping
    record.

    Args:
        row: mapping (e.g. a DataFrame row) with keys 'chr' and 'pos'.

    Returns:
        Comma-separated unique gene names in alphabetical order, or None when
        no record overlaps or none of them carries a gene_name.
    """
    overlaps = (
        (genes[0] == row['chr'])
        & (genes[3] <= row['pos'])
        & (genes[4] >= row['pos'])
    )

    gene_set = set()
    for attributes in genes.loc[overlaps, 8]:
        if not isinstance(attributes, str):
            # Missing attribute cell (NaN): nothing to parse.
            continue
        for field in attributes.split(";"):
            # Fields look like `key "value"`; match the key exactly rather than
            # testing for the substring anywhere in the field.
            key, _, value = field.strip().partition(" ")
            if key != "gene_name":
                continue
            name = value.strip().replace('"', '')
            if name:
                gene_set.add(name)

    if not gene_set:
        return None

    # Sorted so the output does not depend on set iteration order, which can
    # change from one interpreter run to the next.
    return ",".join(sorted(gene_set))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [169]:
# Final table

# Keep loci where the top population exceeds the runner-up by at least 15
# copies and more than 10 values lie at or above the midpoint between them;
# order by the size of the gap.
expansions = stats_df.loc[
    lambda frame: (frame['dif_cn'] >= 15) & (frame['more_mid'] > 10)
].sort_values(by='dif_cn')

def aggregate_info(row):
    """Map copy-number thresholds to the number of values at or above them.

    Args:
        row: mapping with 'max_cn', 'second_cn', 'more_5', 'more_10', 'more15'
            and 'more_mid' (one row of the expansion statistics table).

    Returns:
        dict {threshold: count} with keys max_cn - 5, max_cn - 10, max_cn - 15
        and the midpoint threshold between max_cn and second_cn.
    """
    max_cn = row['max_cn']

    # `more_mid` was counted as x >= (max_cn + second_cn) / 2 over integer copy
    # numbers, so for an odd sum the smallest qualifying value is the midpoint
    # rounded UP. Truncating instead labelled the count one copy too low and
    # could overwrite a max_cn - k entry that holds a different count.
    mid = int((max_cn + row['second_cn'] + 1) // 2)

    expansion_dict = {
        max_cn - 5: row['more_5'],
        max_cn - 10: row['more_10'],
        max_cn - 15: row['more15'],
    }
    # If mid coincides with one of the keys above, both describe the same
    # threshold and therefore the same count.
    expansion_dict[mid] = row['more_mid']
    return expansion_dict
    
# Annotate each expansion with overlapping gene names and a compact
# {threshold: count} summary, then drop the raw count columns it replaces.
expansions['gene'] = expansions.apply(find_genes, axis=1)
expansions['size:number'] = expansions.apply(aggregate_info, axis=1)
expansions = expansions.drop(
    columns=['more_5', 'more_10', 'more15', 'more_mid', 'dif_cn']
)

# Export ordered by the largest copy number, tab-separated.
expansions.sort_values(by="max_cn").to_csv("large_expansions.csv", index=False, sep="\t")