In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter

In [2]:
results_path = os.path.join('computed_stats', 'kruskal.csv')
alpha = 1e-9

# Load data
results = pd.read_csv(results_path,index_col = 0)
# Drop STRs where all populations had the same repeat count
results = results.dropna().reset_index(drop=True)

# Correct p-values
results['kruskal_pval_bonf'] = results.kruskal_pval * len(results)
results

  mask |= (ar1 == a)


Unnamed: 0,chr,position,motif_len,kruskal_statistic,kruskal_pval,kruskal_pval_bonf
0,chr1,899025,1,55.443106,2.623557e-11,3.340604e-05
1,chr1,902774,1,455.418393,2.927090e-97,3.727096e-91
2,chr1,904654,3,63.131652,6.366459e-13,8.106483e-07
3,chr1,907237,1,257.607245,1.494804e-54,1.903350e-48
4,chr1,907539,1,305.157243,8.361369e-65,1.064662e-58
...,...,...,...,...,...,...
1273306,chr22,50729989,2,22.971247,1.283120e-04,1.633811e+02
1273307,chr22,50732781,1,11.546172,2.106528e-02,2.682265e+04
1273308,chr22,50733151,3,5.399863,2.486729e-01,3.166379e+05
1273309,chr22,50735036,1,213.379989,4.980803e-45,6.342112e-39


In [3]:
coding_TRs = pd.read_csv("../stats/coding_overlap/TR_intersect.txt", sep = "\t", header=None)
coding_TRs
coding_diff = pd.merge(results, coding_TRs, left_on = ["chr","position"], right_on=[0,1])
coding_diff = coding_diff.sort_values('kruskal_pval_bonf')
coding_diff

Unnamed: 0,chr,position,motif_len,kruskal_statistic,kruskal_pval,kruskal_pval_bonf,0,1,2
1184,chr8,22404715,3,1590.530161,0.0,0.0,chr8,22404715,22404726
1547,chr11,73309324,3,2278.202135,0.0,0.0,chr11,73309324,73309343
1828,chr14,22902047,3,2481.487519,0.0,0.0,chr14,22902047,22902058
1401,chr10,75022076,3,1750.990143,0.0,0.0,chr10,75022076,75022171
1829,chr14,22902057,3,2363.263517,0.0,0.0,chr14,22902057,22902075
...,...,...,...,...,...,...,...,...,...
1575,chr11,118436780,6,0.000000,1.0,1273311.0,chr11,118436780,118436802
1813,chr13,112068246,3,0.000000,1.0,1273311.0,chr13,112068246,112068269
2546,chr19,45064349,3,0.000000,1.0,1273311.0,chr19,45064349,45064372
2656,chr20,21396154,3,0.000000,1.0,1273311.0,chr20,21396154,21396186


In [4]:
str_data_cache = {}

In [5]:
def find_peak(x):
    return Counter(x).most_common(1)[0][0]

In [6]:
data_dir_path = 'preprocessed_data'
stats = []
def plot(chrom, pos, pval):

# Get STR data
    if chrom not in str_data_cache:
        print(chrom)
        str_data = pd.read_csv(os.path.join(data_dir_path, f'{chrom}.csv'))
        str_data_cache[chrom] = str_data.drop_duplicates()
    str_data = str_data_cache[chrom]
    
    example_data = str_data[str_data.position == pos].iloc[0]
    

    # Get diffs for each population and make dataframe
    diff_data = []
    for pop in ['AMR', 'AFR', 'EAS', 'EUR', 'SAS']:
        diffs = example_data[f'diffs_{pop}'].strip(' []').split(',')
        diff_data.extend(
                [{'Super Population': pop, 'diff_n_bases': int(d.strip())} for d in diffs]
                )


    diff_data = pd.DataFrame(diff_data)
    diff_data['Copy Num. From Ref.'] = diff_data.diff_n_bases / example_data.motif_len
    
    
    
        
    grouped = diff_data.groupby('Super Population').agg({'Copy Num. From Ref.' : [np.max, np.min, find_peak]})
    grouped = grouped.droplevel(axis=1, level=0).reset_index()
    stats.append([chrom, pos, max(grouped.amax) - min(grouped.amax), 
                  max(grouped.amin) - min(grouped.amin), 
                  max(grouped.find_peak) - min(grouped.find_peak), pval])
    
#     interesting_loci.append((chrom,pos,gene))
#     # Plot
#     sns.displot(
#         data=diff_data,
#         x='Copy Num. From Ref.',
#         hue='Super Population',
#         kind="kde",
#         common_norm=False,
#         common_grid=True,
#         bw_adjust=1.5
#     )
#     plt.suptitle(f"STR Copy Number Distribution for {chrom}:{pos}")
#     plt.tight_layout()
#     #plt.savefig(os.path.join(plot_save_dir, f"{results.iloc[i].chr}_{results.iloc[i].position}.png"))
#     plt.show()

In [7]:
for index,row in coding_diff.iterrows():
    plot(row['chr'], row['position'], -np.log10(row['kruskal_pval_bonf']))

  


chr8
chr11
chr14
chr10
chr9
chr7
chr6
chr22
chr5
chr4
chr2
chr15
chr20
chr16
chr3
chr19
chr17
chr13
chr18
chr1
chr12
chr21


In [16]:
stats_df = pd.DataFrame(stats, columns = ['chrom', 'pos', 'max_dif', 'min_dif', 'peak_dif', 'pval'])
stats_df = stats_df.replace([np.inf], 300)
stats_df[stats_df['pval'] < 0] = 0




Unnamed: 0,chrom,pos,max_dif,min_dif,peak_dif,pval
0,chr8,22404715,0.0,0.0,2.0,300.0
1,chr11,73309324,0.0,0.0,1.0,300.0
2,chr14,22902047,1.0,3.0,1.0,300.0
3,chr10,75022076,1.0,1.0,1.0,300.0
4,chr14,22902057,1.0,3.0,1.0,300.0
...,...,...,...,...,...,...
2815,0,0,0.0,0.0,0.0,0.0
2816,0,0,0.0,0.0,0.0,0.0
2817,0,0,0.0,0.0,0.0,0.0
2818,0,0,0.0,0.0,0.0,0.0


In [18]:
from bioinfokit import analys, visuz


ModuleNotFoundError: No module named 'bioinfokit'

In [47]:
chrom = 'chr2'
pos = 25161587
str_data = str_data_cache[chrom]
example_data = str_data[str_data.position == pos].iloc[0]
data = example_data['diffs_EUR'].strip(' []').split(',')
data = [int(x)/int(example_data.motif_len) for x in data]
print(example_data.motif_len)
np.histogram([int(x) for x in data], bins=np.arange(-5,13), density = False)

3


(array([   0,    0,    0,    0,    0, 1180,    0,    0,   86,    0,    0,
           0,    0,    0,    0,    0,    0]),
 array([-5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,
        12]))

In [7]:
interesting_loci


[('chr22', 41958483, 'ENST00000669957.1_cds_0_0_chr22_41958217_f'),
 ('chr10', 21516537, 'ENST00000449193.7_cds_0_0_chr10_21515096_r'),
 ('chr10', 21516537, 'ENST00000444772.3_cds_0_0_chr10_21515096_r'),
 ('chr1', 35370335, 'ENST00000314607.11_cds_6_0_chr1_35370372_f'),
 ('chr3', 40462029, 'ENST00000338970.10_cds_5_0_chr3_40461939_f'),
 ('chr13', 71866526, 'ENST00000611519.4_cds_7_0_chr13_71865922_r'),
 ('chr1', 154869723, 'ENST00000271915.9_cds_7_0_chr1_154869032_r'),
 ('chr13', 99970408, 'ENST00000267294.5_cds_1_0_chr13_99970127_r'),
 ('chr12', 111598950, 'ENST00000550104.5_cds_24_0_chr12_111598784_r'),
 ('chr12', 111598950, 'ENST00000389153.10_cds_23_0_chr12_111598784_r'),
 ('chr2', 25161587, 'ENST00000264708.7_cds_0_0_chr2_25161081_r'),
 ('chr2', 25161587, 'ENST00000449220.1_cds_0_0_chr2_25161146_r'),
 ('chr2', 25161574, 'ENST00000449220.1_cds_0_0_chr2_25161146_r'),
 ('chr2', 25161574, 'ENST00000264708.7_cds_0_0_chr2_25161081_r'),
 ('chr12', 6936717, 'ENST00000356654.8_cds_4_0_chr1

In [35]:
coding_regions = pd.read_csv("/gymreklab-tscc/helia/ensembl/experiments/coding_regions/mapping.txt", sep = "\t", header=None)

In [41]:
coding_regions[1] = pd.to_numeric(coding_regions[1])
coding_regions[(coding_regions[0] == "chr17") & (coding_regions[1] > 51631583) & (coding_regions[1] < 51636583)]

Unnamed: 0,0,1,2,3,4,5,6,7,8
