In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc='My bar!')
from collections import defaultdict

  from pandas import Panel


In [2]:
######## Loading gene information ########
genes = pd.read_csv("Homo_sapiens.GRCh38.108.gtf", sep = "\t", comment="#", header=None)
genes[0] = "chr" + genes[0].astype(str)


def find_genes(row):
    gene_df = genes[(genes[0] == row['CHROM']) & (genes[3] <= row['POS']) & 
                    (genes[4] >= row['POS'])]
    if len(gene_df) == 0:
        return None
    gene_df_info = gene_df[8].str.split(";")
    gene_set = set()
    for info in list(gene_df_info):
        for field in info:
            if "gene_name" in field:
                name = field.strip().replace("gene_name ","").replace('"','')
                gene_set.add(name)
    if len(gene_set) == 0:
        return None
                
    return ",".join(list(gene_set))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [69]:
######## Loading initial set of expansions ########
all_expansions = pd.DataFrame(columns = ['CHROM', 'POS', 'PERIOD', 'MOTIF', 'outlier_threshold', 'AFR_freq',
       'NON_AFR_freq'])
for i in range(1,23):
    expansions = pd.read_csv(f"{i}_expansions.csv")
    expansions = expansions[(expansions['AFR_freq'] > 10 * expansions['NON_AFR_freq']) | 
                            (expansions['NON_AFR_freq'] > 10 * expansions['AFR_freq'])]
    all_expansions = pd.concat([expansions, all_expansions])

all_expansions = all_expansions[all_expansions['outlier_threshold'] > 10]    
all_expansions['gene'] = all_expansions.apply(lambda row: find_genes(row), axis = 1)

In [21]:
######## Loading samples and populations ########
pedigree = pd.read_csv("/expanse/projects/gymreklab/helia/TR_1000G/1000G.ped", delim_whitespace=True)
pedigree = pedigree[['SampleID','Superpopulation', 'Population']]

samp_to_subpop = pd.Series(pedigree[pedigree['Superpopulation'] == "AFR"].Population.values,index=pedigree[pedigree['Superpopulation'] == "AFR"].SampleID).to_dict()
samp_to_pop = pd.Series(pedigree.Superpopulation.values,index=pedigree.SampleID).to_dict()
H3Africa_names = pd.read_csv("/expanse/projects/gymreklab/helia/H3Africa/names/H3A_Baylor_sample_country.txt", header=None, delim_whitespace=True)

for index,row in H3Africa_names.iterrows():
    samp_to_pop[row[0]] = "H3Africa"
    samp_to_subpop[row[0]] = row[1]
    
african_pop = set(list(samp_to_subpop.values()))

In [99]:
######## Finding Allele frequency for H3Africa and 1KG African populations separately ########
def find_freqs(row):
    chrom = row['CHROM'].replace("chr", "")
    header = []
    with open(f"/expanse/projects/gymreklab/helia/ensembl/experiments/charact/diff_dist/diff/{chrom}_diff.txt") as f:
        for line in f:
            if line.startswith("chr"):
                    f.close()
                    break
            header.append(line.strip())

    addr=f"/expanse/projects/gymreklab/helia/ensembl/experiments/charact/diff_dist/diff/{chrom}_diff.txt"

    pos = row['POS']
    x=!grep -w $pos $addr
    call = ""
    if len(x) > 1:
        for l in x:
            if row['MOTIF'] in l:
                call = l
                break
    else:
        call = x[0]
    gbs = call.split("\t")[4:]
    dict_ = dict(zip(header, gbs))
    
    for x in dict_:
        dict_[x] = dict_[x].split("/")
        modified_gbs = []
        for s in dict_[x]:
            if s != "." and s != "":
                modified_gbs.append(int(s)/ row['PERIOD'])
                
        dict_[x] = modified_gbs
        
        
        
    pop_freqs = {}
    for pop in ['AFR', 'H3Africa']:
        gbs = []
        for samp in dict_:
            if samp in samp_to_pop:
                if samp_to_pop[samp] == pop:
                    gbs.extend(dict_[samp])
                
        if len(gbs) == 0:
            pop_freqs[pop] = -1
        else:
            above_threshold = round(len([x for x in gbs if x not in [".",""] \
                       and x > row['outlier_threshold']]) / len([x for x in gbs if x not in [".",""]]),3)
            pop_freqs[pop] = above_threshold
    return pop_freqs

all_expansions[['AFR_freqs', 
                        'H3Africa_freqs']] = all_expansions.apply(lambda row: find_freqs(row), 
                                                                      axis = 1, result_type="expand")

In [100]:
######## Filtering expansions where one of the 1KG or H3Africa frequency is not > 0.01 ########
all_expansions_both = all_expansions[(all_expansions['AFR_freq'] < 0.01) | 
                                     ((all_expansions['AFR_freqs'] > 0.01) & 
                                     (all_expansions['H3Africa_freqs'] > 0.01))]
all_expansions_both['outlier_threshold'] = all_expansions_both['outlier_threshold'].round(1)
all_expansions_both['NON_AFR_freq'] = all_expansions_both['NON_AFR_freq'].round(3)
all_expansions_both['AFR_freq'] = all_expansions_both['AFR_freq'].round(3)
all_expansions_both.columns = ['CHROM','POS','PERIOD','MOTIF','OUTLIER_THRESH',
                               'ALL_AFR_FREQ','NON_AFR_FREQ','GENE','1KG_AFR_FREQ','H3Africa_FREQ']
all_expansions_both = all_expansions_both[all_expansions_both['OUTLIER_THRESH'] > 10]
all_expansions_both

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_expansions_both['outlier_threshold'] = all_expansions_both['outlier_threshold'].round(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_expansions_both['NON_AFR_freq'] = all_expansions_both['NON_AFR_freq'].round(3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_expansions_both['AFR_freq

Unnamed: 0,CHROM,POS,PERIOD,MOTIF,OUTLIER_THRESH,ALL_AFR_FREQ,NON_AFR_FREQ,GENE,1KG_AFR_FREQ,H3Africa_FREQ
5,chr22,24042817,2,AC,16.0,0.033,0.001,CABIN1,0.036,0.024
9,chr22,24757660,2,AC,16.0,0.013,0.001,PIWIL3,0.013,0.011
11,chr22,26667333,2,AC,11.0,0.081,0.001,MIAT,0.078,0.089
30,chr22,39686600,2,AT,11.0,0.003,0.037,CACNA1I,0.003,0.001
34,chr22,41958483,2,AG,12.0,0.070,0.002,SMIM45,0.076,0.055
...,...,...,...,...,...,...,...,...,...,...
291,chr1,227297941,2,AC,12.0,0.028,0.001,CDC42BPA,0.027,0.029
292,chr1,227297968,2,AC,12.0,0.031,0.001,CDC42BPA,0.030,0.032
311,chr1,235882040,2,AC,12.0,0.027,0.000,LYST,0.027,0.029
325,chr1,244553603,2,AC,13.0,0.044,0.001,CATSPERE,0.040,0.056


In [101]:
#### Compare with Ibra's results on ExpansionHunter Denovo and STRetch ####
EH = pd.read_csv("/expanse/projects/gymreklab/helia/ensembl/experiments/charact/diff_dist/STRetch-EHdn/EHdn/loci_detected_by_EHdn.csv")
stretch_all_controls = pd.read_csv("/expanse/projects/gymreklab/helia/ensembl/experiments/charact/diff_dist/STRetch-EHdn/new_STRetch_results/TRs_detected_by_STRetch.csv")
stretch_no_controls = pd.read_csv("/expanse/projects/gymreklab/helia/ensembl/experiments/charact/diff_dist/STRetch-EHdn/new_STRetch_results/TRs_detected_by_STRetch_no_controls.csv")

stretch_all_controls.columns = ['chr', 'start', 'end', 'motif', 'gene', 'dist']
stretch_no_controls.columns = ['chr', 'start', 'end', 'motif', 'gene', 'dist']

def support(row, method):
    if method == "EH":
        compare_df = EH.copy()
    elif method == "stretch_control":
        compare_df = stretch_all_controls.copy()
    elif method == "stretch_no_control":
        compare_df = stretch_no_controls.copy()
        
    df = compare_df[(compare_df['chr'] == row['CHROM']) & 
                    (compare_df['start'] >= row['POS'] - 1000 ) & (compare_df['start'] < row['POS'] + 1000)]
    if len(df) == 0:
        return "N"
    if len(df) > 1:
        cnt = 0
        for index, row1 in df.iterrows():
            if row1['motif'] == row['MOTIF']:
                cnt += 1
        if cnt == 1:
            return "Y"
        else:
            print(f"Multiple records at locus {row['POS']}")
            display(df)
            return "N"
    if len(df) == 1 and str(df['motif'].iloc[0]) == str(row['MOTIF']):
        return "Y"
    if len(df) == 1 and str(df['motif'].iloc[0]) != str(row['MOTIF']):
        print(f"Different motif at locus {row['POS']}")
        return "N"
    else:
        display(df)


all_expansions_both['EHDenovo_supported'] = all_expansions_both.apply(lambda x: support(x, "EH"), axis = 1)
all_expansions_both['STRetch_controls_supported'] = all_expansions_both.apply(lambda x: support(x, "stretch_control"), axis = 1)
all_expansions_both['STRetch_no_controls_supported'] = all_expansions_both.apply(lambda x: support(x, "stretch_no_control"), axis = 1)
all_expansions_both.sort_values('OUTLIER_THRESH', ascending=False).to_csv('Large_Expansions.csv', index = False)

Different motif at locus 92185591
Different motif at locus 92185591


In [2]:
all_expansions_both = pd.read_csv('Large_Expansions.csv')
both_supported = all_expansions_both[(all_expansions_both['EHDenovo_supported'] == "Y") & (all_expansions_both['STRetch_no_controls_supported'] == "Y")]
print(f"Supported by both EH and STRetch at {len(both_supported)} loci.")

at_least_one_supported = all_expansions_both[(all_expansions_both['EHDenovo_supported'] == "Y") | (all_expansions_both['STRetch_no_controls_supported'] == "Y")]
print(f"Supported by at least one of the EH or STRetch at {len(at_least_one_supported)} loci.")

Supported by both EH and STRetch at 5 loci.
Supported by at least one of the EH or STRetch at 11 loci.


In [6]:
print(f"#Expansions in African samples: {len(all_expansions_both[all_expansions_both['ALL_AFR_FREQ'] > all_expansions_both['NON_AFR_FREQ']])}")



#Expansions in African samples: 198


In [7]:
all_expansions_both

Unnamed: 0,CHROM,POS,PERIOD,MOTIF,OUTLIER_THRESH,ALL_AFR_FREQ,NON_AFR_FREQ,GENE,1KG_AFR_FREQ,H3Africa_FREQ,EHDenovo_supported,STRetch_controls_supported,STRetch_no_controls_supported
0,chr17,51831667,3,AGC,65.0,0.016,0.001,CA10,0.014,0.022,Y,Y,Y
1,chr1,77887912,3,AAG,39.7,0.140,0.012,NEXN-AS1,0.141,0.149,Y,Y,Y
2,chr18,2956311,2,AC,32.0,0.015,0.000,LPIN2,0.013,0.019,N,N,N
3,chr13,49441221,2,AT,32.0,0.001,0.013,CAB39L,0.001,0.000,N,N,N
4,chr1,20599617,5,AAAAT,24.0,0.039,0.001,CDA,0.035,0.047,N,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,chr4,25990454,2,AC,11.0,0.073,0.002,,0.076,0.066,N,N,N
260,chr1,217003554,6,AATATT,10.7,0.017,0.000,ESRRG,0.017,0.017,N,N,N
261,chr1,31218013,4,AAAG,10.5,0.029,0.000,NKAIN1,0.034,0.017,Y,N,N
262,chr15,34375564,2,AT,10.5,0.002,0.037,,0.002,0.000,N,N,N
