In [1]:
import subprocess
import sys
import os
import shutil
import pandas as pd
import numpy as np

In [2]:
def shell_do(command, log=False, return_log=False):
    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    res=subprocess.run(command.split(), stdout=subprocess.PIPE)

    if log:
        print(res.stdout.decode('utf-8'))
    if return_log:
        return(res.stdout.decode('utf-8'))

In [3]:
#set paths
basedir = '/data/songy4/twas'
datadir = f'{basedir}/data_folder'
fusiondir = f'{basedir}/fusion_twas'
sumstat_path = f'{datadir}/meta.txt'
top_dir = f'{basedir}/output/top'
anal_dir = f'{basedir}/output/analysis'
fusion_ldref_basename = f'{fusiondir}/LDREF/1000G.EUR'
fusion_post_script = f'{fusiondir}/FUSION.post_process.R'

!mkdir --parents output/top
!mkdir --parents output/analysis

!for i in {1..22}; do mkdir --parents output/analysis/PD.$i; done;

In [4]:
#check shape of .dat files
gene_ls = []
for i in range(1, 23):
    pd_i = pd.read_csv(f"{basedir}/output/pd/PD.{i}.dat", sep='\t', index_col=False)
    print('number of rows in PD.' + str(i) + '.dat:', pd_i.shape[0])
    gene_ls.append(pd_i.shape[0])
print('number of rows in dat files', gene_ls)
##row numbers are number of genes per chromosom

number of rows in PD.1.dat: 1036
number of rows in PD.2.dat: 667
number of rows in PD.3.dat: 567
number of rows in PD.4.dat: 411
number of rows in PD.5.dat: 465
number of rows in PD.6.dat: 467
number of rows in PD.7.dat: 498
number of rows in PD.8.dat: 341
number of rows in PD.9.dat: 386
number of rows in PD.10.dat: 389
number of rows in PD.11.dat: 575
number of rows in PD.12.dat: 504
number of rows in PD.13.dat: 178
number of rows in PD.14.dat: 301
number of rows in PD.15.dat: 320
number of rows in PD.16.dat: 406
number of rows in PD.17.dat: 520
number of rows in PD.18.dat: 147
number of rows in PD.19.dat: 692
number of rows in PD.20.dat: 250
number of rows in PD.21.dat: 112
number of rows in PD.22.dat: 246
number of rows in dat files [1036, 667, 567, 411, 465, 467, 498, 341, 386, 389, 575, 504, 178, 301, 320, 406, 520, 147, 692, 250, 112, 246]


In [5]:
#check null value in TWAS.P in each dat file
for i in range(1,23):
    pd_i = pd.read_csv(f"{basedir}/output/pd/PD.{i}.dat", sep='\t', index_col=False)
    print('Number of null value in TWAS.P in pd_' + str(i) + ':', pd_i['TWAS.P'].isnull().sum())

Number of null value in TWAS.P in pd_1: 0
Number of null value in TWAS.P in pd_2: 0
Number of null value in TWAS.P in pd_3: 0
Number of null value in TWAS.P in pd_4: 0
Number of null value in TWAS.P in pd_5: 0
Number of null value in TWAS.P in pd_6: 0
Number of null value in TWAS.P in pd_7: 0
Number of null value in TWAS.P in pd_8: 0
Number of null value in TWAS.P in pd_9: 0
Number of null value in TWAS.P in pd_10: 0
Number of null value in TWAS.P in pd_11: 0
Number of null value in TWAS.P in pd_12: 0
Number of null value in TWAS.P in pd_13: 0
Number of null value in TWAS.P in pd_14: 0
Number of null value in TWAS.P in pd_15: 0
Number of null value in TWAS.P in pd_16: 0
Number of null value in TWAS.P in pd_17: 0
Number of null value in TWAS.P in pd_18: 0
Number of null value in TWAS.P in pd_19: 0
Number of null value in TWAS.P in pd_20: 0
Number of null value in TWAS.P in pd_21: 0
Number of null value in TWAS.P in pd_22: 0


In [6]:
#from each .dat file in pd, grab rows where TWAS.P column has smaller value than 0.05/number of rows and save them as .top file in top folder
for i in range(1,23):
    pd_i = pd.read_csv(f"{basedir}/output/pd/PD.{i}.dat", sep='\t', index_col=False)
    pd_i['TWAS.P'] = pd.to_numeric(pd_i['TWAS.P'], errors='coerce')
    pd_i['PANEL'] = pd_i['PANEL'].astype(str).str[3].fillna('NA')
    value_i = 0.05/pd_i.shape[0]
    df_i = pd.DataFrame(columns = pd_i.columns.tolist())
    for index, row in pd_i.iterrows():
        if row['TWAS.P'] < value_i:
            df_i.loc[index] = row
    df_i.to_csv(f'./output/top/PD.{i}.top', sep='\t' ,index=False)

In [39]:
#use row numbers (number of genes which is ID) to divide 0.05  ---> this is manula way

#!cat ./output/pd/PD.1.dat | awk 'NR == 1 || $NF < 0.05/1036' > ./output/top/PD.1.top
#!cat ./output/pd/PD.2.dat | awk 'NR == 1 || $NF < 0.05/667' > ./output/top/PD.2.top
#!cat ./output/pd/PD.3.dat | awk 'NR == 1 || $NF < 0.05/567' > ./output/top/PD.3.top
#!cat ./output/pd/PD.4.dat | awk 'NR == 1 || $NF < 0.05/411' > ./output/top/PD.4.top
#!cat ./output/pd/PD.5.dat | awk 'NR == 1 || $NF < 0.05/465' > ./output/top/PD.5.top
#!cat ./output/pd/PD.6.dat | awk 'NR == 1 || $NF < 0.05/467' > ./output/top/PD.6.top
#!cat ./output/pd/PD.7.dat | awk 'NR == 1 || $NF < 0.05/498' > ./output/top/PD.7.top
#!cat ./output/pd/PD.8.dat | awk 'NR == 1 || $NF < 0.05/341' > ./output/top/PD.8.top
#!cat ./output/pd/PD.9.dat | awk 'NR == 1 || $NF < 0.05/386' > ./output/top/PD.9.top
#!cat ./output/pd/PD.10.dat | awk 'NR == 1 || $NF < 0.05/389' > ./output/top/PD.10.top
#!cat ./output/pd/PD.11.dat | awk 'NR == 1 || $NF < 0.05/575' > ./output/top/PD.11.top
#!cat ./output/pd/PD.12.dat | awk 'NR == 1 || $NF < 0.05/504' > ./output/top/PD.12.top
#!cat ./output/pd/PD.13.dat | awk 'NR == 1 || $NF < 0.05/178' > ./output/top/PD.13.top
#!cat ./output/pd/PD.14.dat | awk 'NR == 1 || $NF < 0.05/301' > ./output/top/PD.14.top
#!cat ./output/pd/PD.15.dat | awk 'NR == 1 || $NF < 0.05/320' > ./output/top/PD.15.top
#!cat ./output/pd/PD.16.dat | awk 'NR == 1 || $NF < 0.05/406' > ./output/top/PD.16.top
#!cat ./output/pd/PD.17.dat | awk 'NR == 1 || $NF < 0.05/520' > ./output/top/PD.17.top
#!cat ./output/pd/PD.18.dat | awk 'NR == 1 || $NF < 0.05/147' > ./output/top/PD.18.top
#!cat ./output/pd/PD.19.dat | awk 'NR == 1 || $NF < 0.05/692' > ./output/top/PD.19.top
#!cat ./output/pd/PD.20.dat | awk 'NR == 1 || $NF < 0.05/250' > ./output/top/PD.20.top
#!cat ./output/pd/PD.21.dat | awk 'NR == 1 || $NF < 0.05/112' > ./output/top/PD.21.top
#!cat ./output/pd/PD.22.dat | awk 'NR == 1 || $NF < 0.05/246' > ./output/top/PD.22.top

In [8]:
#fusion post process 
for i in range(1, 23):
    anal_dir = f'output/analysis'
    top_dir = f'output/top'
    fusion_post_cmd_i = f'\
    Rscript {fusion_post_script} \
    --sumstats {sumstat_path} \
    --input {top_dir}/PD.{i}.top \
    --out {anal_dir}/PD.{i}/PD.{i}.top.analysis \
    --ref_ld_chr {fusion_ldref_basename}. \
    --chr {i} \
    --plot --locus_win 100000'
    shell_do(fusion_post_cmd_i)

Executing: Rscript /data/songy4/twas/fusion_twas/FUSION.post_process.R --sumstats /data/songy4/twas/data_folder/meta.txt --input output/top/PD.1.top --out output/analysis/PD.1/PD.1.top.analysis --ref_ld_chr /data/songy4/twas/fusion_twas/LDREF/1000G.EUR. --chr 1 --plot --locus_win 100000
Executing: Rscript /data/songy4/twas/fusion_twas/FUSION.post_process.R --sumstats /data/songy4/twas/data_folder/meta.txt --input output/top/PD.2.top --out output/analysis/PD.2/PD.2.top.analysis --ref_ld_chr /data/songy4/twas/fusion_twas/LDREF/1000G.EUR. --chr 2 --plot --locus_win 100000
Executing: Rscript /data/songy4/twas/fusion_twas/FUSION.post_process.R --sumstats /data/songy4/twas/data_folder/meta.txt --input output/top/PD.3.top --out output/analysis/PD.3/PD.3.top.analysis --ref_ld_chr /data/songy4/twas/fusion_twas/LDREF/1000G.EUR. --chr 3 --plot --locus_win 100000
Executing: Rscript /data/songy4/twas/fusion_twas/FUSION.post_process.R --sumstats /data/songy4/twas/data_folder/meta.txt --input output/

In [4]:
#grab all gene ID from to.analysis.joint_included.dat files
import glob

def read_id(file):
    return pd.read_csv(file, delim_whitespace=1, usecols=[1])

files = glob.glob(f'{anal_dir}/**/*.joint_included.dat')
    
case_df = pd.concat([read_id(file) for file in files], axis=0)

case_df.to_csv(r'./data_folder/twas_genes_case.csv',index=False)

In [5]:
case_df

Unnamed: 0,ID
0,ENSG00000187010
1,ENSG00000036549
2,ENSG00000117500
3,ENSG00000143537
4,ENSG00000160783
...,...
0,ENSG00000136235
1,ENSG00000105854
2,ENSG00000146834
0,ENSG00000164733


In [5]:
#compare twas_genes_case.csv, twas_genes_control.csv and gwas_genes.csv
gwas = pd.read_csv(r"./data_folder/gwas_genes.csv", sep=' ')
twas_case = pd.read_csv(r"./data_folder/twas_genes_case.csv", sep=' ')
twas_cont = pd.read_csv(r"./data_folder/twas_genes_control.csv", sep=' ')

print('shape of gwas:', gwas.shape)
print('shape of twas cases:', twas_case.shape)
print('shape of twas control:', twas_cont.shape)

shape of gwas: (439, 1)
shape of twas cases: (100, 1)
shape of twas control: (87, 1)


In [6]:
#TWAS case & GWAS significant hits
twas_case_gwas = twas_case[twas_case['ID'].isin(gwas['ID'])].reset_index(drop=True)
print("number of TWAS case genes in GWAS genes:", twas_case_gwas.shape[0])

#TWAS case & GWAS significant hits
twas_cont_gwas = twas_cont[twas_cont['ID'].isin(gwas['ID'])].reset_index(drop=True)
print("number of TWAS control genes in GWAS genes:", twas_cont_gwas.shape[0])

#TWAS case & GWAS significant hits
twas_case_not_gwas = twas_case[~twas_case['ID'].isin(gwas['ID'])].reset_index(drop=True)
print("number of TWAS case genes not in GWAS genes:", twas_case_not_gwas.shape[0])

#TWAS case & GWAS significant hits
twas_cont_not_gwas = twas_cont[~twas_cont['ID'].isin(gwas['ID'])].reset_index(drop=True)
print("number of TWAS control genes not in GWAS genes:", twas_cont_not_gwas.shape[0])

number of TWAS case genes in GWAS genes: 40
number of TWAS control genes in GWAS genes: 30
number of TWAS case genes not in GWAS genes: 60
number of TWAS control genes not in GWAS genes: 57


In [7]:
#grab all gene ID from to.analysis.joint_included.dat files
import glob

def read_id(file):
    return pd.read_csv(file, delim_whitespace=1)

files = glob.glob(f'{top_dir}/*.top')
    
case_top = pd.concat([read_id(file) for file in files], axis=0).reset_index(drop=True)
case_top.drop(case_top.iloc[:,0:2], axis=1, inplace=True)
twas_case_top = case_top.merge(twas_case, on='ID', how='inner').reset_index(drop=True)
twas_case_top
#twas_case_top.to_csv(r'./data_folder/twas_case_top_all.csv',index=False)

Unnamed: 0,ID,CHR,P0,P1,HSQ,BEST.GWAS.ID,BEST.GWAS.Z,EQTL.ID,EQTL.R2,EQTL.Z,EQTL.GWAS.Z,NSNP,NWGT,MODEL,MODELCV.R2,MODELCV.PV,TWAS.Z,TWAS.P
0,ENSG00000187010,1,25598884,25656936,0.7739,rs35589882,-4.46,rs3091242,0.366000,20.48,-3.97938,391,14,lasso,0.750,0.000000e+00,-5.02276,5.090000e-07
1,ENSG00000036549,1,78028101,78149104,0.0435,rs9787306,4.79,rs6678622,0.024100,5.61,-2.86735,477,6,lasso,0.030,2.100000e-09,-4.31169,1.620000e-05
2,ENSG00000117500,1,93615299,93646285,0.0461,rs1474927,4.86,rs1060622,0.055900,-9.08,4.58586,279,6,lasso,0.078,4.000000e-22,-4.52440,6.060000e-06
3,ENSG00000143537,1,155023042,155035252,0.4704,rs12726330,15.55,rs35902694,0.160000,13.54,6.41667,379,15,lasso,0.350,9.400000e-108,5.77050,7.900000e-09
4,ENSG00000160783,1,156182784,156212874,0.0522,rs34372695,11.60,rs2758603,-0.000850,-3.95,-3.87879,393,38,enet,0.014,3.500000e-05,5.64360,1.670000e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,ENSG00000136235,7,23275586,23314727,0.0697,rs199347,-10.49,rs1637193,0.115000,-11.82,-10.37113,430,1,top1,0.110,3.900000e-32,10.37113,3.360000e-25
96,ENSG00000105854,7,95034175,95064510,0.2278,rs7778623,-4.76,rs730365,0.193000,15.07,3.60839,475,7,lasso,0.210,1.100000e-59,4.22825,2.360000e-05
97,ENSG00000146834,7,100026413,100031741,0.1507,rs11769700,-4.61,rs6975729,0.220000,-16.01,-4.34454,317,1,top1,0.220,1.200000e-63,4.34454,1.400000e-05
98,ENSG00000164733,8,11700033,11726957,0.3102,rs1293298,-8.16,rs1692821,0.181854,14.77,-7.91667,441,56,enet,0.250,1.500000e-73,-8.55380,1.190000e-17


In [8]:
##grab significant GWAS hits (P-value threshold to 0.05/N-tests per datatype)
sig_twas_case_top = twas_case_top[twas_case_top['TWAS.P']< 2.7e-6]
#reset index
sig_twas_case_top = sig_twas_case_top.reset_index(drop=True)
print("shape of significant TWAS data:", sig_twas_case_top.shape)

sig_twas_case_top.head()

shape of significant TWAS data: (56, 18)


Unnamed: 0,ID,CHR,P0,P1,HSQ,BEST.GWAS.ID,BEST.GWAS.Z,EQTL.ID,EQTL.R2,EQTL.Z,EQTL.GWAS.Z,NSNP,NWGT,MODEL,MODELCV.R2,MODELCV.PV,TWAS.Z,TWAS.P
0,ENSG00000187010,1,25598884,25656936,0.7739,rs35589882,-4.46,rs3091242,0.366,20.48,-3.97938,391,14,lasso,0.75,0.0,-5.02276,5.09e-07
1,ENSG00000143537,1,155023042,155035252,0.4704,rs12726330,15.55,rs35902694,0.16,13.54,6.41667,379,15,lasso,0.35,9.4e-108,5.7705,7.9e-09
2,ENSG00000160783,1,156182784,156212874,0.0522,rs34372695,11.6,rs2758603,-0.00085,-3.95,-3.87879,393,38,enet,0.014,3.5e-05,5.6436,1.67e-08
3,ENSG00000072694,1,161551101,161648444,0.8061,rs1801274,-6.79,rs7529425,0.182,14.45,-5.11888,515,120,enet,0.47,2.9000000000000002e-157,-4.95828,7.11e-07
4,ENSG00000117280,1,205737114,205744588,0.1237,rs823114,-11.31,rs7522056,0.079,-9.65,-8.0,404,22,enet,0.095,1.1999999999999999e-26,8.37656,5.4500000000000004e-17


In [9]:
from scipy import stats
#remove some columns
case_top = sig_twas_case_top[['ID', 'CHR', 'EQTL.ID', 'EQTL.Z', 'TWAS.Z', 'TWAS.P']]
#add EQTL.P
#case_top['EQTL.P'] = case_top.stats.norm.cdf(-Z)
#add GWAS.hit column 1 if ID is in twas_case_gwas 0 if not
case_top['GWAS.hit'] = case_top.ID.isin(twas_case_gwas.ID).astype(int)
#convert 0 to no and 1 to yes
case_top['GWAS.hit'] = case_top['GWAS.hit'].map({0: 'no', 1: 'yes'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_top['GWAS.hit'] = case_top.ID.isin(twas_case_gwas.ID).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_top['GWAS.hit'] = case_top['GWAS.hit'].map({0: 'no', 1: 'yes'})


In [10]:
#check data type for the dataframe
print("case_top data types: \n", case_top.dtypes)
#convert CHR from object data type to int
case_top['CHR'] = case_top['CHR'].astype(int)

case_top data types: 
 ID           object
CHR          object
EQTL.ID      object
EQTL.Z      float64
TWAS.Z      float64
TWAS.P      float64
GWAS.hit     object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_top['CHR'] = case_top['CHR'].astype(int)


In [11]:
#check data type for the dataframe
print("case_top data types: \n", case_top.dtypes)
#sort the rows by CHR order
case_top = case_top.sort_values('CHR').reset_index(drop=True)
case_top

case_top data types: 
 ID           object
CHR           int64
EQTL.ID      object
EQTL.Z      float64
TWAS.Z      float64
TWAS.P      float64
GWAS.hit     object
dtype: object


Unnamed: 0,ID,CHR,EQTL.ID,EQTL.Z,TWAS.Z,TWAS.P,GWAS.hit
0,ENSG00000187010,1,rs3091242,20.48,-5.02276,5.09e-07,no
1,ENSG00000143537,1,rs35902694,13.54,5.7705,7.9e-09,yes
2,ENSG00000160783,1,rs2758603,-3.95,5.6436,1.67e-08,no
3,ENSG00000072694,1,rs7529425,14.45,-4.95828,7.11e-07,yes
4,ENSG00000117280,1,rs7522056,-9.65,8.37656,5.4500000000000004e-17,yes
5,ENSG00000143772,1,rs10495249,7.95,-6.32427,2.54e-10,yes
6,ENSG00000198648,2,rs10176669,-7.44,-6.45565,1.08e-10,yes
7,ENSG00000152128,2,rs4954159,-14.38,7.01098,2.37e-12,yes
8,ENSG00000172005,2,rs4854241,6.72,-4.85679,1.19e-06,no
9,ENSG00000114030,3,rs17266816,13.46,5.92168,3.19e-09,yes


In [23]:
#check if GWAS.hit has 40 yes and 60 no
pd.value_counts(case_top['GWAS.hit'])

yes    24
no     13
Name: GWAS.hit, dtype: int64

In [12]:
#save twas_cont_top and cont_top --run only once
twas_case_top.to_csv(r'./data_folder/twas_case_top_all.csv',index=False)
sig_twas_case_top.to_csv(r'./data_folder/significant_twas_case_top_all.csv',index=False)
case_top.to_csv(r'./data_folder/twas_case_top.csv', index=False)

Make BESD files to get SMR

In [None]:
#####Dont merge with meta_bim, just calculate Freq, Effect, SE, P-value

In [None]:
sig_twas_case_top = sig_twas_case_top

In [13]:
import scipy.stats

#find p-value for two-tailed test
scipy.stats.norm.sf(abs(1.24))*2

Unnamed: 0,ID,CHR,P0,P1,HSQ,BEST.GWAS.ID,BEST.GWAS.Z,EQTL.ID,EQTL.R2,EQTL.Z,EQTL.GWAS.Z,NSNP,NWGT,MODEL,MODELCV.R2,MODELCV.PV,TWAS.Z,TWAS.P
0,ENSG00000187010,1,25598884,25656936,0.7739,rs35589882,-4.46,rs3091242,0.366,20.48,-3.97938,391,14,lasso,0.75,0.0,-5.02276,5.09e-07
1,ENSG00000143537,1,155023042,155035252,0.4704,rs12726330,15.55,rs35902694,0.16,13.54,6.41667,379,15,lasso,0.35,9.4e-108,5.7705,7.9e-09
2,ENSG00000160783,1,156182784,156212874,0.0522,rs34372695,11.6,rs2758603,-0.00085,-3.95,-3.87879,393,38,enet,0.014,3.5e-05,5.6436,1.67e-08
3,ENSG00000072694,1,161551101,161648444,0.8061,rs1801274,-6.79,rs7529425,0.182,14.45,-5.11888,515,120,enet,0.47,2.9000000000000002e-157,-4.95828,7.11e-07
4,ENSG00000117280,1,205737114,205744588,0.1237,rs823114,-11.31,rs7522056,0.079,-9.65,-8.0,404,22,enet,0.095,1.1999999999999999e-26,8.37656,5.4500000000000004e-17
5,ENSG00000143772,1,226819391,226927024,0.117,rs10495249,-7.98,rs10495249,0.037,7.95,-7.98077,502,6,lasso,0.078,4.4e-22,-6.32427,2.54e-10
6,ENSG00000128805,10,49654077,49864310,0.66193,rs10857614,-4.74,rs1822861,0.173928,14.15,-4.5161,616,49,enet,0.43,1.1e-140,-4.84086,1.29e-06
7,ENSG00000080854,11,133778459,133826880,0.088,rs329648,9.75,rs10791337,0.0298,-6.93,8.11085,555,29,enet,0.072,2.1e-20,-6.84183,7.82e-12
8,ENSG00000139351,12,102122426,102133250,0.0273,rs17032033,5.13,rs1544922,0.023,5.63,4.7241,484,1,top1,0.023,1.6e-07,4.72414,2.31e-06
9,ENSG00000255398,12,123199303,123201439,0.2238,rs11060180,-10.55,rs1798192,0.0886,-10.27,3.3895,305,20,enet,0.1,1.5e-28,-6.12308,9.18e-10


In [17]:
#merge pcs_df into covariate_df
sig_twas_case_meta = sig_twas_case_top.merge(meta_bim, left_on = "EQTL.ID", right_on= "rsid")
sig_twas_case_meta

Unnamed: 0,ID,CHR,P0,P1,HSQ,BEST.GWAS.ID,BEST.GWAS.Z,EQTL.ID,EQTL.R2,EQTL.Z,...,Freq1,Effect,StdErr,P-value,chr,rsid,kb,pos,a1,a2
0,ENSG00000187010,1,25598884,25656936,0.7739,rs35589882,-4.46,rs3091242,0.366,20.48,...,0.457,-0.0386,0.0097,6.383e-05,1,rs3091242,0,25674785,C,T
1,ENSG00000143537,1,155023042,155035252,0.4704,rs12726330,15.55,rs35902694,0.16,13.54,...,0.3824,0.0616,0.0096,1.586e-10,1,rs35902694,0,155042886,T,G
2,ENSG00000160783,1,156182784,156212874,0.0522,rs34372695,11.6,rs2758603,-0.00085,-3.95,...,0.3468,-0.0384,0.0099,0.0001051,1,rs2758603,0,156198994,C,T
3,ENSG00000072694,1,161551101,161648444,0.8061,rs1801274,-6.79,rs7529425,0.182,14.45,...,0.1331,-0.0732,0.0143,3.247e-07,1,rs7529425,0,161479599,A,G
4,ENSG00000117280,1,205737114,205744588,0.1237,rs823114,-11.31,rs7522056,0.079,-9.65,...,0.3128,-0.0808,0.0101,1.502e-15,1,rs7522056,0,205735891,A,G
5,ENSG00000143772,1,226819391,226927024,0.117,rs10495249,-7.98,rs10495249,0.037,7.95,...,0.2802,-0.083,0.0104,1.666e-15,1,rs10495249,0,226919119,G,A
6,ENSG00000128805,10,49654077,49864310,0.66193,rs10857614,-4.74,rs1822861,0.173928,14.15,...,0.4896,-0.042,0.0093,6.678e-06,10,rs1822861,0,49834326,T,G
7,ENSG00000139351,12,102122426,102133250,0.0273,rs17032033,5.13,rs1544922,0.023,5.63,...,0.1259,0.0685,0.0145,2.456e-06,12,rs1544922,0,102109893,C,T
8,ENSG00000255398,12,123199303,123201439,0.2238,rs11060180,-10.55,rs1798192,0.0886,-10.27,...,0.4285,0.0322,0.0095,0.0007096,12,rs1798192,0,123200768,T,G
9,ENSG00000130787,12,123319000,123347507,0.0334,rs11060180,-10.55,rs11060180,0.0178,-5.54,...,0.4388,-0.1034,0.0098,4.4479999999999997e-26,12,rs11060180,0,123303586,G,A


In [24]:
#make .esd file (Chr    SNP Bp  A1  A2  Freq    Beta    se  p)
sig_twas_case_esd = sig_twas_case_meta[['chr', 'rsid', 'pos', 'a1', 'a2', 'Freq1', 'Effect', 'StdErr', 'P-value']]
sig_twas_case_esd.rename(columns={'chr':'Chr', 'rsid':'SNP', 'pos':'Bp', 'a1':'A1', 'a2':'A2', 'Freq1':'Freq', 'Effect':'Beta', 'StdErr':'se', 'P-value':'p'}, inplace=True)
sig_twas_case_esd

Unnamed: 0,Chr,SNP,Bp,A1,A2,Freq,Beta,se,p
0,1,rs3091242,25674785,C,T,0.457,-0.0386,0.0097,6.383e-05
1,1,rs35902694,155042886,T,G,0.3824,0.0616,0.0096,1.586e-10
2,1,rs2758603,156198994,C,T,0.3468,-0.0384,0.0099,0.0001051
3,1,rs7529425,161479599,A,G,0.1331,-0.0732,0.0143,3.247e-07
4,1,rs7522056,205735891,A,G,0.3128,-0.0808,0.0101,1.502e-15
5,1,rs10495249,226919119,G,A,0.2802,-0.083,0.0104,1.666e-15
6,10,rs1822861,49834326,T,G,0.4896,-0.042,0.0093,6.678e-06
7,12,rs1544922,102109893,C,T,0.1259,0.0685,0.0145,2.456e-06
8,12,rs1798192,123200768,T,G,0.4285,0.0322,0.0095,0.0007096
9,12,rs11060180,123303586,G,A,0.4388,-0.1034,0.0098,4.4479999999999997e-26


In [25]:
#save sig_twas_case_esd
sig_twas_case_esd.to_csv(r'./data_folder/sig_twas_case_esd.esd', index=False)

In [26]:
#make .flist file (Chr    ProbeID GeneticDistance ProbeBp Gene    Orientation PathOfEsd)
sig_twas_case_flist = sig_twas_case_meta[['chr', 'ID', 'kb', 'pos']]
sig_twas_case_flist.rename(columns={'chr':'Chr', 'ID':'ProbeID', 'kb':'GeneticDistance', 'pos':'ProbeBp'}, inplace=True)
sig_twas_case_flist

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,Chr,ProbeID,GeneticDistance,ProbeBp
0,1,ENSG00000187010,0,25674785
1,1,ENSG00000143537,0,155042886
2,1,ENSG00000160783,0,156198994
3,1,ENSG00000072694,0,161479599
4,1,ENSG00000117280,0,205735891
5,1,ENSG00000143772,0,226919119
6,10,ENSG00000128805,0,49834326
7,12,ENSG00000139351,0,102109893
8,12,ENSG00000255398,0,123200768
9,12,ENSG00000130787,0,123303586


In [31]:
#merge pcs_df into covariate_df
sig_twas_case_gene = pd.read_csv(f"./data_folder/sig_twas_case_gene.csv")
sig_twas_case_flist = sig_twas_case_flist.merge(sig_twas_case_gene, left_on = "ProbeID", right_on= "ProbeID")
sig_twas_case_flist

Unnamed: 0,Chr,ProbeID,GeneticDistance,ProbeBp,Gene
0,1,ENSG00000187010,0,25674785,RHD
1,1,ENSG00000143537,0,155042886,ADAM15
2,1,ENSG00000160783,0,156198994,PMF1
3,1,ENSG00000072694,0,161479599,FCGR2B
4,1,ENSG00000117280,0,205735891,RAB29
5,1,ENSG00000143772,0,226919119,ITPKB
6,10,ENSG00000128805,0,49834326,ARHGAP22
7,12,ENSG00000139351,0,102109893,SYCP3
8,12,ENSG00000255398,0,123200768,HCAR3
9,12,ENSG00000130787,0,123303586,HIP1R


In [34]:
#add Orientation and PathOfEsd columns and add values
sig_twas_case_flist["Orientation"] = 'NA'
sig_twas_case_flist["PathOfEsd"] = '/data/songy4/twas/data_folder/sig_twas_case_esd.esd'
sig_twas_case_flist

Unnamed: 0,Chr,ProbeID,GeneticDistance,ProbeBp,Gene,Orientation,PathOfEsd
0,1,ENSG00000187010,0,25674785,RHD,,/data/songy4/twas/data_folder/sig_twas_case_es...
1,1,ENSG00000143537,0,155042886,ADAM15,,/data/songy4/twas/data_folder/sig_twas_case_es...
2,1,ENSG00000160783,0,156198994,PMF1,,/data/songy4/twas/data_folder/sig_twas_case_es...
3,1,ENSG00000072694,0,161479599,FCGR2B,,/data/songy4/twas/data_folder/sig_twas_case_es...
4,1,ENSG00000117280,0,205735891,RAB29,,/data/songy4/twas/data_folder/sig_twas_case_es...
5,1,ENSG00000143772,0,226919119,ITPKB,,/data/songy4/twas/data_folder/sig_twas_case_es...
6,10,ENSG00000128805,0,49834326,ARHGAP22,,/data/songy4/twas/data_folder/sig_twas_case_es...
7,12,ENSG00000139351,0,102109893,SYCP3,,/data/songy4/twas/data_folder/sig_twas_case_es...
8,12,ENSG00000255398,0,123200768,HCAR3,,/data/songy4/twas/data_folder/sig_twas_case_es...
9,12,ENSG00000130787,0,123303586,HIP1R,,/data/songy4/twas/data_folder/sig_twas_case_es...


In [35]:
#save sig_twas_case_flist
sig_twas_case_flist.to_csv(r'./data_folder/sig_twas_case_flist.flist', index=False)

In [39]:
#make BESD file

!smr --eqtl-flist /data/songy4/twas/data_folder/sig_twas_case_flist.flist --make-besd --out mybesd 

/bin/bash: smr: command not found
