In [1]:
import os 
import pandas as pd
import numpy as np 
import glob 
from scipy import stats
import scipy

Goal: collate hv/non-hv comparisons across tissue types into a p value matrix

In [2]:
#read in methylation data 
meth=pd.read_csv('/global/scratch/users/chandlersutherland/e16/cs_reports/nlr_meth.tsv', sep='\t', index_col=0)
meth

Unnamed: 0,Chrom,accession,rep,HV,name,CpG,CHH,CHG
0,chr1,B97,ERR5347668,0,Zm00018ab011400,3.026364,0.658504,1.216825
1,chr1,B97,B97,0,Zm00018ab011400,,4.444444,2.727273
2,chr1,B97,ERR5347668,0,Zm00018ab016280,7.629257,0.676756,11.631282
3,chr1,B97,B97,0,Zm00018ab016280,28.979167,1.359483,12.850529
4,chr1,B97,ERR5347668,0,Zm00018ab025360,37.089803,0.389884,0.118125
...,...,...,...,...,...,...,...,...
3266,chr2,Tzi8,ERR5347740,0,Zm00042ab093960,,0.000000,
3267,chr3,M162W,ERR5347710,0,Zm00033ab159360,,0.000000,
3268,chr10,HP301,ERR5347695,1,Zm00027ab410770,,0.000000,
3269,chr3,HP301,ERR5347695,0,Zm00027ab139300,,0.000000,


In [3]:
#define a function that performs a wilcox rank sum test between HV and non-hv 
def p_calc(accession_name, col_name, tbl):
    subset=tbl[tbl['accession']==accession_name]
    test=subset.groupby(['HV', 'name'])[col_name].mean().reset_index()
    #testing if methylation is different (two sided)
    p=scipy.stats.ranksums(test[test['HV']==0][col_name], test[test['HV']==1][col_name]).pvalue
    return p

In [39]:
#calculate on accessions 
meth_p=[]
accessions=meth['accession'].unique()
for accession in accessions:
   CpG_p=p_calc(accession, 'CpG', meth)
   CHH_p=p_calc(accession, 'CHH', meth)
   CHG_p=p_calc(accession, 'CHG', meth)
   meth_p.append({'accession':accession, 'CpG':CpG_p, 'CHH':CHH_p, 'CHG':CHG_p})

meth_p
meth_df=pd.DataFrame(meth_p)

In [4]:
#transform p values for false discovery 
def fdr(p_vals):
    from scipy.stats import rankdata
    ranked_p_values = rankdata(p_vals)
    fdr = p_vals * len(p_vals) / ranked_p_values
    fdr[fdr > 1] = 1
    return fdr

# meth_df['CpG_BH']=fdr(meth_df['CpG'])
# meth_df['CHH_BH']=fdr(meth_df['CHH'])
# meth_df['CHG_BH']=fdr(meth_df['CHG'])
# meth_df

Do per tissue type on expression data

In [5]:
tpm_paths=glob.glob('/global/scratch/users/chandlersutherland/e16/cs_reports/*_all_tissue.tsv')
tpm_paths[0]

test=pd.read_csv(tpm_paths[0], sep='\t', index_col=0)
test['tissue'].unique()

len(tpm_paths)
tpm_paths[0]

'/global/scratch/users/chandlersutherland/e16/cs_reports/KI3_all_tissue.tsv'

In [6]:
gene_table='/global/home/users/chandlersutherland/e16/Maize_NLRome_GeneTable.txt'
gene=pd.read_csv(gene_table,sep = '\t')
nlrs=gene['Gene'].str.split('_', expand=True).iloc[:,0].str.replace('ZM', 'Zm').str.replace('AB', 'ab').str.replace('EB', 'eb').unique()

nlr_tpm=test[test['name'].isin(nlrs)]
nlr_tpm

sub=gene.drop_duplicates(subset=['Gene'])
sub['name']=nlrs
sub=sub[['name', 'HV']]

hv_nlr=pd.merge(nlr_tpm, sub, on='name')
hv_nlr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,accession,rep,tissue,name,chrom,chromStart,chromEnd,strand,gene_length,stranded_1,TPM,log2(TPM),HV
0,KI3,ERR3791427,anther,Zm00029ab011250,chr1,36697975,36702496,-,4522,2,0.761814,0.817061,0
1,KI3,ERR3791428,anther,Zm00029ab011250,chr1,36697975,36702496,-,4522,3,1.288486,1.194394,0
2,KI3,ERR3791477,ear,Zm00029ab011250,chr1,36697975,36702496,-,4522,26,10.227955,3.489023,0
3,KI3,ERR3791478,ear,Zm00029ab011250,chr1,36697975,36702496,-,4522,28,11.206991,3.609636,0
4,KI3,ERR3791514,embryo,Zm00029ab011250,chr1,36697975,36702496,-,4522,7,3.872308,2.284605,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575,KI3,ERR3791757,root,Zm00029ab462360,scaf_973,10187,16361,-,6175,0,0.000000,0.000000,0
2576,KI3,ERR3791805,shoot,Zm00029ab462360,scaf_973,10187,16361,-,6175,0,0.000000,0.000000,0
2577,KI3,ERR3791806,shoot,Zm00029ab462360,scaf_973,10187,16361,-,6175,0,0.000000,0.000000,0
2578,KI3,ERR3791857,tassel,Zm00029ab462360,scaf_973,10187,16361,-,6175,0,0.000000,0.000000,0


In [7]:
#average TPM between bioreps 
averaged=hv_nlr.groupby(['accession', 'HV', 'name', 'tissue']).aggregate({'log2(TPM)' : 'mean'}).reset_index()
averaged

#re-write p value function to take into account tissue 
def p_calc_t(tissue_name, col_name, tbl):
    subset=tbl[tbl['tissue']==tissue_name]
    test=subset.groupby(['HV', 'name'])[col_name].mean().reset_index()
    #test
    p=scipy.stats.ranksums(test[test['HV']==0][col_name], test[test['HV']==1][col_name]).pvalue
    return p

p_calc_t('shoot', 'log2(TPM)', hv_nlr)

exp_p=[]
tissues=hv_nlr['tissue'].unique()
for tissue in tissues:
    p=p_calc_t(tissue, 'log2(TPM)', hv_nlr)
    exp_p.append({'accession':tissue, 'tip':p})

exp_p
exp_df=pd.DataFrame(exp_p)
exp_df 

Unnamed: 0,accession,tip
0,anther,0.933411
1,ear,0.630918
2,embryo,0.195291
3,endosperm,0.280482
4,base,0.727734
5,middle,0.533171
6,tip,0.802076
7,root,0.958352
8,shoot,0.735592
9,tassel,0.872767


In [8]:
gene_table='/global/home/users/chandlersutherland/e16/Maize_NLRome_GeneTable.txt'
gene=pd.read_csv(gene_table,sep = '\t')
nlrs=gene['Gene'].str.split('_', expand=True).iloc[:,0].str.replace('ZM', 'Zm').str.replace('AB', 'ab').str.replace('EB', 'eb').unique()

#write a function to import and subset to nlrs 
def importer(path):
    test=pd.read_csv(path, sep='\t', index_col=0)
    nlr_tpm=test[test['name'].isin(nlrs)]

    sub=gene.drop_duplicates(subset=['Gene'])
    sub['name']=nlrs
    sub=sub[['name', 'HV']]

    hv_nlr=pd.merge(nlr_tpm, sub, on='name')
    return(hv_nlr)

In [9]:
exp_p=[]
for i in tpm_paths: 
    nlr_only=importer(i)
    tissues=nlr_only['tissue'].unique()
    accession=nlr_only['accession'][0]
    nlr_only.to_csv('/global/scratch/users/chandlersutherland/e16/cs_reports/'+accession+'_nlr_tpm_tissue.csv')
    for tissue in tissues:
        p=p_calc_t(tissue, 'log2(TPM)', nlr_only)
        exp_p.append({'accession':accession, 'tissue':tissue, 'p':p})

exp_p
exp_df=pd.DataFrame(exp_p)
exp_df 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,accession,tissue,p
0,KI3,anther,0.933411
1,KI3,ear,0.630918
2,KI3,embryo,0.195291
3,KI3,endosperm,0.280482
4,KI3,base,0.727734
...,...,...,...
246,CML277,middle,0.339575
247,CML277,tip,0.259380
248,CML277,root,0.778799
249,CML277,shoot,0.798757


In [10]:
#fdr adjust the p values 
exp_df['p_BH']=exp_df.groupby('accession').p.apply(fdr)
exp_df

Unnamed: 0,accession,tissue,p,p_BH
0,KI3,anther,0.933411,1.000000
1,KI3,ear,0.630918,1.000000
2,KI3,embryo,0.195291,1.000000
3,KI3,endosperm,0.280482,1.000000
4,KI3,base,0.727734,1.000000
...,...,...,...,...
246,CML277,middle,0.339575,0.905533
247,CML277,tip,0.259380,1.000000
248,CML277,root,0.778799,0.890057
249,CML277,shoot,0.798757,0.798757


In [11]:
tissue_matrix=exp_df.pivot(index='accession', columns='tissue', values='p')
tissue_matrix.to_csv('/global/scratch/users/chandlersutherland/e16/cs_reports/tissue_p_matrix.csv')
tissue_matrix


tissue,anther,base,ear,embryo,endosperm,middle,root,shoot,tassel,tip
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B73,0.303536,0.885181,0.846142,0.989199,0.626,0.755518,0.981999,0.551398,,0.278795
B97,0.09789,0.000335,0.014068,0.085792,0.088894,0.00025,0.011931,0.003265,0.020264,0.001492
CML103,0.765584,0.92196,0.91182,0.888218,0.788437,0.38723,0.864715,0.945666,0.781888,0.565281
CML228,0.449477,0.594799,0.429802,,0.953608,0.044301,0.291206,0.815991,0.439576,0.194806
CML247,0.190549,0.267307,0.380847,,0.194334,0.028064,0.21205,0.470887,0.034584,0.003789
CML277,0.548067,0.414086,0.465208,,,0.339575,0.778799,0.798757,0.196334,0.25938
CML322,0.538,0.833135,0.273987,0.635475,0.609702,0.099982,0.758143,0.576081,0.485883,0.058479
CML333,0.054728,0.013784,0.142812,,0.517956,0.000749,0.746506,0.188661,0.011897,0.000218
CML52,0.45053,0.714238,0.633269,,,0.586643,0.146062,0.911648,0.226482,0.239522
CML69,0.892589,0.119696,0.67129,0.680693,0.75271,0.537052,0.969225,0.797025,0.459636,0.821941


In [12]:
bh_tissue_matrix=exp_df.pivot(index='accession', columns='tissue', values='p_BH')
bh_tissue_matrix.to_csv('/global/scratch/users/chandlersutherland/e16/cs_reports/tissue_pbh_matrix.csv')
bh_tissue_matrix

tissue,anther,base,ear,embryo,endosperm,middle,root,shoot,tassel,tip
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B73,1.0,1.0,1.0,0.989199,1.0,1.0,1.0,1.0,,1.0
B97,0.09789,0.001673,0.023447,0.10724,0.098771,0.002499,0.023862,0.008163,0.028948,0.004973
CML103,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.945666,1.0,1.0
CML228,0.674215,0.764741,0.967054,,0.953608,0.398713,0.873619,0.91799,0.791237,0.876625
CML247,0.428734,0.34368,0.428453,,0.3498,0.126289,0.318075,0.470887,0.103753,0.034098
CML277,0.730756,0.828172,0.744333,,,0.905533,0.890057,0.798757,1.0,1.0
CML322,1.0,0.833135,0.913289,0.794343,0.871002,0.499908,0.842381,0.960135,1.0,0.584791
CML333,0.098511,0.031013,0.214218,,0.5827,0.00337,0.746506,0.242565,0.03569,0.001961
CML52,0.90106,0.816272,0.844359,,,0.938629,1.0,0.911648,0.90593,0.638726
CML69,0.991765,1.0,1.0,1.0,1.0,1.0,0.969225,1.0,1.0,1.0


In [13]:
nlrs = glob.glob('/global/scratch/users/chandlersutherland/e16/cs_reports/*_nlr_tpm_tissue.csv')
all_nlr=pd.DataFrame()
for path in range(0, len(nlrs)):
    t=pd.read_csv(nlrs[path], index_col=0)
    all_nlr=all_nlr.append(t)
    
all_nlr

Unnamed: 0,accession,rep,tissue,name,chrom,chromStart,chromEnd,strand,gene_length,stranded_1,TPM,log2(TPM),HV
0,MO18W,ERR3791435,anther,Zm00034ab011550,chr1,36316025,36320996,-,4972,4,0.825847,0.868566,0
1,MO18W,ERR3791436,anther,Zm00034ab011550,chr1,36316025,36320996,-,4972,3,1.012539,1.009017,0
2,MO18W,ERR3791485,ear,Zm00034ab011550,chr1,36316025,36320996,-,4972,12,4.997908,2.584459,0
3,MO18W,ERR3791486,ear,Zm00034ab011550,chr1,36316025,36320996,-,4972,16,6.177385,2.843458,0
4,MO18W,ERR3791519,embryo,Zm00034ab011550,chr1,36316025,36320996,-,4972,6,2.008450,1.589020,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575,KI3,ERR3791757,root,Zm00029ab462360,scaf_973,10187,16361,-,6175,0,0.000000,0.000000,0
2576,KI3,ERR3791805,shoot,Zm00029ab462360,scaf_973,10187,16361,-,6175,0,0.000000,0.000000,0
2577,KI3,ERR3791806,shoot,Zm00029ab462360,scaf_973,10187,16361,-,6175,0,0.000000,0.000000,0
2578,KI3,ERR3791857,tassel,Zm00029ab462360,scaf_973,10187,16361,-,6175,0,0.000000,0.000000,0


In [14]:
path='/global/scratch/users/chandlersutherland/e16/cs_reports/B73_all_tissue.tsv'
test=pd.read_csv(path, sep='\t', index_col=0)
test
#nlr_tpm=test[test['name'].isin(nlrs)]

#nlr_tpm
#     sub=gene.drop_duplicates(subset=['Gene'])
#     sub['name']=nlrs
#     sub=sub[['name', 'HV']]

#     hv_nlr=pd.merge(nlr_tpm, sub, on='name')
#tpm_paths[0]

Unnamed: 0,accession,rep,tissue,name,chrom,chromStart,chromEnd,strand,gene_length,stranded_1,TPM,log2(TPM)
17,B73,MN01011,root,Zm00001eb000010,chr1,34617,40204,+,5588,0,0.000000,0.000000
36,B73,MN01011,root,Zm00001eb000020,chr1,41214,46762,-,5549,47,16.400748,4.121077
55,B73,MN01011,root,Zm00001eb000050,chr1,108554,114382,-,5829,1,0.332190,0.413800
74,B73,MN01011,root,Zm00001eb000060,chr1,188559,189581,-,1023,10,18.928007,4.316726
93,B73,MN01011,root,Zm00001eb000070,chr1,190192,198832,-,8641,0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
755280,B73,MN01102,embryo,Zm00001eb442990,scaf_675,23981,25052,+,1072,0,0.000000,0.000000
755299,B73,MN01102,embryo,Zm00001eb443000,scaf_675,25430,26259,-,830,0,0.000000,0.000000
755318,B73,MN01102,embryo,Zm00001eb443010,scaf_692,17668,21429,-,3762,0,0.000000,0.000000
755337,B73,MN01102,embryo,Zm00001eb443020,scaf_692,26462,30223,-,3762,0,0.000000,0.000000


In [15]:
all_nlr.to_csv('/global/scratch/users/chandlersutherland/e16/cs_reports/all_nlr_tissue.csv')