## blood, spleen, muscle RNA editing specific, high-confidence genes as determined by Gabay et al. using RediToolKnown.py

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from scipy import stats
import os
import seaborn as sns
import math



### blood

In [3]:
# get files from folder
bloodFolder="/Users/karlfrontzek/Documents/UCL/rna_editing/prionREDI/blood"
os.chdir(bloodFolder)
onlyfiles = [f for f in os.listdir(bloodFolder) if os.path.isfile(os.path.join(bloodFolder, f))]

In [19]:
# read files in dataframe
df = pd.DataFrame()
for i in range(0,len(onlyfiles)):
    fname=onlyfiles[i].split(".")[0] # assign sample name
    df0=pd.read_csv(onlyfiles[i], sep="\t") # read file
    df0['sampleID']=fname # assign sample to table
    df=df.append(df0) # append table

In [20]:
# split base counts
dfb=df["BaseCount[A,C,G,T]"].str.split(",",expand=True) # expand column
dfb.columns=['A',"C","G","T"] # rename columns
dfb['A']=dfb['A'].str.replace('[',"") # replace brackets
dfb['T']=dfb['T'].str.replace(']',"")

In [21]:
df[['A',"C","G","T"]]=dfb # merge tables

In [22]:
df[['A',"C","G","T"]]= df[['A',"C","G","T"]].astype("int")

In [23]:
# 4 rows with T>=3, but A as ref allele
# keep only G>=3
df_raw=df
df=df.loc[df["G"]>=3]

In [24]:
# merge metadata
os.chdir("/Users/karlfrontzek/Documents/UCL/rna_editing/prion/")
# read meta-data and extract ID
peri_meta=pd.read_csv("meta_p3506_PeripheralSamples.csv",sep=";")
peri_meta.columns=['TubeID','SampleID','Treatment','wpi','Region']
peri_meta=peri_meta.loc[peri_meta['Region']=="blood"]
peri_meta['SampleID']=peri_meta['SampleID'].str.split("_",expand=True)[0]

In [25]:
# calculate editing frequency
df['editFreq']=df['G']/df['A']
# sort by editing frequency >= 1%
df.loc[df['editFreq']>=0.01] # all values are > 1 %

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0.1,AllSubs,"BaseCount[A,C,G,T]",Coverage-q30,Frequency,MeanQ,Position,Reference,Region,Strand,Unnamed: 0,sampleID,A,C,G,T,editFreq
0,AG,"[264, 0, 650, 0]",914.0,0.71,40.59,74226862.0,A,chrX,0.0,,PB-79,264,0,650,0,2.462121
2,AG,"[11, 0, 15, 0]",26.0,0.58,43.04,17804344.0,A,chr13,0.0,,PB-79,11,0,15,0,1.363636
18,AG,"[19, 0, 3, 0]",22.0,0.14,40.09,75719719.0,A,chr14,0.0,,PB-79,19,0,3,0,0.157895
30,-,"[38, 0, 3, 0]",41.0,0.00,41.71,93189584.0,A,chr5,0.0,,PB-79,38,0,3,0,0.078947
0,AG,"[300, 0, 1319, 0]",1619.0,0.81,37.29,74226862.0,A,chrX,0.0,,PB-122,300,0,1319,0,4.396667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,AG,"[2, 0, 9, 0]",11.0,0.82,69.55,17804344.0,A,chr13,0.0,,PB-74,2,0,9,0,4.500000
3,AG,"[13, 0, 3, 0]",16.0,0.19,70.94,17804548.0,A,chr13,0.0,,PB-74,13,0,3,0,0.230769
13,-,"[97, 0, 5, 0]",102.0,0.00,61.55,91656089.0,A,chr16,1.0,,PB-74,97,0,5,0,0.051546
17,AG,"[20, 0, 3, 0]",23.0,0.13,46.13,75719719.0,A,chr14,0.0,,PB-74,20,0,3,0,0.150000


In [26]:
df=df.merge(peri_meta,left_on="sampleID",right_on="SampleID") # merge tables

In [27]:
# keep df_raw for statistical comparisons
df_raw=df_raw.merge(peri_meta,left_on="sampleID",right_on="SampleID") # merge tables

In [28]:
df_raw.loc[(df_raw['wpi']=='4') & (df_raw['Position']==172092348)].sort_values(['Region_x',"Position"])

Unnamed: 0.1,AllSubs,"BaseCount[A,C,G,T]",Coverage-q30,Frequency,MeanQ,Position,Reference,Region_x,Strand,Unnamed: 0,sampleID,A,C,G,T,TubeID,SampleID,Treatment,wpi,Region_y
366,-,"[91, 0, 3, 0]",94.0,0.0,40.79,172092348.0,A,chr1,1.0,,PB-9,91,0,3,0,p1608_4212/37,PB-9,RML6,4,blood
1221,-,"[114, 0, 6, 0]",120.0,0.0,40.6,172092348.0,A,chr1,1.0,,PB-11,114,0,6,0,p1608_4212/19,PB-11,RML6,4,blood
1314,-,"[63, 0, 0, 0]",63.0,0.0,40.17,172092348.0,A,chr1,1.0,,PB-14,63,0,0,0,p1608_4212/38,PB-14,NBH,4,blood
1346,-,"[90, 0, 0, 0]",90.0,0.0,40.43,172092348.0,A,chr1,1.0,,PB-16,90,0,0,0,p1608_4212/20,PB-16,NBH,4,blood
1467,-,"[60, 0, 1, 0]",61.0,0.0,40.44,172092348.0,A,chr1,1.0,,PB-2,60,0,1,0,p1608_4212/1,PB-2,RML6,4,blood
2071,-,"[44, 0, 0, 0]",44.0,0.0,42.36,172092348.0,A,chr1,1.0,,PB-7,44,0,0,0,p1608_4212/2,PB-7,NBH,4,blood


In [30]:
# test if edited site occurs at least in 2/3 of replicates, easy: positions are all unique
occurence = []
wpii=[4,4,8,8,12,12,14,14,16,16,18,18,20,20,'term','term']
treat=['RML6','NBH']*8
loopi=pd.DataFrame({"treat":treat,"wpi":wpii})


In [31]:
df['Position'].loc[(df['wpi']==str(loopi['wpi'].iloc[0])) & (df['Treatment']==loopi['treat'].iloc[0])].value_counts()

74226862.0     3
17804344.0     3
91655615.0     2
91656089.0     2
75719719.0     2
172092348.0    2
17804548.0     2
91656122.0     1
91656061.0     1
91656133.0     1
93189584.0     1
157558149.0    1
17804365.0     1
Name: Position, dtype: int64

In [32]:
tS=pd.DataFrame([])
tS[['Position','Counts']]=pd.DataFrame(df['Position'].loc[(df['wpi']==str(loopi['wpi'].iloc[0])) & (df['Treatment']==loopi['treat'].iloc[0])].value_counts()).reset_index()


In [34]:
# extract sites that are edited at least in 2/3 replicates

testSites=pd.DataFrame([])
for i in range(0,len(loopi)):
    tS=pd.DataFrame([])
    tS[['Position','Counts']]=pd.DataFrame(df['Position'].loc[(df['wpi']==str(loopi['wpi'].iloc[i])) & (df['Treatment']==loopi['treat'].iloc[i])].value_counts()).reset_index()
    tS['wpi']=loopi['wpi'].iloc[i]
    tS['Treatment']=loopi['treat'].iloc[i]
    testSites=testSites.append(tS)

In [35]:
dfReplicates=df[['wpi','Treatment','SampleID']].drop_duplicates()
totalRep=pd.Series(list(zip(dfReplicates.wpi, dfReplicates.Treatment))).value_counts()
totalRep=totalRep.reset_index()
totalRep.columns=['Index','Rep']

In [36]:
testSites['wpi']=testSites['wpi'].astype('str')

In [37]:
testSites['wpiTreatment']=list(zip(testSites.wpi, testSites.Treatment)) # merge wpi+Treatment for threshold 
Thr=pd.DataFrame({'Index':[],'Thr':[]})                                                       # calculation of edited sites
for i in range(0,len(testSites)): # iterate over testsite counts, keep those occuring in floor(2/3 of samples)
    wpiTreat=testSites['wpiTreatment'].iloc[i]  # read current treatment+wpi combi
    Tr=math.floor(totalRep['Rep'].loc[totalRep['Index']==wpiTreat]*2/3) # calculate current threshold
    Tp=pd.DataFrame({'Index':str(i),'Thr':str(Tr)},index=[i]) # make list of thresholds per edited site
    Thr=Thr.append(Tp)
#testSitesThr=testSites['Position'].loc[testSites['Counts']!=1] # extract test sites 
#testSites['Counts'].loc[testSites['Position'].isin(testSites1)].head(50)

In [38]:
testSitesThr=testSites.reset_index().join(Thr)# join TestSites and Threshold

In [39]:
testSitesThr['Counts']=testSitesThr['Counts'].astype('int')
testSitesThr['Thr']=testSitesThr['Thr'].astype('int') # convert counts to int

In [40]:
testSitesThr=testSitesThr.loc[testSitesThr['Counts']>=testSitesThr['Thr']] # select for editing events which pass threshold

In [41]:
# make combination of position+wpi
uniqueEdSitesThr=pd.Series(list(zip(testSitesThr.wpi, testSitesThr.Position)))
uniqueEdSitesThr=uniqueEdSitesThr.astype('str') # convert tuple to str

In [42]:
df_raw['wpiPosition']=list(zip(df_raw.wpi, df_raw.Position)) # combine wpi+Position in df_raw
df_raw['wpiPosition']=df_raw['wpiPosition'].astype('str') # convert tuple to str

In [43]:
testSitesThr['wpiPosition']=testSitesThr['wpi']+'+'+testSitesThr['Position'].astype('str')
df_raw['wpiPosition']=df_raw['wpi']+'+'+df_raw['Position'].astype('str')

In [44]:
uniqueTestSitesThr=testSitesThr['wpiPosition'].unique() # retrieve unique editing sites

In [45]:
uniqueTestSitesThr=list(uniqueTestSitesThr) # convert unique editing sites to list

In [46]:
dfThr=df_raw.loc[df_raw['wpiPosition'].isin(uniqueTestSitesThr)] # keep editing sites that pass threshold

In [47]:
cog3=df_raw.loc[(df_raw['Position']==75719719) & (df_raw['wpi']=="term")]#write cog3 editing events
cog3.to_csv('cog3_editing.csv')

In [48]:
# clean up dfThr for export
dfThrClean=dfThr[['A','G','Treatment','wpi','wpiPosition']]


In [49]:
# test whether at least 2 replicates of non-edited site exist
uniqueSites=dfThrClean['wpiPosition'].unique()
excludeSites=[]
for i in range(0,len(uniqueSites)):
    uS=dfThrClean.loc[dfThrClean['wpiPosition']==uniqueSites[i]]
    if len(uS['Treatment'].loc[uS['Treatment']=="NBH"]) < 2:
        #print("In wpiPosition {} treated with NBH less than 2 replicates".format(uniqueSites[i]))
        excludeSites.append(uniqueSites[i])
    if len(uS['Treatment'].loc[uS['Treatment']=="RML6"]) < 2:
        #print("In wpiPosition {} treated with RML6 less than 2 replicates".format(uniqueSites[i]))
         excludeSites.append(uniqueSites[i])

In [50]:
dfThrClean=dfThrClean.loc[~dfThrClean['wpiPosition'].isin(excludeSites)]
dfThrClean.to_csv('bloodRediEli.csv') # export dfThrClean

In [51]:
# analyse statistical testing with Redit, see R
dfP=pd.read_csv('/Users/karlfrontzek/Documents/UCL/rna_editing/prionREDI/bloodEditingSitesRedit.csv')

In [52]:
dfP[['wpi','site1']]=dfP['site'].str.split("+",expand=True)

In [53]:
from statsmodels.stats.multitest import multipletests # for fdr

In [54]:
wpiUnique=dfP['wpi'].unique()

In [55]:
# adjust for fdr < 0.05 
# preassign dataframe
tT=pd.DataFrame({'wpi':[],'sig':[],'fdr':[],'pValue':[],'index':[]})
for i in range(0,len(wpiUnique)): # loop through all wpi
    tTtemp=pd.DataFrame({'wpi':[],'sig':[],'fdr':[],'pValue':[],'index':[]}) # preassign temp datafram
    # add sig and pVal to dataframe
    tTtemp['sig']=multipletests(dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]],alpha=0.05,method="fdr_bh")[0]
    tTtemp['fdr']=multipletests(dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]],alpha=0.05,method="fdr_bh")[1]
    tTtemp[['index','pValue']]=dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]].reset_index()
    tTtemp['wpi']=wpiUnique[i]
    tT=tT.append(tTtemp)

In [56]:
bloodRediAllEvents=tT.merge(dfP,left_on="pValue",right_on="pValue") # merge sites and fdr on "pValue"

In [57]:
bloodRediAllEvents.loc[bloodRediAllEvents['sig']==1] # show significant editing events

Unnamed: 0,wpi_x,sig,fdr,pValue,index,site,wpi_y,site1
12,term,1.0,0.002927,0.000418,12.0,term+75719719.0,term,75719719.0


# muscle

In [58]:
# get files from folder
muscleFolder="/Users/karlfrontzek/Documents/UCL/rna_editing/prionREDI/muscle"
os.chdir(muscleFolder)
onlyfiles = [f for f in os.listdir(muscleFolder) if os.path.isfile(os.path.join(muscleFolder, f))]

In [59]:
# read files in dataframe
df = pd.DataFrame()
df0 = pd.DataFrame()
for i in range(0,len(onlyfiles)):
    fname=onlyfiles[i].split(".")[0] # assign sample name
    df0=pd.read_csv(onlyfiles[i], sep="\t") # read file
    df0['sampleID']=fname # assign sample to table
    df=df.append(df0) # append table

In [60]:
# split base counts
dfb=df["BaseCount[A,C,G,T]"].str.split(",",expand=True) # expand column
dfb.columns=['A',"C","G","T"] # rename columns
dfb['A']=dfb['A'].str.replace('[',"") # replace brackets
dfb['T']=dfb['T'].str.replace(']',"")

In [61]:
df[['A',"C","G","T"]]=dfb # merge tables

In [62]:
df[['A',"C","G","T"]]= df[['A',"C","G","T"]].astype("int")

In [63]:
# 4 rows with T>=3, but A as ref allele
# keep only G>=3
df_raw=df
df=df.loc[df["G"]>=3]

In [64]:
# merge metadata
os.chdir("/Users/karlfrontzek/Documents/UCL/rna_editing/prion/")
# read meta-data and extract ID
peri_meta=pd.read_csv("meta_p3506_PeripheralSamples.csv",sep=";")
peri_meta.columns=['TubeID','SampleID','Treatment','wpi','Region']
peri_meta=peri_meta.loc[peri_meta['Region']=="muscle"]
peri_meta['SampleID']=peri_meta['SampleID'].str.split("_",expand=True)[0]

In [65]:
# calculate editing frequency
df['editFreq']=df['G']/df['A']
# sort by editing frequency >= 1%
#len(df['editFreq'])
len(df.loc[df['editFreq']>=0.01]) # all values are > 1 %

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


307

In [66]:
df=df.merge(peri_meta,left_on="sampleID",right_on="SampleID") # merge tables

In [67]:
# keep df_raw for statistical comparisons
df_raw=df_raw.merge(peri_meta,left_on="sampleID",right_on="SampleID") # merge tables

In [68]:
# test if edited site occurs at least in 2/3 of replicates, easy: positions are all unique
occurence = []
wpii=[4,4,8,8,12,12,14,14,16,16,18,18,20,20,'term','term']
treat=['RML6','NBH']*8
loopi=pd.DataFrame({"treat":treat,"wpi":wpii})

In [69]:
# extract sites that are edited at least in 2/3 replicates

testSites=pd.DataFrame([])
for i in range(0,len(loopi)):
    tS=pd.DataFrame([])
    tS[['Position','Counts']]=pd.DataFrame(df['Position'].loc[(df['wpi']==str(loopi['wpi'].iloc[i])) & (df['Treatment']==loopi['treat'].iloc[i])].value_counts()).reset_index()
    tS['wpi']=loopi['wpi'].iloc[i]
    tS['Treatment']=loopi['treat'].iloc[i]
    testSites=testSites.append(tS)

In [70]:
dfReplicates=df[['wpi','Treatment','SampleID']].drop_duplicates()
totalRep=pd.Series(list(zip(dfReplicates.wpi, dfReplicates.Treatment))).value_counts()
totalRep=totalRep.reset_index()
totalRep.columns=['Index','Rep']

In [71]:
testSites['wpi']=testSites['wpi'].astype('str')

In [72]:
testSites['wpiTreatment']=list(zip(testSites.wpi, testSites.Treatment)) # merge wpi+Treatment for threshold 
Thr=pd.DataFrame({'Index':[],'Thr':[]})                                                       # calculation of edited sites
for i in range(0,len(testSites)): # iterate over testsite counts, keep those occuring in floor(2/3 of samples)
    wpiTreat=testSites['wpiTreatment'].iloc[i]  # read current treatment+wpi combi
    Tr=math.floor(totalRep['Rep'].loc[totalRep['Index']==wpiTreat]*2/3) # calculate current threshold
    Tp=pd.DataFrame({'Index':str(i),'Thr':str(Tr)},index=[i]) # make list of thresholds per edited site
    Thr=Thr.append(Tp)
#testSitesThr=testSites['Position'].loc[testSites['Counts']!=1] # extract test sites 
#testSites['Counts'].loc[testSites['Position'].isin(testSites1)].head(50)

In [73]:
testSitesThr=testSites.reset_index().join(Thr)# join TestSites and Threshold

In [74]:
testSitesThr['Counts']=testSitesThr['Counts'].astype('int')
testSitesThr['Thr']=testSitesThr['Thr'].astype('int') # convert counts to int

In [75]:
testSitesThr=testSitesThr.loc[testSitesThr['Counts']>=testSitesThr['Thr']] # select for editing events which pass threshold

In [76]:
# make combination of position+wpi
uniqueEdSitesThr=pd.Series(list(zip(testSitesThr.wpi, testSitesThr.Position)))
uniqueEdSitesThr=uniqueEdSitesThr.astype('str') # convert tuple to str

In [77]:
df_raw['wpiPosition']=list(zip(df_raw.wpi, df_raw.Position)) # combine wpi+Position in df_raw
df_raw['wpiPosition']=df_raw['wpiPosition'].astype('str') # convert tuple to str

In [78]:
testSitesThr['wpiPosition']=testSitesThr['wpi']+'+'+testSitesThr['Position'].astype('str')
df_raw['wpiPosition']=df_raw['wpi']+'+'+df_raw['Position'].astype('str')

In [79]:
uniqueTestSitesThr=testSitesThr['wpiPosition'].unique() # retrieve unique editing sites

In [80]:
uniqueTestSitesThr=list(uniqueTestSitesThr) # convert unique editing sites to list

In [81]:
df_raw.loc[df_raw['wpiPosition']=="4+77407731"]

Unnamed: 0,Region_x,Position,Reference,Strand,Coverage-q30,MeanQ,"BaseCount[A,C,G,T]",AllSubs,Frequency,sampleID,A,C,G,T,TubeID,SampleID,Treatment,wpi,Region_y,wpiPosition
1122,chr5,77407731,A,0,11,55.18,"[2, 0, 9, 0]",AG,0.82,PB-11,2,0,9,0,p1608_4211/55,PB-11,RML6,4,muscle,4+77407731


In [82]:
dfThr=df_raw.loc[df_raw['wpiPosition'].isin(uniqueTestSitesThr)] # keep editing sites that pass threshold

In [83]:
# clean up dfThr for export
dfThrClean=dfThr[['A','G','Treatment','wpi','wpiPosition']]


In [84]:
# test whether at least 2 replicates of non-edited site exist
uniqueSites=dfThrClean['wpiPosition'].unique()
excludeSites=[]
for i in range(0,len(uniqueSites)):
    uS=dfThrClean.loc[dfThrClean['wpiPosition']==uniqueSites[i]]
    if len(uS['Treatment'].loc[uS['Treatment']=="NBH"]) < 2:
        #print("In wpiPosition {} treated with NBH less than 2 replicates".format(uniqueSites[i]))
        excludeSites.append(uniqueSites[i])
    if len(uS['Treatment'].loc[uS['Treatment']=="RML6"]) < 2:
        #print("In wpiPosition {} treated with RML6 less than 2 replicates".format(uniqueSites[i]))
         excludeSites.append(uniqueSites[i])

In [85]:
dfThrClean=dfThrClean.loc[~dfThrClean['wpiPosition'].isin(excludeSites)]
dfThrClean.to_csv('muscleRediEli.csv') # export dfThrClean

In [86]:
excludeSites

['14+75719719',
 '18+7936048',
 '18+7936048',
 '4+7936048',
 '4+7936048',
 '4+77407731',
 '4+77407731']

In [87]:
# clean up dfThr for export
#dfThrClean=dfThr[['A','G','Treatment','wpi','wpiPosition']]
# only on 1 replicate: 4+7936048, 4+77407731, only on 4: 14+74226862, 14+75719719; only on 2: 18+7936048
#dfThrClean.to_csv('/Users/karlfrontzek/Documents/UCL/rna_editing/prion/muscleRediEli.csv')


In [88]:
# load p-Value adjusted results from Redit


In [89]:
# analyse statistical testing with Redit, see R
dfP=pd.read_csv('/Users/karlfrontzek/Documents/UCL/rna_editing/prionREDI/muscleEditingSitesRedit.csv')

In [90]:
dfP[['wpi','site1']]=dfP['site'].str.split("+",expand=True)

In [91]:
from statsmodels.stats.multitest import multipletests # for fdr

In [92]:
wpiUnique=dfP['wpi'].unique()

In [93]:
# adjust for fdr < 0.05 
# preassign dataframe
tT=pd.DataFrame({'wpi':[],'sig':[],'fdr':[],'pValue':[],'index':[]})
for i in range(0,len(wpiUnique)): # loop through all wpi
    tTtemp=pd.DataFrame({'wpi':[],'sig':[],'fdr':[],'pValue':[],'index':[]}) # preassign temp datafram
    # add sig and pVal to dataframe
    tTtemp['sig']=multipletests(dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]],alpha=0.05,method="fdr_bh")[0]
    tTtemp['fdr']=multipletests(dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]],alpha=0.05,method="fdr_bh")[1]
    tTtemp[['index','pValue']]=dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]].reset_index()
    tTtemp['wpi']=wpiUnique[i]
    tT=tT.append(tTtemp)

In [94]:
bloodRediAllEvents=tT.merge(dfP,left_on="pValue",right_on="pValue") # merge sites and fdr on "pValue"

In [95]:
bloodRediAllEvents.loc[bloodRediAllEvents['sig']==1] # show significant editing events

Unnamed: 0,wpi_x,sig,fdr,pValue,index,site,wpi_y,site1


# spleen

In [158]:
# get files from folder
muscleFolder="/Users/karlfrontzek/Documents/UCL/rna_editing/prionREDI/spleen"
os.chdir(muscleFolder)
onlyfiles = [f for f in os.listdir(muscleFolder) if os.path.isfile(os.path.join(muscleFolder, f))]

In [159]:
# read files in dataframe
df = pd.DataFrame()
df0 = pd.DataFrame()
for i in range(0,len(onlyfiles)):
    fname=onlyfiles[i].split(".")[0] # assign sample name
    df0=pd.read_csv(onlyfiles[i], sep="\t") # read file
    df0['sampleID']=fname # assign sample to table
    df=df.append(df0) # append table

In [160]:
# split base counts
dfb=df["BaseCount[A,C,G,T]"].str.split(",",expand=True) # expand column
dfb.columns=['A',"C","G","T"] # rename columns
dfb['A']=dfb['A'].str.replace('[',"") # replace brackets
dfb['T']=dfb['T'].str.replace(']',"")

In [161]:
df[['A',"C","G","T"]]=dfb # merge tables

In [162]:
df[['A',"C","G","T"]]= df[['A',"C","G","T"]].astype("int")

In [163]:
df.loc[df['T']>=3]# 1 site with 3xT, but A>T, not C>G site due to 172xA->807xG

Unnamed: 0,Region,Position,Reference,Strand,Coverage-q30,MeanQ,"BaseCount[A,C,G,T]",AllSubs,Frequency,sampleID,A,C,G,T
39,chr5,77407731,A,0,982,53.95,"[172, 0, 807, 3]",AG,0.82,PB-23,172,0,807,3


In [164]:
# keep only G>=3
df_raw=df
df=df.loc[df["G"]>=3]

In [165]:
# merge metadata
os.chdir("/Users/karlfrontzek/Documents/UCL/rna_editing/prion/")
# read meta-data and extract ID
peri_meta=pd.read_csv("meta_p3506_PeripheralSamples.csv",sep=";")
peri_meta.columns=['TubeID','SampleID','Treatment','wpi','Region']
peri_meta=peri_meta.loc[peri_meta['Region']=="spleen"]
peri_meta['SampleID']=peri_meta['SampleID'].str.split("_",expand=True)[0]

In [166]:
# calculate editing frequency
df['editFreq']=df['G']/df['A']
# sort by editing frequency >= 1%
#len(df['editFreq'])
#len(df.loc[df['editFreq']]) # 2 values are below 0.1%
df=df.loc[df['editFreq']>= 0.01]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [167]:
df=df.merge(peri_meta,left_on="sampleID",right_on="SampleID") # merge tables

In [168]:
# keep df_raw for statistical comparisons
df_raw=df_raw.merge(peri_meta,left_on="sampleID",right_on="SampleID") # merge tables

In [169]:
# export raw data for supplementary table, e.g. cog3 terminal stage
df_raw.loc[(df_raw['wpi']=="16") & (df_raw['Position']==7936048.0)].to_csv("rediFlnbSuppTab3.csv")
df_raw.loc[(df_raw['wpi']=="16") & (df_raw['Position']==172092348.0)].to_csv("rediCopaSuppTab3.csv")

In [107]:
# test if edited site occurs at least in 2/3 of replicates, easy: positions are all unique
occurence = []
wpii=[4,4,8,8,12,12,14,14,16,16,18,18,20,20,'term','term']
treat=['RML6','NBH']*8
loopi=pd.DataFrame({"treat":treat,"wpi":wpii})

In [108]:
# extract sites that are edited at least in 2/3 replicates

testSites=pd.DataFrame([])
for i in range(0,len(loopi)):
    tS=pd.DataFrame([])
    tS[['Position','Counts']]=pd.DataFrame(df['Position'].loc[(df['wpi']==str(loopi['wpi'].iloc[i])) & (df['Treatment']==loopi['treat'].iloc[i])].value_counts()).reset_index()
    tS['wpi']=loopi['wpi'].iloc[i]
    tS['Treatment']=loopi['treat'].iloc[i]
    testSites=testSites.append(tS)

In [109]:
dfReplicates=df[['wpi','Treatment','SampleID']].drop_duplicates()
totalRep=pd.Series(list(zip(dfReplicates.wpi, dfReplicates.Treatment))).value_counts()
totalRep=totalRep.reset_index()
totalRep.columns=['Index','Rep']

In [110]:
testSites['wpi']=testSites['wpi'].astype('str')

In [111]:
testSites['wpiTreatment']=list(zip(testSites.wpi, testSites.Treatment)) # merge wpi+Treatment for threshold 
Thr=pd.DataFrame({'Index':[],'Thr':[]})                                                       # calculation of edited sites
for i in range(0,len(testSites)): # iterate over testsite counts, keep those occuring in floor(2/3 of samples)
    wpiTreat=testSites['wpiTreatment'].iloc[i]  # read current treatment+wpi combi
    Tr=math.floor(totalRep['Rep'].loc[totalRep['Index']==wpiTreat]*2/3) # calculate current threshold
    Tp=pd.DataFrame({'Index':str(i),'Thr':str(Tr)},index=[i]) # make list of thresholds per edited site
    Thr=Thr.append(Tp)
#testSitesThr=testSites['Position'].loc[testSites['Counts']!=1] # extract test sites 
#testSites['Counts'].loc[testSites['Position'].isin(testSites1)].head(50)

In [112]:
testSitesThr=testSites.reset_index().join(Thr) # join TestSites and Threshold

In [113]:
testSitesThr['Counts']=testSitesThr['Counts'].astype('int')
testSitesThr['Thr']=testSitesThr['Thr'].astype('int') # convert counts to int

In [114]:
testSitesThr=testSitesThr.loc[testSitesThr['Counts']>=testSitesThr['Thr']] # select for editing events which pass threshold

In [115]:
# make combination of position+wpi
uniqueEdSitesThr=pd.Series(list(zip(testSitesThr.wpi, testSitesThr.Position)))
uniqueEdSitesThr=uniqueEdSitesThr.astype('str') # convert tuple to str

In [116]:
df_raw['wpiPosition']=list(zip(df_raw.wpi, df_raw.Position)) # combine wpi+Position in df_raw
df_raw['wpiPosition']=df_raw['wpiPosition'].astype('str') # convert tuple to str

In [117]:
testSitesThr['wpiPosition']=testSitesThr['wpi']+'+'+testSitesThr['Position'].astype('str')
df_raw['wpiPosition']=df_raw['wpi']+'+'+df_raw['Position'].astype('str')

In [118]:
uniqueTestSitesThr=testSitesThr['wpiPosition'].unique() # retrieve unique editing sites

In [119]:
uniqueTestSitesThr=list(uniqueTestSitesThr) # convert unique editing sites to list

In [120]:
dfThr=df_raw.loc[df_raw['wpiPosition'].isin(uniqueTestSitesThr)] # keep editing sites that pass threshold

In [121]:
# clean up dfThr for export
dfThrClean=dfThr[['A','G','Treatment','wpi','wpiPosition']]


In [122]:
# test whether at least 2 replicates of non-edited site exist
uniqueSites=dfThrClean['wpiPosition'].unique()
excludeSites=[]
for i in range(0,len(uniqueSites)):
    uS=dfThrClean.loc[dfThrClean['wpiPosition']==uniqueSites[i]]
    if len(uS['Treatment'].loc[uS['Treatment']=="NBH"]) < 2:
        #print("In wpiPosition {} treated with NBH less than 2 replicates".format(uniqueSites[i]))
        excludeSites.append(uniqueSites[i])
    if len(uS['Treatment'].loc[uS['Treatment']=="RML6"]) < 2:
        #print("In wpiPosition {} treated with RML6 less than 2 replicates".format(uniqueSites[i]))
         excludeSites.append(uniqueSites[i])

In [123]:
dfThrClean=dfThrClean.loc[~dfThrClean['wpiPosition'].isin(excludeSites)]
dfThrClean.to_csv('spleenRediEli.csv') # export dfThrClean

In [124]:
# load p-Value adjusted results from Redit

In [125]:
# analyse statistical testing with Redit, see R
dfP=pd.read_csv('/Users/karlfrontzek/Documents/UCL/rna_editing/prionREDI/spleenEditingSitesRedit.csv')

In [126]:
dfP[['wpi','site1']]=dfP['site'].str.split("+",expand=True)

In [127]:
from statsmodels.stats.multitest import multipletests # for fdr

In [128]:
wpiUnique=dfP['wpi'].unique()

In [129]:
# adjust for fdr < 0.05 
# preassign dataframe
tT=pd.DataFrame({'wpi':[],'sig':[],'fdr':[],'pValue':[],'index':[]})
for i in range(0,len(wpiUnique)): # loop through all wpi
    tTtemp=pd.DataFrame({'wpi':[],'sig':[],'fdr':[],'pValue':[],'index':[]}) # preassign temp datafram
    # add sig and pVal to dataframe
    tTtemp['sig']=multipletests(dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]],alpha=0.05,method="fdr_bh")[0]
    tTtemp['fdr']=multipletests(dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]],alpha=0.05,method="fdr_bh")[1]
    tTtemp[['index','pValue']]=dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]].reset_index()
    tTtemp['wpi']=wpiUnique[i]
    tT=tT.append(tTtemp)

In [130]:
bloodRediAllEvents=tT.merge(dfP,left_on="pValue",right_on="pValue") # merge sites and fdr on "pValue"

In [131]:
bloodRediAllEvents.loc[bloodRediAllEvents['sig']==1] # show significant editing events

Unnamed: 0,wpi_x,sig,fdr,pValue,index,site,wpi_y,site1
5,16,1.0,0.045061,0.005688,5.0,16+7936048,16,7936048
7,16,1.0,0.045061,0.006932,7.0,16+172092348,16,172092348


In [132]:
#df_raw
df_raw.loc[(df_raw['Position']==7936048) & (df_raw['wpi']=="16")]

Unnamed: 0,Region_x,Position,Reference,Strand,Coverage-q30,MeanQ,"BaseCount[A,C,G,T]",AllSubs,Frequency,sampleID,A,C,G,T,TubeID,SampleID,Treatment,wpi,Region_y,wpiPosition
18,chr14,7936048,A,1,32,45.06,"[32, 0, 0, 0]",-,0.0,PB-79,32,0,0,0,p1608_4211/80,PB-79,NBH,16,spleen,16+7936048
174,chr14,7936048,A,1,39,45.54,"[28, 0, 11, 0]",AG,0.28,PB-78,28,0,11,0,p1608_4211/26,PB-78,NBH,16,spleen,16+7936048
1350,chr14,7936048,A,1,47,38.57,"[45, 0, 2, 0]",-,0.0,PB-71,45,0,2,0,p1608_4211/8,PB-71,NBH,16,spleen,16+7936048
1431,chr14,7936048,A,1,17,45.71,"[9, 0, 8, 0]",AG,0.47,PB-73,9,0,8,0,p1608_4211/79,PB-73,RML6,16,spleen,16+7936048
1594,chr14,7936048,A,1,46,48.8,"[28, 0, 18, 0]",AG,0.39,PB-66,28,0,18,0,p1608_4211/7,PB-66,RML6,16,spleen,16+7936048
1821,chr14,7936048,A,1,24,41.12,"[12, 0, 12, 0]",AG,0.5,PB-75,12,0,12,0,p1608_4211/25,PB-75,RML6,16,spleen,16+7936048


In [133]:
df_raw.loc[(df_raw['Position']==172092348) & (df_raw['wpi']=="16")]

Unnamed: 0,Region_x,Position,Reference,Strand,Coverage-q30,MeanQ,"BaseCount[A,C,G,T]",AllSubs,Frequency,sampleID,A,C,G,T,TubeID,SampleID,Treatment,wpi,Region_y,wpiPosition
32,chr1,172092348,A,1,318,38.36,"[283, 0, 35, 0]",AG,0.11,PB-79,283,0,35,0,p1608_4211/80,PB-79,NBH,16,spleen,16+172092348
187,chr1,172092348,A,1,341,38.16,"[304, 0, 37, 0]",AG,0.11,PB-78,304,0,37,0,p1608_4211/26,PB-78,NBH,16,spleen,16+172092348
1364,chr1,172092348,A,1,477,37.7,"[432, 0, 45, 0]",-,0.0,PB-71,432,0,45,0,p1608_4211/8,PB-71,NBH,16,spleen,16+172092348
1443,chr1,172092348,A,1,361,38.23,"[335, 0, 26, 0]",-,0.0,PB-73,335,0,26,0,p1608_4211/79,PB-73,RML6,16,spleen,16+172092348
1606,chr1,172092348,A,1,332,38.34,"[322, 0, 10, 0]",-,0.0,PB-66,322,0,10,0,p1608_4211/7,PB-66,RML6,16,spleen,16+172092348
1833,chr1,172092348,A,1,311,38.19,"[296, 0, 15, 0]",-,0.0,PB-75,296,0,15,0,p1608_4211/25,PB-75,RML6,16,spleen,16+172092348


# brain

In [134]:
# get files from folder
muscleFolder="/Users/karlfrontzek/Documents/UCL/rna_editing/prionREDI/brain"
os.chdir(muscleFolder)
onlyfiles = [f for f in os.listdir(muscleFolder) if os.path.isfile(os.path.join(muscleFolder, f))]

In [135]:
# read files in dataframe
df = pd.DataFrame()
df0 = pd.DataFrame()
for i in range(0,len(onlyfiles)):
    fname=onlyfiles[i].split("A-")[1] # assign sample name
    df0=pd.read_csv(onlyfiles[i], sep="\t") # read file
    df0['sampleID']=fname[0:-4] # assign sample to table
    df=df.append(df0) # append table

In [136]:
# split base counts
dfb=df["BaseCount[A,C,G,T]"].str.split(",",expand=True) # expand column
dfb.columns=['A',"C","G","T"] # rename columns
dfb['A']=dfb['A'].str.replace('[',"") # replace brackets
dfb['T']=dfb['T'].str.replace(']',"")

In [137]:
df[['A',"C","G","T"]]=dfb # merge tables

In [138]:
df[['A',"C","G","T"]]= df[['A',"C","G","T"]].astype("int")

In [139]:
df.loc[df['T']>=3]# 3 sites with 3xT, but A>T, not C>G site due to 172xA->807xG

Unnamed: 0,Region,Position,Reference,Strand,Coverage-q30,MeanQ,"BaseCount[A,C,G,T]",AllSubs,Frequency,sampleID,A,C,G,T
38,chr3,80706912,A,0,734,41.89,"[0, 0, 730, 4]",AG AT,1.0,PB-15,0,0,730,4
9,chr11,46272643,A,0,996,40.26,"[208, 0, 784, 4]",AG,0.79,PB-43,208,0,784,4
9,chr11,46272643,A,0,1176,40.08,"[241, 0, 931, 4]",AG,0.79,PB-27,241,0,931,4


In [140]:
# keep only G>=3
df_raw=df
df=df.loc[df["G"]>=3]

In [141]:
# merge metadata
os.chdir("/Users/karlfrontzek/Documents/UCL/rna_editing/prionREDI/")
# read meta-data and extract ID
peri_meta=pd.read_csv("brainMetaMain.csv",sep=";")

peri_meta.columns=['TubeID','SampleID','Genotype','Treatment','wpi','Region','Cohort']
peri_meta['SampleID']=peri_meta['SampleID'].str.split("_",expand=True)[0]


In [142]:
# calculate editing frequency
df['editFreq']=df['G']/df['A']
# sort by editing frequency >= 1%
#len(df['editFreq'])
#len(df.loc[df['editFreq']]) # 2 values are below 0.1%
df=df.loc[df['editFreq']>= 0.01]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [143]:
df.merge(peri_meta,left_on="sampleID",right_on="SampleID")

Unnamed: 0,Region_x,Position,Reference,Strand,Coverage-q30,MeanQ,"BaseCount[A,C,G,T]",AllSubs,Frequency,sampleID,...,G,T,editFreq,TubeID,SampleID,Genotype,Treatment,wpi,Region_y,Cohort
0,chrX,41654252,A,1,231,41.18,"[37, 0, 194, 0]",AG,0.84,PB-15,...,194,0,5.243243,p1608_2530/5,PB-15,wt,NBH,4,Hippocampus,main
1,chrX,72445292,A,0,62,47.69,"[10, 0, 52, 0]",AG,0.84,PB-15,...,52,0,5.200000,p1608_2530/5,PB-15,wt,NBH,4,Hippocampus,main
2,chrX,74226862,A,0,58,46.93,"[18, 0, 40, 0]",AG,0.69,PB-15,...,40,0,2.222222,p1608_2530/5,PB-15,wt,NBH,4,Hippocampus,main
3,chrX,147169686,A,1,20,48.55,"[1, 0, 19, 0]",AG,0.95,PB-15,...,19,0,19.000000,p1608_2530/5,PB-15,wt,NBH,4,Hippocampus,main
4,chrX,147169688,A,1,20,48.35,"[1, 0, 19, 0]",AG,0.95,PB-15,...,19,0,19.000000,p1608_2530/5,PB-15,wt,NBH,4,Hippocampus,main
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1791,chr2,157558140,A,0,153,57.10,"[121, 0, 32, 0]",AG,0.21,PB-30,...,32,0,0.264463,p1608_2530/10,PB-30,wt,NBH,8,Hippocampus,main
1792,chr2,157558149,A,0,152,56.24,"[107, 0, 45, 0]",AG,0.30,PB-30,...,45,0,0.420561,p1608_2530/10,PB-30,wt,NBH,8,Hippocampus,main
1793,chr1,66672714,A,1,146,40.41,"[113, 0, 33, 0]",AG,0.23,PB-30,...,33,0,0.292035,p1608_2530/10,PB-30,wt,NBH,8,Hippocampus,main
1794,chr1,172092348,A,1,411,40.27,"[390, 0, 21, 0]",-,0.00,PB-30,...,21,0,0.053846,p1608_2530/10,PB-30,wt,NBH,8,Hippocampus,main


In [144]:
df=df.merge(peri_meta,left_on="sampleID",right_on="SampleID") # merge tables

In [145]:
# keep df_raw for statistical comparisons
df_raw=df_raw.merge(peri_meta,left_on="sampleID",right_on="SampleID") # merge tables

In [146]:
# test if edited site occurs at least in 2/3 of replicates, easy: positions are all unique
occurence = []
wpii=[4,4,8,8,12,12,14,14,16,16,18,18,20,20,'term','term']
treat=['RML6','NBH']*8
loopi=pd.DataFrame({"treat":treat,"wpi":wpii})

In [147]:
# extract sites that are edited at least in 2/3 replicates

testSites=pd.DataFrame([])
for i in range(0,len(loopi)):
    tS=pd.DataFrame([])
    tS[['Position','Counts']]=pd.DataFrame(df['Position'].loc[(df['wpi']==str(loopi['wpi'].iloc[i])) & (df['Treatment']==loopi['treat'].iloc[i])].value_counts()).reset_index()
    tS['wpi']=loopi['wpi'].iloc[i]
    tS['Treatment']=loopi['treat'].iloc[i]
    testSites=testSites.append(tS)

In [148]:
testSites

Unnamed: 0,Position,Counts,wpi,Treatment
0,12411582,3,4,RML6
1,75719719,3,4,RML6
2,74226862,3,4,RML6
3,7936048,3,4,RML6
4,32047432,3,4,RML6
...,...,...,...,...
44,23960539,1,term,NBH
45,3415084,1,term,NBH
46,23960622,1,term,NBH
47,49244334,1,term,NBH


In [149]:
dfReplicates=df[['wpi','Treatment','SampleID']].drop_duplicates()
totalRep=pd.Series(list(zip(dfReplicates.wpi, dfReplicates.Treatment))).value_counts()
totalRep=totalRep.reset_index()
totalRep.columns=['Index','Rep']

In [150]:
testSites['wpi']=testSites['wpi'].astype('str')

In [151]:
testSites['wpiTreatment']=list(zip(testSites.wpi, testSites.Treatment)) # merge wpi+Treatment for threshold 
Thr=pd.DataFrame({'Index':[],'Thr':[]})                                                       # calculation of edited sites
for i in range(0,len(testSites)): # iterate over testsite counts, keep those occuring in floor(2/3 of samples)
    wpiTreat=testSites['wpiTreatment'].iloc[i]  # read current treatment+wpi combi
    Tr=math.floor(totalRep['Rep'].loc[totalRep['Index']==wpiTreat]*2/3) # calculate current threshold
    Tp=pd.DataFrame({'Index':str(i),'Thr':str(Tr)},index=[i]) # make list of thresholds per edited site
    Thr=Thr.append(Tp)
#testSitesThr=testSites['Position'].loc[testSites['Counts']!=1] # extract test sites 
#testSites['Counts'].loc[testSites['Position'].isin(testSites1)].head(50)

In [152]:
testSitesThr=testSites.reset_index().join(Thr)# join TestSites and Threshold

In [153]:
testSitesThr['Counts']=testSitesThr['Counts'].astype('int')
testSitesThr['Thr']=testSitesThr['Thr'].astype('int') # convert counts to int

In [154]:
testSitesThr

Unnamed: 0,index,Position,Counts,wpi,Treatment,wpiTreatment,Index,Thr
0,0,12411582,3,4,RML6,"(4, RML6)",0,2
1,1,75719719,3,4,RML6,"(4, RML6)",1,2
2,2,74226862,3,4,RML6,"(4, RML6)",2,2
3,3,7936048,3,4,RML6,"(4, RML6)",3,2
4,4,32047432,3,4,RML6,"(4, RML6)",4,2
...,...,...,...,...,...,...,...,...
700,44,23960539,1,term,NBH,"(term, NBH)",700,2
701,45,3415084,1,term,NBH,"(term, NBH)",701,2
702,46,23960622,1,term,NBH,"(term, NBH)",702,2
703,47,49244334,1,term,NBH,"(term, NBH)",703,2


In [155]:
testSitesThr=testSitesThr.loc[testSitesThr['Counts']>=testSitesThr['Thr']] # select for editing events which pass threshold

In [156]:
# make combination of position+wpi
uniqueEdSitesThr=pd.Series(list(zip(testSitesThr.wpi, testSitesThr.Position)))
uniqueEdSitesThr=uniqueEdSitesThr.astype('str') # convert tuple to str

In [157]:
df_raw['wpiPosition']=list(zip(df_raw.wpi_y, df_raw.Position)) # combine wpi+Position in df_raw
df_raw['wpiPosition']=df_raw['wpiPosition'].astype('str') # convert tuple to str

AttributeError: 'DataFrame' object has no attribute 'wpi_y'

In [None]:
testSitesThr['wpiPosition']=testSitesThr['wpi']+'+'+testSitesThr['Position'].astype('str')
df_raw['wpiPosition']=df_raw['wpi_y']+'+'+df_raw['Position'].astype('str')

In [None]:
uniqueTestSitesThr=testSitesThr['wpiPosition'].unique() # retrieve unique editing sites

In [None]:
uniqueTestSitesThr=list(uniqueTestSitesThr) # convert unique editing sites to list

In [None]:
dfThr=df_raw.loc[df_raw['wpiPosition'].isin(uniqueTestSitesThr)] # keep editing sites that pass threshold

In [None]:
# clean up dfThr for export
dfThrClean=dfThr[['A','G','Treatment_y','wpi_y','wpiPosition']]


In [None]:
# test whether at least 2 replicates of non-edited site exist
uniqueSites=dfThrClean['wpiPosition'].unique()
excludeSites=[]
for i in range(0,len(uniqueSites)):
    uS=dfThrClean.loc[dfThrClean['wpiPosition']==uniqueSites[i]]
    if len(uS['Treatment_y'].loc[uS['Treatment_y']=="NBH"]) < 2:
        #print("In wpiPosition {} treated with NBH less than 2 replicates".format(uniqueSites[i]))
        excludeSites.append(uniqueSites[i])
    if len(uS['Treatment_y'].loc[uS['Treatment_y']=="RML6"]) < 2:
        #print("In wpiPosition {} treated with RML6 less than 2 replicates".format(uniqueSites[i]))
         excludeSites.append(uniqueSites[i])

In [None]:
dfThrClean=dfThrClean.loc[~dfThrClean['wpiPosition'].isin(excludeSites)]
dfThrClean.to_csv('brainRediEli.csv') # export dfThrClean

In [None]:
# load p-Value adjusted results from Redit

In [None]:
# analyse statistical testing with Redit, see R
dfP=pd.read_csv('/Users/karlfrontzek/Documents/UCL/rna_editing/prionREDI/brainEditingSitesRedit.csv')

In [None]:
dfP[['wpi','site1']]=dfP['site'].str.split("+",expand=True)

In [None]:
from statsmodels.stats.multitest import multipletests # for fdr

In [None]:
wpiUnique=dfP['wpi'].unique()

In [None]:
# adjust for fdr < 0.05 
# preassign dataframe
tT=pd.DataFrame({'wpi':[],'sig':[],'fdr':[],'pValue':[],'index':[]})
for i in range(0,len(wpiUnique)): # loop through all wpi
    tTtemp=pd.DataFrame({'wpi':[],'sig':[],'fdr':[],'pValue':[],'index':[]}) # preassign temp datafram
    # add sig and pVal to dataframe
    tTtemp['sig']=multipletests(dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]],alpha=0.05,method="fdr_bh")[0]
    tTtemp['fdr']=multipletests(dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]],alpha=0.05,method="fdr_bh")[1]
    tTtemp[['index','pValue']]=dfP['pValue'].loc[dfP['wpi']==wpiUnique[i]].reset_index()
    tTtemp['wpi']=wpiUnique[i]
    tT=tT.append(tTtemp)

In [None]:
bloodRediAllEvents=tT.merge(dfP,left_on="pValue",right_on="pValue") # merge sites and fdr on "pValue"

In [None]:
bloodRediAllEvents.loc[bloodRediAllEvents['sig']==1] # show significant editing events

In [None]:
#df_raw
df_raw.loc[(df_raw['Position']==7936048) & (df_raw['wpi']=="16")]

In [None]:
df_raw.loc[(df_raw['Position']==172092348) & (df_raw['wpi']=="16")]