In [1]:
import pandas as pd

In [2]:
def filter_shortstack_df(shortstack_df,sampleName):
    """
    - filters to keep only microRNAs (microRNA==Y) 
    - keep the "#Locus", "Name" and "MajorRNAReads" columns
    """
    # filters the dataframe
    df = pd.read_csv(shortstack_df,sep="\t")
    df = df[df.MIRNA == "Y"]
    df = df[["MajorRNA","MajorRNAReads"]]
    
    # rename counts column with sampleName
    df.rename(columns={'MajorRNAReads':sampleName}, inplace=True)
    df = df.set_index("MajorRNA")
    return df

In [3]:
pwd

'/Users/mgalland/Documents/workspace/small_rna-seq_pipeline/scripts'

In [4]:
import os

resultdir = "../results/shortstack/"

samples = [s for s in os.listdir(resultdir) if s not in [".DS_Store"]]
samples

['C32.sub', 'LA4024.sub']

In [5]:
list_of_shortstack_dfs = [resultdir + s + "/Results.txt" for s in samples]
list_of_shortstack_dfs

['../results/shortstack/C32.sub/Results.txt',
 '../results/shortstack/LA4024.sub/Results.txt']

In [6]:
dfs = [filter_shortstack_df(df,sample) for df,sample in zip(list_of_shortstack_dfs,samples)]
len(dfs)

2

In [7]:
dfs[0]

Unnamed: 0_level_0,C32.sub
MajorRNA,Unnamed: 1_level_1
UCGGACCAGGCUUCAUUCCC,285


In [8]:
dfs[1]

Unnamed: 0_level_0,LA4024.sub
MajorRNA,Unnamed: 1_level_1
UCGGACCAGGCUUCAUUCCC,230
ACGGGGACGAGCCAGAGCAUG,10


In [9]:
df_merged = pd.concat(dfs,axis=1) 

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [10]:
df_merged

Unnamed: 0,C32.sub,LA4024.sub
ACGGGGACGAGCCAGAGCAUG,,10
UCGGACCAGGCUUCAUUCCC,285.0,230


## Scale counts (DESEq2 style)
1. DESeq2 performs an internal normalization where geometric mean is calculated for each gene across all samples.
2. The counts for a gene in each sample is then divided by this mean. 
3. The median of these ratios in a sample is the size factor for that sample. 

This procedure corrects for library size and RNA composition bias, which can arise for example when only a small number of genes are very highly expressed in one experiment condition but not in the other.


In [11]:
# Step 1: calculate geometric mean
df_merged["geomean"] = df_merged.mean(axis=1,skipna=True)
df_merged

Unnamed: 0,C32.sub,LA4024.sub,geomean
ACGGGGACGAGCCAGAGCAUG,,10,10.0
UCGGACCAGGCUUCAUUCCC,285.0,230,257.5


In [14]:
# Step 2: divide each mirna count by the mean of this gene
import numpy as np
df_merged.apply(lambda x: x / df_merged["geomean"],axis=1)

TypeError: '(slice(None, None, None), slice(None, None, None))' is an invalid key