In [1]:
%load_ext autoreload
%autoreload 1
%aimport prepare_data

from prepare_data import * 
from pygenesig.tools import collapse_matrix
from pygenesig.file_formats import * 

import numpy as np
import pandas as pd

## Load data

In [2]:
# Tab-separated annotation tables of the v3 dataset.
# `fdata_unprocessed` is the feature annotation that shares its file-name stem
# with the raw GTEx RPKM .gct matrix.
pdata = pd.read_csv(
    "../../data/v3/roche_annotated_pdata.tsv",
    sep="\t",
)
fdata = pd.read_csv(
    "../../data/v3/roche_annotated_fdata.tsv",
    sep="\t",
)
fdata_unprocessed = pd.read_csv(
    "../../data/v3/RocheAnnotated_GTEx_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm__Pilot_2013_01_31-featureAnnotation.txt",
    sep="\t",
)

In [3]:
# Raw expression matrix in GCT format; the file name indicates GTEx RPKM values.
exprs_unprocessed = read_gct(
    "../../data/v3/RocheAnnotated_GTEx_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm__Pilot_2013_01_31.gct"
)

In [4]:
# Processed expression matrix in GCT format; the file name indicates CPM values.
exprs_processed = read_gct(
    "../../data/v3/roche_annotated_cpm.gct"
)

In [5]:
# Sanity check: dimensions of the loaded tables and expression matrices.
(
    pdata.shape,
    exprs_unprocessed.shape,
    exprs_processed.shape,
    fdata_unprocessed.shape,
)

((1641, 65), (52576, 1641), (22211, 1641), (52576, 5))

## Process data

In [6]:
# Index the raw feature annotation by its GeneID column rendered as a string.
# A missing (NaN) GeneID therefore becomes the literal label 'nan'.
fdata_unprocessed.set_index(
    fdata_unprocessed["GeneID"].apply(str),
    inplace=True,
)

In [7]:
# Target labels: the UDISCV column of pdata passed through `sanitze_name`
# (helper defined outside this notebook; spelling as exported).
# `Series.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` returns the same ndarray on both old and current pandas.
target = pdata.UDISCV.apply(sanitze_name).values

#### Collapse gene expression by gene ID

In [8]:
# Collapse the raw matrix along axis 0, grouping by the labels in
# fdata_unprocessed.index and aggregating with a sum.
# Presumably yields one row per distinct label — confirm against collapse_matrix.
exprs_unprocessed_collapsed = collapse_matrix(
    exprs_unprocessed,
    fdata_unprocessed.index,
    axis=0,
    aggregate_fun=np.sum,
)

In [9]:
# Drop the row labelled 'nan', i.e. the string a missing GeneID takes on
# once the annotation index has been converted with str().
exprs_unprocessed_collapsed = exprs_unprocessed_collapsed.loc[
    exprs_unprocessed_collapsed.index != 'nan'
]

In [10]:
# Look up the annotation columns for every label kept in the collapsed matrix.
# Labels that occur several times in fdata_unprocessed.index return several rows here.
fdata_unprocessed_collapsed = fdata_unprocessed.loc[
    exprs_unprocessed_collapsed.index,
    ['GeneSymbol', 'GeneName', 'GeneType'],
]

In [11]:
# Keep the first of each set of rows whose column values are identical.
# Duplicates are judged on the column values, not on the index label.
# NOTE(review): assumes rows sharing a GeneID carry identical annotation — confirm.
fdata_unprocessed_collapsed = fdata_unprocessed_collapsed.drop_duplicates(keep="first")

In [12]:
# Convert the collapsed DataFrame to a plain ndarray.
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0.
# np.asarray returns the same values and is a no-op on an ndarray, so re-running
# this cell (after the name has already been rebound to an array) no longer fails.
exprs_unprocessed_collapsed = np.asarray(exprs_unprocessed_collapsed)

In [13]:
# Sanity check: the collapsed matrix and its annotation should have the same number of rows.
(
    exprs_unprocessed_collapsed.shape,
    fdata_unprocessed_collapsed.shape,
)

((24521, 1641), (24521, 3))

## Save to CSV and NumPy files

In [15]:
# Write the target labels and the gene-symbol vectors for both expression matrices.
# `Series.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` returns the same ndarray on both old and current pandas.
write_target(target, "../../data_processed/v3/target.csv")
write_rosetta(
    fdata.GeneSymbol.values,
    "../../data_processed/v3/rosetta_processed.csv",
)
write_rosetta(
    fdata_unprocessed_collapsed.GeneSymbol.values,
    "../../data_processed/v3/rosetta_unprocessed.csv",
)

In [16]:
# Write both expression matrices to .npy files.
# NOTE(review): exprs_processed is passed exactly as read_gct returned it, while the
# unprocessed matrix was converted to an ndarray first — confirm write_expr accepts both.
write_expr(
    exprs_unprocessed_collapsed,
    "../../data_processed/v3/exprs_unprocessed.npy",
)
write_expr(
    exprs_processed,
    "../../data_processed/v3/exprs_processed.npy",
)