In [198]:
%load_ext autoreload
%load_ext rpy2.ipython

%autoreload 2

import os
import glob
import re
import pandas as pd
import numpy as np
import itertools

import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [203]:
%%R -o load_annotations
library(GEOquery)
load_annotations <- function(datasetids) {
    # Fetch every GEO series in `datasetids` with getGEO() and collect the
    # pData() table of the first element of each result.
    #
    # Args:
    #   datasetids: vector or list of GEO series accessions.
    #
    # Returns:
    #   A list with exactly one slot per accession, named by `datasetids`.
    #   A slot stays NULL when getGEO() returned an empty result.

    # Pre-size the list so its length always equals length(datasetids);
    # otherwise an empty result for the last accession makes `names<-` fail.
    annotations <- vector("list", length(datasetids))

    # seq_along() yields an empty sequence for empty input, whereas
    # 1:length(x) would iterate over c(1, 0).
    for (i in seq_along(datasetids)) {
        geo <- getGEO(datasetids[[i]], GSEMatrix=TRUE)
        if (length(geo) > 0) {
            annotations[[i]] <- pData(geo[[1]])
        }
    }
    names(annotations) <- datasetids
    return(annotations)
}

In [204]:
import re

def catmapping(cat):
    """Map a free-text subtype annotation to "ABC", "GCB" or "UNC".

    Args:
        cat: Raw annotation value. Anything that is not a string (for
            example None or NaN for a missing value) is treated as
            unclassified.

    Returns:
        "ABC" if the text contains "ABC" or "activated", otherwise "GCB" if
        it contains "GCB" or "germinal center", otherwise "UNC".
    """
    if not isinstance(cat, str):
        return "UNC"

    # Drop label fragments before matching; "GCB.ABC.signature" in particular
    # contains both markers and would otherwise always be read as "ABC".
    cleaned = cat.replace("DLBCL", "").replace("GCB.ABC.signature", "")

    if "ABC" in cleaned or "activated" in cleaned:
        return "ABC"
    if "GCB" in cleaned or "germinal center" in cleaned:
        return "GCB"
    return "UNC"
    
def statusmapping(status, dead, alive):
    """Map a free-text event status to "DEAD", "ALIVE" or "UNC".

    Matching is by substring: the value is "DEAD" when it contains ``dead``
    or the character "1", else "ALIVE" when it contains ``alive`` or the
    character "0". The "DEAD" test runs first, so a value containing both
    digits (e.g. "10") is reported as "DEAD".

    Args:
        status: Raw status value. Empty strings and anything that is not a
            string (None, NaN, bare numbers) are reported as "UNC"; numeric
            codes must therefore be passed as text.
        dead: Substring that marks an event.
        alive: Substring that marks a censored / event-free sample.

    Returns:
        One of "DEAD", "ALIVE" or "UNC".
    """
    # NaN is truthy, so a plain `not status` check would let it through and
    # the substring tests below would raise TypeError.
    if not isinstance(status, str) or not status:
        return "UNC"
    if dead in status or "1" in status:
        return "DEAD"
    if alive in status or "0" in status:
        return "ALIVE"
    return "UNC"

def osmapping(os):
    """Extract a survival time from free text and express it in months.

    The first decimal number in the text is used; if there is none, the
    first integer is used instead. Text containing "year" is multiplied by
    12 and text containing "day" is divided by 30 (when both occur, "day"
    wins); otherwise the number is returned unscaled.

    Args:
        os: Raw annotation value. Empty strings and anything that is not a
            string (None, NaN, ...) are treated as missing. Note that the
            parameter name shadows the ``os`` module inside this function.

    Returns:
        The scaled duration as a float, or NaN when no number is found.
    """
    # NaN is truthy, so `not os` alone would not stop it from reaching the
    # substring tests, which raise TypeError on floats.
    if not isinstance(os, str) or not os:
        return np.nan

    mul = 1
    if "year" in os:
        mul = 12
    if "day" in os:
        mul = 1 / 30

    # Prefer a decimal number; fall back to a bare integer so values such as
    # "10 years" are not silently discarded.
    match = re.search(r"\d+\.\d+", os) or re.search(r"\d+", os)
    if match is None:
        return np.nan

    # Built-in float / np.nan: the np.float and np.NaN aliases no longer
    # exist in current NumPy releases.
    return float(match.group(0)) * mul
    
def convertcolumns(annotation, subtype_column, os_column, os_status, pfs_column, pfs_status,
                   dead_label="DEAD", alive_label="ALIVE"):
    """Extract and normalise the subtype and survival columns of one table.

    The input frame is never modified; a new frame is built from the
    requested columns.

    Args:
        annotation: DataFrame holding the raw per-sample annotation.
        subtype_column: Column passed through ``catmapping``.
        os_column: Column passed through ``osmapping`` to give "os".
        os_status: Column passed through ``statusmapping`` to give
            "os_status".
        pfs_column: Column passed through ``osmapping`` to give "pfs".
        pfs_status: Column passed through ``statusmapping`` to give
            "pfs_status".
        dead_label: Substring handed to ``statusmapping`` as the event marker.
        alive_label: Substring handed to ``statusmapping`` as the event-free
            marker.

    Returns:
        A DataFrame indexed like ``annotation`` with the columns "type",
        "os", "os_status", "pfs" and "pfs_status"; the "type" and both
        status columns are categorical.

    Raises:
        KeyError: If ``subtype_column`` is not a column of ``annotation``.
    """
    def raw_values(column):
        # The survival columns are optional: when the requested name is not a
        # column of the table, every sample is treated as missing (None).
        if column in annotation:
            return list(annotation[column])
        return [None] * len(annotation)

    # Build a fresh frame instead of adding placeholder columns to the input
    # and assigning into a slice of it; that mutated the caller's table and
    # could overwrite a real column sharing a placeholder's name.
    converted = pd.DataFrame(
        {
            "type": [catmapping(value) for value in annotation[subtype_column]],
            "os": [osmapping(value) for value in raw_values(os_column)],
            "os_status": [statusmapping(value, dead_label, alive_label)
                          for value in raw_values(os_status)],
            "pfs": [osmapping(value) for value in raw_values(pfs_column)],
            "pfs_status": [statusmapping(value, dead_label, alive_label)
                           for value in raw_values(pfs_status)],
        },
        index=annotation.index,
    )

    return converted.astype(
        {"type": "category", "os_status": "category", "pfs_status": "category"}
    )

In [205]:
def process_annotations(datasets_path="data//B_cell_lymph_datasets_corrected.xlsx"):
    """Load, filter and normalise the annotations of every listed dataset.

    Reads the dataset spreadsheet, fetches the annotation tables through
    ``load_annotations``, removes the excluded samples from four of the
    tables, and converts each table with ``convertcolumns`` using the column
    names given in the corresponding spreadsheet row.

    Args:
        datasets_path: Path of the Excel sheet describing the datasets. It
            must provide the columns Dataset_id, subtype_column, OS_column,
            OS_status_column, PFS_column and PFS_status_column.

    Returns:
        A single DataFrame with the converted annotations of all datasets
        concatenated in spreadsheet order.
    """
    datasets = pd.read_excel(datasets_path).fillna("")
    datasetids = list(datasets.Dataset_id)

    annotations = load_annotations(datasetids)

    # (position in the dataset list, column to inspect, marker to drop).
    # Positions are 0-based and tied to the row order of the spreadsheet.
    # NOTE(review): ": BL" / ": MHG" presumably tag non-DLBCL samples - confirm.
    excluded_samples = (
        (16, "characteristics_ch1.5", ": BL"),
        (19, "characteristics_ch1.5", ": BL"),
        (6, "characteristics_ch1.1", ": MHG"),
        (9, "characteristics_ch1.1", ": MHG"),
    )
    for position, column, marker in excluded_samples:
        table = annotations[position]
        keep = table[column].apply(lambda value: marker not in value)
        annotations[position] = table[keep]

    prannotations = [
        convertcolumns(
            annotations[position],
            row.subtype_column,
            row.OS_column,
            row.OS_status_column,
            row.PFS_column,
            row.PFS_status_column,
        )
        for position, (_, row) in enumerate(datasets.iterrows())
    ]

    return pd.concat(prannotations)

In [206]:
# Keep the result so later cells can reuse it without repeating the GEO
# downloads, and show only a preview instead of the whole table.
annotations_processed = process_annotations()
annotations_processed.head(10)

R[write to console]: Found 1 file(s)

R[write to console]: GSE10846_series_matrix.txt.gz

R[write to console]: Using locally cached version: /tmp/Rtmp6ukntQ/GSE10846_series_matrix.txt.gz

R[write to console]: Parsed with column specification:
cols(
  .default = col_double(),
  ID_REF = col_character()
)

R[write to console]: See spec(...) for full column specifications.

R[write to console]: Using locally cached version of GPL570 found here:
/tmp/Rtmp6ukntQ/GPL570.soft 

R[write to console]:  62 parsing failures.
  row     col           expected    actual         file
54614 SPOT_ID 1/0/T/F/TRUE/FALSE --Control literal data
54615 SPOT_ID 1/0/T/F/TRUE/FALSE --Control literal data
54616 SPOT_ID 1/0/T/F/TRUE/FALSE --Control literal data
54617 SPOT_ID 1/0/T/F/TRUE/FALSE --Control literal data
54618 SPOT_ID 1/0/T/F/TRUE/FALSE --Control literal data
..... ....... .................. ......... ............
See problems(...) for more details.


R[write to console]: Found 2 file(s)

R[write to co

R[write to console]: Found 1 file(s)

R[write to console]: GSE117556_series_matrix.txt.gz

R[write to console]: Using locally cached version: /tmp/Rtmp6ukntQ/GSE117556_series_matrix.txt.gz

R[write to console]: Parsed with column specification:
cols(
  .default = col_double(),
  ID_REF = col_character()
)

R[write to console]: See spec(...) for full column specifications.

R[write to console]: Using locally cached version of GPL14951 found here:
/tmp/Rtmp6ukntQ/GPL14951.soft 

R[write to console]: Found 1 file(s)

R[write to console]: GSE99276_series_matrix.txt.gz

R[write to console]: Using locally cached version: /tmp/Rtmp6ukntQ/GSE99276_series_matrix.txt.gz

R[write to console]: Parsed with column specification:
cols(
  ID_REF = col_double(),
  GSM2640279 = col_double(),
  GSM2640280 = col_double(),
  GSM2640281 = col_double(),
  GSM2640282 = col_double(),
  GSM2640283 = col_double(),
  GSM2640284 = col_double(),
  GSM2640285 = col_double(),
  GSM2640286 = col_double()
)

R[write to

R[write to console]: Using locally cached version of GPL570 found here:
/tmp/Rtmp6ukntQ/GPL570.soft 

R[write to console]:  62 parsing failures.
  row     col           expected    actual         file
54614 SPOT_ID 1/0/T/F/TRUE/FALSE --Control literal data
54615 SPOT_ID 1/0/T/F/TRUE/FALSE --Control literal data
54616 SPOT_ID 1/0/T/F/TRUE/FALSE --Control literal data
54617 SPOT_ID 1/0/T/F/TRUE/FALSE --Control literal data
54618 SPOT_ID 1/0/T/F/TRUE/FALSE --Control literal data
..... ....... .................. ......... ............
See problems(...) for more details.




Unnamed: 0,type,os,os_status,pfs,pfs_status
GSM274895,GCB,32.16,DEAD,,UNC
GSM274896,GCB,9.84,DEAD,,UNC
GSM274897,ABC,30.48,DEAD,,UNC
GSM274898,ABC,116.04,ALIVE,,UNC
GSM274899,ABC,57.96,ALIVE,,UNC
GSM274900,ABC,86.52,ALIVE,,UNC
GSM274901,ABC,0.60,DEAD,,UNC
GSM274902,ABC,121.68,ALIVE,,UNC
GSM274903,GCB,15.96,DEAD,,UNC
GSM274904,GCB,184.92,ALIVE,,UNC
