# Exploración de archivos de datos (microarray) de la UofT

Exploring the data in the files sent by Dr. Nick Hovarth (UofT)

Link:
http://bar.utoronto.ca/ntools/cgi-bin/ATGE_Pathogen_raw.txt  
Comments from the author:
These are just the raw data (but normalized by the GCOS method with a TGT value set to 100 - the approximate background value is 20 and the average expression level will be around 100 units, from the ATH1 platform, so not RNA-seq-based), not averaged across replicates or set relative to appropriate control. Let me know if something like this would work for you. You may need to bite the bullet and do the remapping for SRA accessions yourself if you want the platforms and normalization/summarization to be the same.

In [4]:
import pandas as pd
import numpy as np
import os 

# Notebook-wide pandas display settings, applied once at start-up.
_display_settings = {
    # Show floating-point values with 3 decimal places.
    "display.precision": 3,
    # Keep repr(DataFrame) from wrapping across additional lines.
    "display.expand_frame_repr": False,
}
for _option, _value in _display_settings.items():
    pd.set_option(_option, _value)
# Optional (disabled): pd.set_option("display.max_rows", 4)

In [6]:
# Move into the directory holding the UofT data files so that the relative
# file names used by later cells resolve.
# The location can be overridden with the ATHAL_UOFT_DIR environment variable;
# when it is unset, the original hard-coded path is used.
uoft_data_dir = os.environ.get(
    "ATHAL_UOFT_DIR",
    "/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/athal_UofT/",
)
try:
    # Change the current working directory to the quantification dir
    os.chdir(uoft_data_dir)
except OSError as chdir_error:
    # Best effort: execution continues, but report which path failed and why
    # so a later "file not found" error is easy to trace back to this step.
    print("Can't change the Current Working Directory")
    print(f"  {uoft_data_dir!r}: {chdir_error}")
else:
    print("Directory changed")


Directory changed


In [31]:
# Load the tab-separated expression table, reading every column as text.
#
# Why dtype is given explicitly:
#   Without it pandas warns that columns "have mixed types. Specify dtype
#   option on import or set low_memory=False." The head() preview of this
#   table shows annotation rows (#timecourse, #mutant, #category, ...) sitting
#   above the numeric probe values, so each sample column mixes text and numbers.
#   Guessing a dtype per column is what triggers the warning; fixing the dtype
#   up front avoids the guessing. The values therefore arrive as strings and
#   must be converted before any numeric work.
# index_col=False keeps the first column ('NAME') as a regular column
# instead of using it as the index.
df1 = pd.read_csv(
    'arabidopsis_toronto.txt',
    sep='\t',
    index_col=False,
    dtype='unicode',
)


In [32]:
# Quick look at the raw table: column labels, then (rows, columns),
# followed by a rich display of the first ten rows.
print(df1.columns, df1.shape, sep="\n")
df1.head(n=10)

Index(['NAME', 'AtGen_A-2_17-2_REP2_ATH1_Psy-C-00 [ATGE_ExpID_120]',
       'AtGen_A-1_17-1_REP1_ATH1_Psy-C-00 [ATGE_ExpID_120]',
       'AtGen_A-3_17-3_REP3_ATH1_Psy-C-00 [ATGE_ExpID_120]',
       'AtGen_A-54_33-2_REP2_ATH1_Psy-C-02 [ATGE_ExpID_120]',
       'AtGen_A-53_33-1_REP1_ATH1_Psy-C-02 [ATGE_ExpID_120]',
       'AtGen_A-55_33-3_REP3_ATH1_Psy-C-02 [ATGE_ExpID_120]',
       'AtGen_A-59_34-3_REP2_ATH1_Psy-C-06 [ATGE_ExpID_120]',
       'AtGen_A-58_34-2_REP1_ATH1_Psy-C-06 [ATGE_ExpID_120]',
       'AtGen_A-60_34-4_REP3_ATH1_Psy-C-06 [ATGE_ExpID_120]',
       ...
       'JD AT+EO COL WT 02D INFECTED_Eo-T-048 [ATGE_ExpID_169]',
       'JD AT+EO COL WT 03D INFECTED_Eo-T-072 [ATGE_ExpID_169]',
       'JD AT+EO TIME EXP3 EO INF 3D_Eo-T-072 [ATGE_ExpID_169]',
       'JD AT+EO COL WT EXP2 03D INFECTED_Eo-T-072 [ATGE_ExpID_169]',
       'JD AT+EO COL WT 04D INFECTED_Eo-T-096 [ATGE_ExpID_169]',
       'JD AT+EO TIME EXP3 EO INF 4D_Eo-T-096 [ATGE_ExpID_169]',
       'JD AT+EO COL WT EXP2 04

Unnamed: 0,NAME,AtGen_A-2_17-2_REP2_ATH1_Psy-C-00 [ATGE_ExpID_120],AtGen_A-1_17-1_REP1_ATH1_Psy-C-00 [ATGE_ExpID_120],AtGen_A-3_17-3_REP3_ATH1_Psy-C-00 [ATGE_ExpID_120],AtGen_A-54_33-2_REP2_ATH1_Psy-C-02 [ATGE_ExpID_120],AtGen_A-53_33-1_REP1_ATH1_Psy-C-02 [ATGE_ExpID_120],AtGen_A-55_33-3_REP3_ATH1_Psy-C-02 [ATGE_ExpID_120],AtGen_A-59_34-3_REP2_ATH1_Psy-C-06 [ATGE_ExpID_120],AtGen_A-58_34-2_REP1_ATH1_Psy-C-06 [ATGE_ExpID_120],AtGen_A-60_34-4_REP3_ATH1_Psy-C-06 [ATGE_ExpID_120],...,JD AT+EO COL WT 02D INFECTED_Eo-T-048 [ATGE_ExpID_169],JD AT+EO COL WT 03D INFECTED_Eo-T-072 [ATGE_ExpID_169],JD AT+EO TIME EXP3 EO INF 3D_Eo-T-072 [ATGE_ExpID_169],JD AT+EO COL WT EXP2 03D INFECTED_Eo-T-072 [ATGE_ExpID_169],JD AT+EO COL WT 04D INFECTED_Eo-T-096 [ATGE_ExpID_169],JD AT+EO TIME EXP3 EO INF 4D_Eo-T-096 [ATGE_ExpID_169],JD AT+EO COL WT EXP2 04D INFECTED_Eo-T-096 [ATGE_ExpID_169],JD AT+EO COL WT 05D INFECTED_Eo-T-120 [ATGE_ExpID_169],JD AT+EO COL WT EXP2 05D INFECTED_Eo-T-120 [ATGE_ExpID_169],JD AT+EO TIME EXP3 EO INF 5D_Eo-T-120 [ATGE_ExpID_169]
0,#timecourse,0,0,0,2,2,2,6,6,6,...,48,72,72,72,96,96,96,120,120,120
1,#mutant,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,...,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0,WT_Col-0
2,#category,"Response to virulent, avirulent, typeIII-secre...","Response to virulent, avirulent, typeIII-secre...","Response to virulent, avirulent, typeIII-secre...","Response to virulent, avirulent, typeIII-secre...","Response to virulent, avirulent, typeIII-secre...","Response to virulent, avirulent, typeIII-secre...","Response to virulent, avirulent, typeIII-secre...","Response to virulent, avirulent, typeIII-secre...","Response to virulent, avirulent, typeIII-secre...",...,Response to Erysiphe orontii infection,Response to Erysiphe orontii infection,Response to Erysiphe orontii infection,Response to Erysiphe orontii infection,Response to Erysiphe orontii infection,Response to Erysiphe orontii infection,Response to Erysiphe orontii infection,Response to Erysiphe orontii infection,Response to Erysiphe orontii infection,Response to Erysiphe orontii infection
3,#control,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,No,No,No,No,No,No,No,No,No,No
4,#age,3.7,3.7,3.7,3.7,3.7,3.7,3.7,3.7,3.7,...,3.7,3.7,3.7,3.7,3.7,3.7,3.7,3.7,3.7,3.7
5,#tissue,leaf,leaf,leaf,leaf,leaf,leaf,leaf,leaf,leaf,...,leaf,leaf,leaf,leaf,leaf,leaf,leaf,leaf,leaf,leaf
6,244901_at,33.70,38.80,30.00,52.30,72.60,49.90,75.50,94.70,56.90,...,32.20,27.61,27.90,33.34,21.02,40.08,38.20,31.61,34.11,43.51
7,244902_at,53.40,48.20,39.10,62.80,73.10,74.70,82.70,101.10,52.00,...,17.95,25.26,18.15,26.27,23.24,19.45,34.84,25.16,21.51,26.81
8,244903_at,48.00,43.80,46.60,55.50,44.90,61.40,71.30,91.90,65.80,...,31.67,38.68,44.62,40.39,19.84,75.74,60.38,53.86,69.38,65.21
9,244904_at,13.40,22.50,21.00,14.60,12.20,8.20,23.80,21.10,27.90,...,8.03,10.27,4.93,11.94,11.76,17.89,11.67,22.90,21.97,24.35


In [23]:
# Flip the table so each original column becomes a row; then show the
# new column labels, the new shape, and the first ten rows.
df = df1.transpose()
print(df.columns, df.shape, sep="\n")
df.head(n=10)

RangeIndex(start=0, stop=22819, step=1)
(201, 22819)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22809,22810,22811,22812,22813,22814,22815,22816,22817,22818
NAME,#timecourse,#mutant,#category,#control,#age,#tissue,244901_at,244902_at,244903_at,244904_at,...,AFFX-r2-Bs-thr-M_s_at,AFFX-r2-Ec-bioB-3_at,AFFX-r2-Ec-bioB-5_at,AFFX-r2-Ec-bioB-M_at,AFFX-r2-Ec-bioC-3_at,AFFX-r2-Ec-bioC-5_at,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at
AtGen_A-2_17-2_REP2_ATH1_Psy-C-00 [ATGE_ExpID_120],0,WT_Col-0,"Response to virulent, avirulent, typeIII-secre...",Yes,3.7,leaf,33.70,53.40,48.00,13.40,...,3.70,34.90,98.30,62.30,179.40,105.60,498.90,354.20,2242.50,1979.90
AtGen_A-1_17-1_REP1_ATH1_Psy-C-00 [ATGE_ExpID_120],0,WT_Col-0,"Response to virulent, avirulent, typeIII-secre...",Yes,3.7,leaf,38.80,48.20,43.80,22.50,...,4.70,42.80,86.50,66.30,164.30,131.50,480.90,455.00,2899.30,2224.10
AtGen_A-3_17-3_REP3_ATH1_Psy-C-00 [ATGE_ExpID_120],0,WT_Col-0,"Response to virulent, avirulent, typeIII-secre...",Yes,3.7,leaf,30.00,39.10,46.60,21.00,...,19.70,52.50,99.60,57.00,151.10,100.10,457.20,262.60,2162.80,1753.40
AtGen_A-54_33-2_REP2_ATH1_Psy-C-02 [ATGE_ExpID_120],2,WT_Col-0,"Response to virulent, avirulent, typeIII-secre...",Yes,3.7,leaf,52.30,62.80,55.50,14.60,...,1.90,34.80,79.20,54.60,144.50,100.20,328.00,272.60,2553.70,1640.30
AtGen_A-53_33-1_REP1_ATH1_Psy-C-02 [ATGE_ExpID_120],2,WT_Col-0,"Response to virulent, avirulent, typeIII-secre...",Yes,3.7,leaf,72.60,73.10,44.90,12.20,...,1.60,42.40,93.90,59.70,136.50,101.40,406.50,310.60,2109.10,1988.40
AtGen_A-55_33-3_REP3_ATH1_Psy-C-02 [ATGE_ExpID_120],2,WT_Col-0,"Response to virulent, avirulent, typeIII-secre...",Yes,3.7,leaf,49.90,74.70,61.40,8.20,...,11.80,28.00,73.90,33.00,139.20,65.10,263.70,176.90,1663.10,1483.70
AtGen_A-59_34-3_REP2_ATH1_Psy-C-06 [ATGE_ExpID_120],6,WT_Col-0,"Response to virulent, avirulent, typeIII-secre...",Yes,3.7,leaf,75.50,82.70,71.30,23.80,...,1.20,27.30,59.00,39.60,113.90,69.20,296.20,223.30,2025.90,1463.90
AtGen_A-58_34-2_REP1_ATH1_Psy-C-06 [ATGE_ExpID_120],6,WT_Col-0,"Response to virulent, avirulent, typeIII-secre...",Yes,3.7,leaf,94.70,101.10,91.90,21.10,...,1.70,31.50,68.90,50.50,129.00,72.30,307.80,202.20,1817.30,1466.90
AtGen_A-60_34-4_REP3_ATH1_Psy-C-06 [ATGE_ExpID_120],6,WT_Col-0,"Response to virulent, avirulent, typeIII-secre...",Yes,3.7,leaf,56.90,52.00,65.80,27.90,...,1.80,26.10,70.20,42.20,112.20,50.10,249.40,186.10,1600.70,1292.20
