# （練習）RNA-seqカウントデータの読み込み・前処理

In [1]:
# pandasのインポート
import pandas as pd
import numpy as np

In [2]:
# Input files, given relative to the notebook's working directory:
# the featureCounts count table and the gene-id / product table.
count_file = "../input/counts.txt"
gene_id_product_file = "../input/gene_id_product.tsv"

## 課題１ pd.read_tableまたはpd.read_csvを使ってカウントデータを読み込む
データフレーム名は`df`とすること。

```
# Program:featureCounts v1.6.2; Command:"../tools/subread-1.6.2-Linux-x86_64/bin/featureCounts" "-p" "-T" "8" "-t" "exon" "-g" "gene_id" "-a" "../reference/s288c_e.gff" "-o" "../featurecount/counts.txt" "SRR453566.sorted.bam" "SRR453567.sorted.bam" "SRR453568.sorted.bam" "SRR453569.sorted.bam" "SRR453570.sorted.bam" "SRR453571.sorted.bam" 
Geneid  Chr     Start   End     Strand  Length  SRR453566.sorted.bam    SRR453567.sorted.bam    SRR453568.sorted.bam    SRR453569.sorted.bam    SRR453570.sorted.bam    SRR453571.sorted.bam
gene_0001       NC_001133.9     1807    2169    -       363     1       3       2       0       0       1
gene_0002       NC_001133.9     2480    2707    +       228     0       0       0       0       0       0
gene_0003       NC_001133.9     7235    9016    -       1782    0       0       0       0       0       0
gene_0004... 以下省略
```
1行目はfeatureCountsの実行条件が記載されているだけなので無視する。<br>
2行目は列タイトルを表すヘッダー行。<br>
3行目以降からがデータ行, 一番左の列が遺伝子idになっているのでこれをインデックスに用いる。

`pd.read_table()` メソッドの`skiprows`、`index_col`オプションを指定して読み込む。<br>
`header`オプションを使う方法, `pd.read_csv()`メソッドを使ってもよい。

In [3]:
# Fill in (.....) with the required options and run.
# skiprows=1 drops the first line of the file so the next line becomes the header;
# index_col=0 uses the first column as the row index.
df = pd.read_table(count_file, index_col=0, skiprows=1)

読み込めたら、データ件数などの確認を行う<br>
`df.head()`  `df.sum()` `df.shape` など

In [4]:
# Check that the column names and the index were read correctly, using head()
df.head()

Unnamed: 0_level_0,Chr,Start,End,Strand,Length,SRR453566.sorted.bam,SRR453567.sorted.bam,SRR453568.sorted.bam,SRR453569.sorted.bam,SRR453570.sorted.bam,SRR453571.sorted.bam
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
gene_0001,NC_001133.9,1807,2169,-,363,0,2,6,0,0,1
gene_0002,NC_001133.9,2480,2707,+,228,0,0,0,0,0,0
gene_0003,NC_001133.9,7235,9016,-,1782,0,0,0,0,0,0
gene_0004,NC_001133.9,11565,11951,-,387,0,0,0,0,0,0
gene_0005,NC_001133.9,12046,12426,+,381,2,8,10,6,7,18


In [5]:
# Check the number of rows and columns, using shape
df.shape

(6420, 11)

以後の解析のため、列名を変更しておきます。  
下記２セルを実行

In [6]:
# Lookup table for renaming the columns: BAM-file column name -> sample label
names = {'SRR453566.sorted.bam': 'batch_1',
         'SRR453567.sorted.bam': 'batch_2',
         'SRR453568.sorted.bam': 'batch_3',
         'SRR453569.sorted.bam': 'chemostat_1',
         'SRR453570.sorted.bam': 'chemostat_2',
         'SRR453571.sorted.bam': 'chemostat_3'}

In [8]:
# Rename the columns by calling rename with axis=1 (the mapper is applied to column labels)
df = df.rename(mapper=names, axis=1)

In [9]:
# Check the renamed columns
df.head()

Unnamed: 0_level_0,Chr,Start,End,Strand,Length,batch_1,batch_2,batch_3,chemostat_1,chemostat_2,chemostat_3
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
gene_0001,NC_001133.9,1807,2169,-,363,0,2,6,0,0,1
gene_0002,NC_001133.9,2480,2707,+,228,0,0,0,0,0,0
gene_0003,NC_001133.9,7235,9016,-,1782,0,0,0,0,0,0
gene_0004,NC_001133.9,11565,11951,-,387,0,0,0,0,0,0
gene_0005,NC_001133.9,12046,12426,+,381,2,8,10,6,7,18


## 課題２ ミトコンドリア上の遺伝子のカウントデータを除く  
df.Chrの値が NC_001224.1 に一致するものがミトコンドリアに該当するので、boolean indexingを使って除く。

In [11]:
# Keep only the rows whose Chr value differs from NC_001224.1 and assign them back to df
df = df.loc[df["Chr"] != 'NC_001224.1']

In [12]:
# If the filtering worked, 6394 rows should remain
df.shape

(6394, 11)

## 課題３　pd.read_tableを使ってgene_id_productを読み込む
1列目をインデックスとして用いること。ヘッダー行がないので、 `names=["gene_id", "product"]`オプションを指定して読み込むこと。  
データフレーム名は `gene_products` とすること。

`gene_products = pd.read_table(.....)`


In [13]:
# Fill in (.....) with the required options and run.
# The column names are supplied through names; index_col=0 makes the first
# column (gene_id) the index.
gene_products = pd.read_table(gene_id_product_file, index_col=0, names=['gene_id', 'product'])

In [14]:
# Check that the table was read correctly
gene_products.head()

Unnamed: 0_level_0,product
gene_id,Unnamed: 1_level_1
gene_0001,seripauperin PAU8
gene_0002,hypothetical protein
gene_0003,putative permease SEO1
gene_0004,hypothetical protein
gene_0005,hypothetical protein


## 課題４　gene_productsにdfを連結する  
インデックス列を使って接続するので `join()` を使うのが良い。 <br> 
データフレーム名はdf_with_productとすること。<br>

`df_with_product = gene_products.join(.....)`


In [15]:
# Fill in (.....) with the required options and run.
# join matches rows on the index; with the default how='left' the result
# keeps the rows of gene_products.
df_with_product = gene_products.join(df)

本来であれば rRNA　にマップされたリードは無視したいので除く必要がある。  
gene_products には mRNA のデータしか含まれていないので、rRNA などのデータはこの時点で除かれる。  
`pd.merge()`や`pd.concat()`を用いてもよい。

In [16]:
# Check the joined table
df_with_product.head()

Unnamed: 0_level_0,product,Chr,Start,End,Strand,Length,batch_1,batch_2,batch_3,chemostat_1,chemostat_2,chemostat_3
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
gene_0001,seripauperin PAU8,NC_001133.9,1807,2169,-,363,0,2,6,0,0,1
gene_0002,hypothetical protein,NC_001133.9,2480,2707,+,228,0,0,0,0,0,0
gene_0003,putative permease SEO1,NC_001133.9,7235,9016,-,1782,0,0,0,0,0,0
gene_0004,hypothetical protein,NC_001133.9,11565,11951,-,387,0,0,0,0,0,0
gene_0005,hypothetical protein,NC_001133.9,12046,12426,+,381,2,8,10,6,7,18


__カウントデータ部分のみを切り出し__<br>

In [17]:
# Select only the six count columns by name
df_count = df_with_product[["batch_1","batch_2","batch_3","chemostat_1","chemostat_2","chemostat_3"]]
# Equivalent selection by position: df_with_product.iloc[:, 6:12]

__ファイルの保存__<br>
あらかじめ、ノートブックの1つ上の階層に output フォルダ（`../output`）を作成しておく。<br>
`df_with_product`をoutputフォルダにcount_preprocessed.tsvとして保存する。<br>
`df_count`をoutputフォルダにcount_raw.tsvとして保存する。<br>
`df.to_csv(sep="\t")`を用いる。

In [18]:
# Save the annotated count data as a tab-separated file.
# Fill in (.....) with the required options and run.
df_with_product.to_csv('../output/count_preprocessed.tsv', sep='\t')

In [19]:
# Save the raw count data as a tab-separated file.
# Fill in (.....) with the required options and run.
df_count.to_csv('../output/count_raw.tsv', sep='\t')

In [20]:
%%bash
# Inspect the saved file with the shell command head.
# The notebook writes count_raw.tsv to ../output/, so read it from the same place.
head ../output/count_raw.tsv

gene_id	batch_1	batch_2	batch_3	chemostat_1	chemostat_2	chemostat_3
gene_0001	0	2	6	0	0	1
gene_0002	0	0	0	0	0	0
gene_0003	0	0	0	0	0	0
gene_0004	0	0	0	0	0	0
gene_0005	2	8	10	6	7	18
gene_0006	0	0	0	0	0	0
gene_0007	0	0	0	0	0	0
gene_0008	0	0	0	0	0	0
gene_0009	32	37	33	43	63	84


## 課題５　リード数による正規化（FPM/RPM）
RPM = reads per million <br>
カウントを100万リードあたりのカウント数に揃え, 正規化する。<br>
カウントデータをコピーした`df_tmp`を使う

In [21]:
# Copy the count data into a separate data frame
df_tmp = df_count.copy()

In [23]:
# Total read count of each column
sum_count = df_tmp.sum()
# Multiply the counts by one million, then divide each column by its total
df_tmp = df_tmp.mul(10**6).div(sum_count, axis="columns")

In [24]:
# If the processing is correct, the output should look like the one below.
# Confirm that every column now sums to one million.
df_tmp.sum()

batch_1        1000000.0
batch_2        1000000.0
batch_3        1000000.0
chemostat_1    1000000.0
chemostat_2    1000000.0
chemostat_3    1000000.0
dtype: float64

上記の処理を関数化しておく。

In [25]:
def normalize_per_million_reads(df):
    """Rescale every column of ``df`` so that it sums to one million.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table; each column is normalized independently.

    Returns
    -------
    pandas.DataFrame
        A new frame with each value multiplied by 10**6 and divided by the
        sum of its column. ``df`` itself is not modified.
    """
    column_totals = df.sum()
    scaled = df.mul(10**6)
    # Dividing along the columns axis matches each total to its own column.
    return scaled.div(column_totals, axis="columns")

In [26]:
# Apply the function to the raw counts
df_count_fpm = normalize_per_million_reads(df_count)

In [27]:
# Check the column totals
df_count_fpm.sum()

batch_1        1000000.0
batch_2        1000000.0
batch_3        1000000.0
chemostat_1    1000000.0
chemostat_2    1000000.0
chemostat_3    1000000.0
dtype: float64

In [28]:
# Save the FPM-normalized result as a tab-separated file
df_count_fpm.to_csv("../output/count_fpm.tsv", sep="\t")

## 課題６　遺伝子長による正規化 (RPKM/FPKM)
FPKM = fragments per kilobase of exon per million reads mapped<br>
上で求めたFPMをさらに遺伝子長で割って1000をかければ良い。<br>
以下に示す1~4のいずれかの方法を用いること。

In [36]:
# Copy the data frame so the experiments below do not touch df_count_fpm
df_tmp = df_count_fpm.copy()

In [30]:
# Length column of the joined table; it shares the index of df_with_product
gene_length = df_with_product["Length"]

__1. for ループを使う方法__

データフレームをforループで回すと、列名が取得できるのでそれを利用する。

In [37]:
# Iterating over a DataFrame yields its column labels.
for col_name in df_tmp:
    # Divide the column by the gene lengths and multiply by 1000
    df_tmp[col_name] = df_tmp[col_name] / gene_length * 10**3

In [38]:
# Check the result
df_tmp.head()

Unnamed: 0_level_0,batch_1,batch_2,batch_3,chemostat_1,chemostat_2,chemostat_3
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene_0001,0.0,0.881103,3.653609,0.0,0.0,0.561926
gene_0002,0.0,0.0,0.0,0.0,0.0,0.0
gene_0003,0.0,0.0,0.0,0.0,0.0,0.0
gene_0004,0.0,0.0,0.0,0.0,0.0,0.0
gene_0005,1.149909,3.357905,5.801662,5.123019,4.74806,9.636806


または`items()`を用いて（`iteritems()`は pandas 2.0 で削除された）

In [39]:
# Start again from a fresh copy of the FPM values
df_tmp = df_count_fpm.copy()
# DataFrame.items() yields (column label, column Series) pairs.
# DataFrame.iteritems() was removed in pandas 2.0, so items() is used here.
for col_name, col in df_tmp.items():
    df_tmp[col_name] = col / gene_length * 10**3

In [40]:
# Check the result
df_tmp.head()

Unnamed: 0_level_0,batch_1,batch_2,batch_3,chemostat_1,chemostat_2,chemostat_3
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene_0001,0.0,0.881103,3.653609,0.0,0.0,0.561926
gene_0002,0.0,0.0,0.0,0.0,0.0,0.0
gene_0003,0.0,0.0,0.0,0.0,0.0,0.0
gene_0004,0.0,0.0,0.0,0.0,0.0,0.0
gene_0005,1.149909,3.357905,5.801662,5.123019,4.74806,9.636806


__2. データフレームを転置してから計算する方法__

データフレームの転置は`df.T`を使う。

In [34]:
# Copy the FPM values for testing
df_tmp = df_count_fpm.copy()
# Transpose df_tmp, divide the FPM values by the gene lengths, and multiply by 1000
df_tmp = df_tmp.T / gene_length * 10**3
# Transpose once more to restore the original orientation
df_tmp = df_tmp.T

In [35]:
# Check the result
df_tmp.head()

Unnamed: 0_level_0,batch_1,batch_2,batch_3,chemostat_1,chemostat_2,chemostat_3
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene_0001,0.0,0.881103,3.653609,0.0,0.0,0.561926
gene_0002,0.0,0.0,0.0,0.0,0.0,0.0
gene_0003,0.0,0.0,0.0,0.0,0.0,0.0
gene_0004,0.0,0.0,0.0,0.0,0.0,0.0
gene_0005,1.149909,3.357905,5.801662,5.123019,4.74806,9.636806


__3. applyを使い各列に関数を適用する方法__

In [41]:
# First define a function that takes one column and divides each element by the gene length
def divide_by_length(S):
    """Divide ``S`` by the module-level ``gene_length`` and multiply by 1000.

    ``S`` is the object handed over by ``DataFrame.apply`` (one column at a
    time when apply is used with its default axis).
    """
    per_base = S / gene_length
    return per_base * 10**3

In [43]:
# Copy the FPM values for testing
df_tmp = df_count_fpm.copy()
# Apply the function (the result is only displayed, df_tmp is not reassigned)
df_tmp.apply(divide_by_length).head()

Unnamed: 0_level_0,batch_1,batch_2,batch_3,chemostat_1,chemostat_2,chemostat_3
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene_0001,0.0,0.881103,3.653609,0.0,0.0,0.561926
gene_0002,0.0,0.0,0.0,0.0,0.0,0.0
gene_0003,0.0,0.0,0.0,0.0,0.0,0.0
gene_0004,0.0,0.0,0.0,0.0,0.0,0.0
gene_0005,1.149909,3.357905,5.801662,5.123019,4.74806,9.636806


__4. データフレームメソッドのdivideを使用する方法__

In [44]:
# Approach using the DataFrame method divide
df_tmp = df_count_fpm.copy()
# axis='index' matches gene_length against the row index
df_tmp = df_tmp.divide(gene_length, axis='index') * 10**3

In [45]:
# Check the result
df_tmp.head()

Unnamed: 0_level_0,batch_1,batch_2,batch_3,chemostat_1,chemostat_2,chemostat_3
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene_0001,0.0,0.881103,3.653609,0.0,0.0,0.561926
gene_0002,0.0,0.0,0.0,0.0,0.0,0.0
gene_0003,0.0,0.0,0.0,0.0,0.0,0.0
gene_0004,0.0,0.0,0.0,0.0,0.0,0.0
gene_0005,1.149909,3.357905,5.801662,5.123019,4.74806,9.636806


1~4のいずれかの方法を関数化して計算する(解答例はデータフレームを転置させて計算する方法)

In [46]:
def normalize_per_kilobase(df, gene_length):
    """Scale the values of ``df`` to a per-kilobase value for each gene.

    Every value is multiplied by 1000 and divided by the length of its gene.
    The frame is transposed so that the row labels become column labels,
    which lets the division match ``gene_length`` to the genes by label, and
    the result is transposed back.

    Parameters
    ----------
    df : pandas.DataFrame
        Values with one row per gene.
    gene_length : pandas.Series
        Gene lengths, labelled like the rows of ``df``.

    Returns
    -------
    pandas.DataFrame
        A new frame in the orientation of ``df``; ``df`` is not modified.
    """
    # No defensive copy is needed: the arithmetic below already builds a new frame.
    return (df.T * 10**3 / gene_length).T

In [47]:
# Apply the per-kilobase scaling to the FPM values
df_count_fpkm = normalize_per_kilobase(df_count_fpm, gene_length)

In [48]:
# Check the result
df_count_fpkm.head()

Unnamed: 0_level_0,batch_1,batch_2,batch_3,chemostat_1,chemostat_2,chemostat_3
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene_0001,0.0,0.881103,3.653609,0.0,0.0,0.561926
gene_0002,0.0,0.0,0.0,0.0,0.0,0.0
gene_0003,0.0,0.0,0.0,0.0,0.0,0.0
gene_0004,0.0,0.0,0.0,0.0,0.0,0.0
gene_0005,1.149909,3.357905,5.801662,5.123019,4.74806,9.636806


In [60]:
# Save the result as a tab-separated file
df_count_fpkm.to_csv("../output/count_fpkm.tsv", sep="\t")

## TPM 正規化
TPM = transcripts per kilobase million  
TPM の説明については以下のページが詳しい https://bi.biopapyrus.jp/  
FPKM/RPKM のときとは逆に、長さ1,000bpあたりのリード数を求めてから、総リード数を100万に揃えれば良い。

In [49]:
# Start from a copy of the raw counts
df_tmp = df_count.copy()

In [50]:
# Per-kilobase scaling first ...
df_tmp = normalize_per_kilobase(df_tmp, gene_length)
# ... then rescale each column to one million
df_tmp = normalize_per_million_reads(df_tmp)

In [51]:
# Check the result
df_tmp.head()

Unnamed: 0_level_0,batch_1,batch_2,batch_3,chemostat_1,chemostat_2,chemostat_3
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene_0001,0.0,0.734587,3.129839,0.0,0.0,0.50481
gene_0002,0.0,0.0,0.0,0.0,0.0,0.0
gene_0003,0.0,0.0,0.0,0.0,0.0,0.0
gene_0004,0.0,0.0,0.0,0.0,0.0,0.0
gene_0005,0.94849,2.799529,4.969954,4.689762,4.372026,8.657291


In [52]:
# Unlike RPKM/FPKM, every column sums to one million.
df_tmp.sum()

batch_1        1000000.0
batch_2        1000000.0
batch_3        1000000.0
chemostat_1    1000000.0
chemostat_2    1000000.0
chemostat_3    1000000.0
dtype: float64

In [53]:
# Wrap the two steps in a function
def normalize_tpm(df, gene_length):
    """Apply per-kilobase scaling and then per-million scaling to ``df``.

    A copy of ``df`` is passed to ``normalize_per_kilobase`` together with
    ``gene_length``; the result of that call is passed to
    ``normalize_per_million_reads`` and returned.
    """
    per_kilobase = normalize_per_kilobase(df.copy(), gene_length)
    return normalize_per_million_reads(per_kilobase)

In [54]:
# Compute TPM from the raw counts
df_count_tpm = normalize_tpm(df_count, gene_length)

In [55]:
df_count_tpm.sum()  # check the column totals

batch_1        1000000.0
batch_2        1000000.0
batch_3        1000000.0
chemostat_1    1000000.0
chemostat_2    1000000.0
chemostat_3    1000000.0
dtype: float64

In [56]:
# Save the result as a tab-separated file
df_count_tpm.to_csv("../output/count_tpm.tsv", sep="\t")

## 課題7　発現変動遺伝子を抽出する
以下のセルを実行し, 各条件の平均から発現変動をlog2 foldとして求める。

In [57]:
# Mean of the three batch-culture replicates
df_count_tpm["batch"] = (
    df_count_tpm["batch_1"] + df_count_tpm["batch_2"] + df_count_tpm["batch_3"]
) / 3

# Mean of the three chemostat replicates
df_count_tpm["chemostat"] = (
    df_count_tpm["chemostat_1"] + df_count_tpm["chemostat_2"] + df_count_tpm["chemostat_3"]
) / 3

# Expression change as a log2 fold change (chemostat relative to batch).
# A tiny value is added to the denominator to avoid division by zero.
# The numerator is left as is, so a chemostat mean of 0 gives -inf after the logarithm.
fold_change = df_count_tpm["chemostat"] / (df_count_tpm["batch"] + 10**-6)
df_count_tpm["log2fold"] = fold_change.apply(np.log2)

In [58]:
# Check the new columns
df_count_tpm.head()

Unnamed: 0_level_0,batch_1,batch_2,batch_3,chemostat_1,chemostat_2,chemostat_3,batch,chemostat,log2fold
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
gene_0001,0.0,0.734587,3.129839,0.0,0.0,0.50481,1.288142,0.16827,-2.936443
gene_0002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-inf
gene_0003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-inf
gene_0004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-inf
gene_0005,0.94849,2.799529,4.969954,4.689762,4.372026,8.657291,2.905991,5.906359,1.023238


`df_count_tpm` から"batch", "chemostat", "log2fold"の列を抜き出し`diff_ex`とする

In [59]:
# Fill in [.......] with the list of columns to extract and run
diff_ex = df_count_tpm[['batch', 'chemostat', 'log2fold']]

`diff_ex.join()` でproductと結合,productとgene_idの対応は`gene_products`を使う

In [60]:
# Attach the product column with join()
diff_ex = diff_ex.join(gene_products)

In [61]:
# Check the joined table
diff_ex.head()

Unnamed: 0_level_0,batch,chemostat,log2fold,product
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gene_0001,1.288142,0.16827,-2.936443,seripauperin PAU8
gene_0002,0.0,0.0,-inf,hypothetical protein
gene_0003,0.0,0.0,-inf,putative permease SEO1
gene_0004,0.0,0.0,-inf,hypothetical protein
gene_0005,2.905991,5.906359,1.023238,hypothetical protein


In [62]:
# Drop the rows whose batch or chemostat value is not greater than 0
is_expressed = (diff_ex["batch"] > 0) & (diff_ex["chemostat"] > 0)
diff_ex = diff_ex[is_expressed]

In [63]:
# Check the filtered table
diff_ex.head()

Unnamed: 0_level_0,batch,chemostat,log2fold,product
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gene_0001,1.288142,0.16827,-2.936443,seripauperin PAU8
gene_0005,2.905991,5.906359,1.023238,hypothetical protein
gene_0009,1.424695,3.627256,1.348225,flocculin FLO9
gene_0010,8.211912,155.846213,4.246261,glutamate dehydrogenase (NADP(+)) GDH3
gene_0011,14.270889,155.994499,3.450348,putative dehydrogenase BDH2


log2foldの列を降順に並び替える（diff_ex.sort_valuesを使う)

In [64]:
# Fill in (.....) with the required options and run.
# ascending=False sorts log2fold in descending order.
diff_ex = diff_ex.sort_values('log2fold', ascending=False)

In [65]:
# Show the top five rows for chemostat > batch (largest log2fold first)
diff_ex.head()

Unnamed: 0_level_0,batch,chemostat,log2fold,product
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gene_2989,0.428753,1469.08209,11.742478,Rgi2p
gene_4740,3.101195,5075.124519,10.676403,Sip18p
gene_4667,4.944971,4658.135852,9.879575,Spg4p
gene_4237,0.96131,708.223065,9.524985,hypothetical protein
gene_5965,7.232487,5295.81244,9.516144,Gre1p


In [66]:
# Show the five rows with the strongest batch > chemostat change (end of the descending sort)
diff_ex.tail()

Unnamed: 0_level_0,batch,chemostat,log2fold,product
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gene_0314,10.570903,1.105484,-3.257349,ADP/ATP carrier protein AAC3
gene_2429,2.082786,0.181792,-3.518158,hypothetical protein
gene_2725,784.525221,66.820686,-3.553453,hexose transporter HXT4
gene_5487,77.913513,6.244048,-3.64132,hypothetical protein
gene_1320,812.668223,46.017423,-4.142414,hexose transporter HXT3
