# Read Data

Read in data from pickle files and transform to CSV.

In [30]:
# Load the four pickled edge lists: negative/positive edges for the test and
# train splits. pd.read_pickle returns whatever object was pickled.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.

import pandas as pd
import numpy as np

# file names listed in the same order as the variables they are unpacked into
pickle_files = (
    'test_neg.pickle',
    'test_pos.pickle',
    'train_neg.pickle',
    'train_pos.pickle',
)
test_neg, test_pos, train_neg, train_pos = map(pd.read_pickle, pickle_files)

Convert the test edge lists, represented as tuples, to labelled DataFrames and combine them into a single test set.

In [31]:
# Wrap each edge tuple in a DataFrame, transpose it, and tag every row with
# its class label (0 = negative edge, 1 = positive edge).
test_neg = pd.DataFrame(test_neg).T.assign(label=0)
test_pos = pd.DataFrame(test_pos).T.assign(label=1)

# Stack negatives on top of positives and renumber the rows from 0
test = pd.concat(
    [test_neg, test_pos],
    ignore_index=True,
)

# Report the size of the combined set, then preview its first rows
print('Test Dataset:')
print(test.shape)
test.head()

Test Dataset:
(4070, 3)


Unnamed: 0,0,1,label
0,101,3851,0
1,138,2034,0
2,56,1450,0
3,104,2126,0
4,112,3519,0


Save data as CSV.

In [27]:
# Persist the combined test set; the row index is left out of the file
test.to_csv(path_or_buf='test_dataset.csv', index=False)

# Read DREAM5

In [32]:
# Load the DREAM5 E. coli tables. Each file is read as tab-separated text with
# header=None, so pandas labels the columns with integers starting at 0.
tsv_options = {'sep': '\t', 'header': None}
ecoli_data = pd.read_csv('../DREAM5/Ecoli/ecoli_data.tsv', **tsv_options)
ecoli_gene_names = pd.read_csv('../DREAM5/Ecoli/ecoli_gene_names.tsv', **tsv_options)
ecoli_tf_names = pd.read_csv('../DREAM5/Ecoli/ecoli_tf_names.tsv', **tsv_options)
ecoli_experiments = pd.read_csv('../DREAM5/Ecoli/ecoli_experiments.tsv', **tsv_options)

# Put the gene names in front of the data as a leading 'Gene' column
ecoli_data.insert(loc=0, column='Gene', value=ecoli_gene_names[0])

# TODO: column names from ecoli_experiments are not applied yet

Sanity check: transcription factors (TFs) should appear in the list of gene names.

In [33]:
# Sanity check: every transcription factor (TF) name should also be a gene name.
# The earlier version built tf_names_list but only tested the single hard-coded
# name 'aaeR'; this checks all TF names.
tf_names_list = ecoli_tf_names[0].to_list()
gene_names_list = ecoli_gene_names[0].to_list()

# a set gives constant-time membership tests instead of scanning the list per TF
gene_name_set = set(gene_names_list)
missing_tfs = [tf for tf in tf_names_list if tf not in gene_name_set]

# True when no TF is missing; inspect missing_tfs to see any offenders
not missing_tfs

True

Save to CSV.

In [None]:
# Write each DREAM5 table to its own CSV file, leaving out the row index
for frame, csv_name in (
    (ecoli_data, 'ecoli_data.csv'),
    (ecoli_gene_names, 'ecoli_gene_names.csv'),
    (ecoli_tf_names, 'ecoli_tf_names.csv'),
):
    frame.to_csv(csv_name, index=False)

# Read Weight Matrices

First, read in weight matrices computed in `benchmarking.Rmd`.

In [16]:
def _read_weight_matrix(csv_path):
    """Load one weight-matrix CSV and index it by its first column.

    The first column is relabelled 'Gene' and becomes the index; the first
    row is then dropped, matching what the earlier per-method code did.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'Gene' with its first row removed.
    """
    matrix = pd.read_csv(csv_path)
    # Assign a new column index instead of writing into `columns.values`:
    # mutating an Index's backing array in place is unsupported and can leave
    # its cached label lookups stale.
    matrix.columns = ['Gene', *matrix.columns[1:]]
    matrix = matrix.set_index('Gene')
    # NOTE(review): this drops the row(s) labelled with the first index value,
    # i.e. the first data row -- not a column (set_index already removed the
    # 'Gene' column). Kept as-is; confirm the first row really should be dropped.
    return matrix.drop(matrix.index[0])


# one weight matrix per inference method
aracne = _read_weight_matrix('aracne_weight_matrix.csv')
clr = _read_weight_matrix('clr_weight_matrix.csv')
mrnet = _read_weight_matrix('mrnet_weight_matrix.csv')
genie3 = _read_weight_matrix('genie3_weight_matrix.csv')

Read in test dataset.

In [22]:
# Reload the combined test edges and the gene names from their CSV files
test = pd.read_csv('test_dataset.csv')
test_names = pd.read_csv('ecoli_gene_names.csv')

# Label the first column 'Gene' by assigning a new column index; writing into
# `columns.values` mutates the Index's backing array in place, which is
# unsupported and can leave cached label lookups stale.
test_names.columns = ['Gene', *test_names.columns[1:]]

test.head()

Unnamed: 0,0,1,label
0,101,3851,0
1,138,2034,0
2,56,1450,0
3,104,2126,0
4,112,3519,0


In [None]:
for index, row in test.iterrows():

    gene_0 = row['0']
    gene_1 = row['1']