In [1]:
import os
import pandas as pd
import numpy as np
import random
import pickle
from sklearn.decomposition import PCA
from functions import utils

base_dirs = [os.path.join(os.path.dirname(os.getcwd()), 'data'),
             os.path.join(os.path.dirname(os.getcwd()), 'encoded'),
             os.path.join(os.path.dirname(os.getcwd()), 'models'),
             os.path.join(os.path.dirname(os.getcwd()), 'output'),
             os.path.join(os.path.dirname(os.getcwd()), 'stats'),
             os.path.join(os.path.dirname(os.getcwd()), 'viz') 
            ]
             
analysis_name = 'sim_1_test'
             
model_dir = os.path.join(base_dirs[2], analysis_name)
             
model_file = os.path.join(model_dir, "pca_model.pkl")

data_dir = os.path.join(base_dirs[0], analysis_name)

gene_id = 'PA0996'

percent_low = 5
percent_high = 95

In [2]:
# Load arguments
target_gene_file = os.path.join(data_dir, gene_id + ".txt")
non_target_gene_file = os.path.join(data_dir, "train_model_input.txt.xz")

# Read in data
target_gene_data = pd.read_table(target_gene_file, header=0, index_col=0)
non_target_gene_data = pd.read_table(non_target_gene_file, header=0, index_col=0)
    
# Sort target gene data by expression (lowest --> highest)
target_gene_sorted = target_gene_data.sort_values(by=[gene_id])

# Collect the extreme gene expressions
[low_ids, high_ids] = utils.get_gene_expression_above_percent(target_gene_sorted, gene_id, percent_low, percent_high)
low_exp = non_target_gene_data.loc[low_ids]    
high_exp = non_target_gene_data.loc[high_ids]

print('Number of genes in low expression group is {}'.format(low_exp.shape))
print('Number of gene in high expression group is {}'.format(high_exp.shape))
    
# Load pca model
infile = open(model_file,'rb')
pca = pickle.load(infile)
infile.close()
    
# Transform data using loaded model
low_exp_encoded = pca.transform(low_exp)
high_exp_encoded = pca.transform(high_exp)

low_exp_encoded_df = pd.DataFrame(low_exp_encoded, index=low_exp.index)
high_exp_encoded_df = pd.DataFrame(high_exp_encoded, index=high_exp.index)

Number of genes in low expression group is (60, 5548)
Number of gene in high expression group is (60, 5548)


In [3]:
low_exp_encoded_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
GSM1027584_062508WT.CEL,-1.23981,4.142039,-1.181063,-0.524478,-1.7006,-2.283038,0.753686,4.296769,-3.431478,-0.335385
GSM155245.CEL,-9.552384,-4.585051,0.538678,-0.781862,1.493007,-1.711524,-0.537629,1.407204,-0.202147,2.083603
GSM663170_Pa_label_cDNA_5-13-08_s2.CEL,-4.246457,0.57212,-1.295016,0.122188,0.517214,-0.956507,-0.958408,3.20101,-1.734832,1.269201
GSM261938.CEL,-4.019179,-0.759127,1.548159,2.454012,-3.594731,0.879994,0.564567,0.359141,-0.96098,1.053802
GSM567543.CEL,-6.839532,2.191288,0.90177,-0.426709,-2.21677,0.158207,1.18034,2.982603,-0.382269,-2.309257
GSM155242.CEL,-8.994454,-3.016537,-0.324655,-1.780123,0.461991,-0.314005,-0.34353,1.905706,0.011704,2.461801
GSM1133187_Weiqing_He_L-Arg-1_021810.CEL,-5.830278,0.618156,0.525469,-1.482811,-1.463283,-2.434372,0.854942,4.380904,-0.827657,-1.871059
GSM155243.CEL,-7.619286,-1.199118,-0.175419,-1.581557,0.461748,-0.58336,-0.553898,0.89054,2.149239,1.861955
GSM1133190_Weiqing_He_L-Arg-2_022410.CEL,-7.854855,-2.015394,1.858741,0.04861,0.069556,-3.753073,-0.416232,4.850582,-1.387167,-2.657191
GSM2026894_L-Glu.CEL,-7.519476,-0.181896,0.737885,-1.299594,0.008476,-2.27207,0.120935,4.916159,-0.610686,-2.041694


In [4]:
high_exp_encoded_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
GSM208608.CEL,-4.286471,-1.805134,0.285426,0.091575,-1.162462,1.833304,3.216135,-0.474109,2.75194,-2.822914
GSM898001_MHH0122_Pae_G1a.CEL,0.028494,6.572528,0.951587,1.87118,-1.591663,1.582983,-1.649471,-0.947124,-1.169091,-1.759486
GSM541648.CEL,0.028494,6.572528,0.951587,1.87118,-1.591663,1.582983,-1.649471,-0.947124,-1.169091,-1.759486
GSM685447.CEL,7.054826,-1.222481,7.14062,-3.106754,-1.536508,-0.431685,-0.60003,0.970543,3.210231,0.103496
GSM208602.CEL,-3.5561,1.812016,-1.478425,-0.730187,-2.349115,4.091522,1.849317,-1.178234,2.430937,-1.027437
GSM334311.CEL,-7.902443,-3.489353,0.221178,-1.94851,1.046786,0.371547,-2.713069,1.225904,-0.316609,-0.64198
GSM2055820_7629_A+_20130705.CEL,-8.757856,-2.698643,-0.397148,-2.76024,-0.308852,0.230936,-0.470907,0.181187,0.525946,1.668347
GSM954576_Nomura_PA01-1_Pae_G1a_.CEL,5.008757,5.264418,-0.25385,0.175152,-1.152484,-2.632444,-1.998398,-0.93673,-2.730594,-1.973668
GSM1267088_HZI1951_Pae_G1a.CEL,-1.574313,3.211965,-1.751967,0.513246,2.268577,-0.049955,-4.086949,1.201664,-0.037004,-1.147886
GSM637596.CEL,7.59161,-2.383643,7.595589,-3.676093,-2.632753,1.41427,-2.564825,0.696258,0.946718,1.58127


In [6]:
# Average the gene expression transformed
lowest_mean = low_exp_encoded_df.mean(axis=0)
highest_mean = high_exp_encoded_df.mean(axis=0)

# Generate offset using average gene expression in original dataset
offset_latent_space = highest_mean - lowest_mean
offset_latent_space_df = pd.Series.to_frame(offset_latent_space).T
    
offset_latent_space_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.712752,-0.117266,0.784392,-0.541569,-0.307746,2.053757,-1.499955,-1.64421,0.292373,0.044598


In [7]:
# output lowest and highest expressing samples
low_exp_encoded_df.to_csv(lowest_file, sep='\t', float_format="%.5g")
high_exp_encoded_df.to_csv(highest_file, sep='\t', float_format="%.5g")

# ouput gene space offset vector
offset_latent_space_df.to_csv(offset_file, sep='\t', float_format="%.5g")

NameError: name 'lowest_file' is not defined