In [114]:
import os
import warnings

import numpy as np
import pandas as pd

In [115]:
# Load the up/down encoded expression table and preview its first rows
encoded_vector_csv = '../../data/gene_expression_data/up_down_encoded_vector.csv'
df = pd.read_csv(encoded_vector_csv)
df.head()

Unnamed: 0,DRUGS,IGF1R,RPS6,FBXO21,PLSCR1,PSME1,NFE2L2,MRPL19,AURKA,PLSCR3,...,TFAP2A,NPRL2,USP14,RPS6KA1,BNIP3,SATB1,PPARG,CSNK2A2,NFKBIB,WDR7
0,2-iodomelatonin,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2-phenylmelatonin,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5-methoxytryptamine,0,0,1,0,0,0,0,1,1,...,-1,-1,0,1,0,-1,-1,1,1,0
3,6-chloromelatonin,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,acalabrutinib,0,0,1,0,0,-1,0,0,-1,...,0,0,0,-1,0,0,-1,0,0,-1


In [116]:
# Pull the DRUGS column out of the encoded table as a plain Python list
drug_names = df['DRUGS'].tolist()
drug_names

['2-iodomelatonin',
 '2-phenylmelatonin',
 '5-methoxytryptamine',
 '6-chloromelatonin',
 'acalabrutinib',
 'acedapsone',
 'acemetacin',
 'acipimox',
 'adapalene',
 'agomelatine',
 'alectinib',
 'alisertib',
 'allantoin',
 'altretamine',
 'amifostine',
 'aminogenistein',
 'aminoglutethimide',
 'aminophylline',
 'amisulpride',
 'amlexanox',
 'amlodipine',
 'amonafide',
 'amoxapine',
 'ampiroxicam',
 'amsacrine',
 'amuvatinib',
 'anastrozole',
 'androsterone',
 'aniracetam',
 'anisodamine',
 'apixaban',
 'aripiprazole',
 'aspirin',
 'ataluren',
 'atenolol',
 'auranofin',
 'avanafil',
 'azacyclonol',
 'azaperone',
 'baclofen',
 'balsalazide',
 'baricitinib',
 'batimastat',
 'bemegride',
 'benzbromarone',
 'benzocaine',
 'bergapten',
 'beta-alanine',
 'betaxolol',
 'bexarotene',
 'bezafibrate',
 'bicalutamide',
 'bifonazole',
 'bindarit',
 'bisacodyl',
 'bosentan',
 'bosutinib',
 'brexpiprazole',
 'brimonidine',
 'broxyquinoline',
 'budesonide',
 'bufexamac',
 'bumetanide',
 'busulfan',
 'b

In [117]:
# Plan: for each drug, build a 978x3 matrix, then stack the per-drug matrices.
# The final tensor then has shape 475x978x3.

In [118]:
# Load the landmark gene symbols; the CSV has no header row, so the
# symbols end up in the column labelled 0
landmark_genes_csv = '../../data/landmark_genes.csv'
df_landmark_genes = pd.read_csv(landmark_genes_csv, header=None)
landmark_genes = df_landmark_genes[0].tolist()
landmark_genes

['INSIG1',
 'FOXO3',
 'CDH3',
 'ORC1',
 'ITGAE',
 'CDK7',
 'FOS',
 'CDK6',
 'CDK4',
 'CDK2',
 'ITGB5',
 'CDKN2A',
 'CDKN1B',
 'CDKN1A',
 'FPGS',
 'CEBPD',
 'CEBPA',
 'CENPE',
 'MICALL1',
 'CETN3',
 'UBE2C',
 'DUSP14',
 'TOPBP1',
 'B4GAT1',
 'RBM34',
 'PAPD7',
 'WDTC1',
 'XPO7',
 'AKAP8L',
 'RAB31',
 'SPEN',
 'FBXO21',
 'RAB21',
 'SLC27A3',
 'KIF2C',
 'CCDC85B',
 'TLK2',
 'KDELR2',
 'MCUR1',
 'CDK19',
 'JUN',
 'TBC1D9B',
 'PRSS23',
 'RRP1B',
 'MYCBP2',
 'CHEK1',
 'KCNK1',
 'CHN1',
 'CIRBP',
 'HDGFRP3',
 'COPS7A',
 'NSDHL',
 'ATP11B',
 'PRAF2',
 'JMJD6',
 'RRS1',
 'MPC2',
 'ARID5B',
 'POLG2',
 'CHEK2',
 'BACE2',
 'COG4',
 'SPDEF',
 'BAMBI',
 'SPRED2',
 'SLC2A6',
 'SNX13',
 'NISCH',
 'AARS',
 'ABCF1',
 'ABL1',
 'FCHO1',
 'DCUN1D4',
 'PSIP1',
 'ACAA1',
 'ACAT2',
 'ACLY',
 'PKIG',
 'CORO1A',
 'MAST2',
 'GPATCH8',
 'FBXO7',
 'LSM6',
 'PWP1',
 'KIF5C',
 'KIT',
 'FUT1',
 'FYN',
 'GAA',
 'CLTC',
 'CLTB',
 'SLC37A4',
 'GABPB1',
 'KTN1',
 'GALE',
 'GAPDH',
 'COL4A1',
 'COL1A1',
 'TRIM2',
 'TSKU',

In [119]:
# Working with first drug
drug = drug_names[0]

# Set the upregulated and downregulated gene expression paths
# NOTE(review): up_path contains an "up/" directory level but down_path has no
# matching "down/" level -- confirm both mirror the on-disk layout.
up_path = "../../data/gene_expression_data/up/up_genes_output"
down_path = "../../data/gene_expression_data/down_genes_output"


def load_drug_gene_file(directory, drug_name):
    """Read the header-less CSV in ``directory`` whose filename contains ``drug_name``.

    Parameters
    ----------
    directory : str
        Folder to search (not recursive).
    drug_name : str
        Drug identifier; a file is a candidate when this string occurs
        anywhere in its filename.

    Returns
    -------
    pandas.DataFrame
        Contents of the selected file, read with ``header=None``.

    Raises
    ------
    FileNotFoundError
        If ``directory`` does not exist or no filename contains ``drug_name``.
    """
    # os.listdir returns entries in arbitrary order; sort so that the choice
    # below is reproducible from run to run.
    matches = sorted(name for name in os.listdir(directory) if drug_name in name)

    if not matches:
        # Fail here instead of leaving the result unassigned (or stale from an
        # earlier run of the cell).
        raise FileNotFoundError(
            f"No file containing {drug_name!r} found in {directory!r}"
        )

    if len(matches) > 1:
        # Substring matching may also pick up files that belong to a drug
        # whose name merely contains drug_name, so make the ambiguity visible.
        warnings.warn(
            f"{len(matches)} files in {directory!r} match {drug_name!r}: "
            f"{matches}; using {matches[-1]!r}",
            stacklevel=2,
        )

    return pd.read_csv(os.path.join(directory, matches[-1]), header=None)


# Load the up- and down-regulated gene lists for the selected drug
df_up = load_drug_gene_file(up_path, drug)
df_down = load_drug_gene_file(down_path, drug)

In [120]:
# Display the up-regulated gene frame loaded for the selected drug
df_up

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,241,242,243,244,245,246,247,248,249,250
0,2-iodomelatonin,GUCY1B1,SIDT2,COCH,FAM216A,KHSRP,PITRM1,KLK8,CSF2RB,TTC37,...,CLIC5,LIMK2,MME,KRT18,GDF15,ASRGL1,WT1,SERPINA1,MICAL2,SPON1


In [121]:
# Display the down-regulated gene frame loaded for the selected drug
df_down

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,241,242,243,244,245,246,247,248,249,250
0,2-iodomelatonin,PHKA1,F12,PRAME,CXCL14,SERPINA3,LTF,HTRA1,FCGBP,TENT5A,...,SLC12A2,PRRX1,CEACAM8,ANO1,RHCG,EIF2AK3,MAP1LC3B,NDRG2,TMEM243,CDCA4


In [122]:
# Initialize up and down genes count array
up_genes = np.zeros((978))
down_genes = np.zeros((978))


In [123]:
# Update the up and down count lists
for up, down in zip(df_up.values[0], df_down.values[0]):
    for i in range(len(landmark_genes)):
        if landmark_genes[i] == up:
            up_genes[i] += 1
        elif landmark_genes[i] == down:
            down_genes[i] += 1 
            

In [125]:
# Print the summary of the up and down count lists
print(f'Up: {np.unique(up_genes, return_counts=True)}')
print(f'Down: {np.unique(down_genes, return_counts=True)}')

Up: (array([0., 1.]), array([939,  39], dtype=int64))
Down: (array([0., 1.]), array([935,  43], dtype=int64))


In [127]:
# Start a dataframe whose single column lists the landmark genes
df_gene_expr = pd.DataFrame({'landmark_genes': landmark_genes})
df_gene_expr.head()

Unnamed: 0,landmark_genes
0,INSIG1
1,FOXO3
2,CDH3
3,ORC1
4,ITGAE


In [135]:
# Attach the up and down count arrays as the 'up' and 'down' columns
df_gene_expr = df_gene_expr.assign(up=up_genes, down=down_genes)
df_gene_expr

Unnamed: 0,landmark_genes,up,down
0,INSIG1,0.0,0.0
1,FOXO3,0.0,0.0
2,CDH3,0.0,0.0
3,ORC1,0.0,0.0
4,ITGAE,0.0,0.0
...,...,...,...
973,NUCB2,0.0,0.0
974,NUP88,0.0,0.0
975,NVL,0.0,0.0
976,ILK,0.0,0.0
