# Import basic libraries

In [1]:
import numpy as np
import pandas as pd

# Load data from project

In [2]:
# Folder holding the competition CSV files, relative to this notebook.
# The trailing slash is required: file names are appended to it directly.
data_dir = "../lish-moa/"

## Exploring the main training dataset (features with gene and cell info)

In [3]:
# Load the training feature table from the project data folder.
train_features_df = pd.read_csv(data_dir + 'train_features.csv')

In [4]:
# Shape of the training features as (number of rows, number of columns).
train_features_df.shape

(23814, 876)

In [5]:
# Print a summary of the frame: index range, column count, dtype breakdown
# and memory usage.
train_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23814 entries, 0 to 23813
Columns: 876 entries, sig_id to c-99
dtypes: float64(872), int64(1), object(3)
memory usage: 159.2+ MB


In [6]:
# Peek at the first five samples, transposed so that each of the many
# feature columns is shown as a row.
train_features_df.head(n=5).T

Unnamed: 0,0,1,2,3,4
sig_id,id_000644bb2,id_000779bfc,id_000a6266a,id_0015fd391,id_001626bd3
cp_type,trt_cp,trt_cp,trt_cp,trt_cp,trt_cp
cp_time,24,72,48,48,72
cp_dose,D1,D1,D1,D1,D2
g-0,1.062,0.0743,0.628,-0.5138,-0.3254
g-1,0.5577,0.4087,0.5817,-0.2491,-0.4009
g-2,-0.2479,0.2991,1.554,-0.2656,0.97
g-3,-0.6208,0.0604,-0.0764,0.5288,0.6919
g-4,-0.1944,1.019,-0.0323,4.062,1.418
g-5,-1.012,0.5207,1.239,-0.8095,-0.8244


We know that the columns are: `sig_id`, the ID of every sample; the gene expression features (`g-` prefix columns); the cell viability features (`c-` prefix columns); `cp_type`, which indicates whether a sample was treated with a compound (`trt_cp` value) or with a control perturbation (`ctl_vehicle` value); `cp_time`, the duration of the treatment (24, 48 or 72 hours); and `cp_dose`, the dose level (high or low, encoded as `D1`/`D2`).

In [7]:
# Total number of columns (second entry of the frame's shape).
print(train_features_df.shape[1])

876


In [8]:
# Split the feature columns by their name prefix.
# Match on the prefix only: a substring test ('c-' in col) would also pick up
# any column that merely contains 'c-' or 'g-' somewhere inside its name.
cell_cols = [col for col in train_features_df.columns if col.startswith('c-')]
gene_cols = [col for col in train_features_df.columns if col.startswith('g-')]

In [9]:
# Report how many columns fall into each feature group, one line per group.
print(
    f'Number of cell columns: {len(cell_cols)}',
    f'Number of gene columns: {len(gene_cols)}',
    sep='\n',
)

Number of cell columns: 100
Number of gene columns: 772


In [10]:
# Displaying the column names of each group, a few names per printed row.
def print_in_rows(names, per_row):
    """Print ``names`` as consecutive lists of at most ``per_row`` items.

    Parameters
    ----------
    names : list
        Items to display (here: column names).
    per_row : int
        Maximum number of items per printed list; must be positive,
        otherwise ``range`` raises ``ValueError``.

    Stepping through ``names`` in slices of ``per_row`` never produces an
    empty chunk, so no stray ``[]`` is printed when ``len(names)`` is an
    exact multiple of ``per_row``.
    """
    for start in range(0, len(names), per_row):
        print(names[start:start + per_row])


# Maximum number of column names shown per printed row.
top_display = 10

print('Cell columns:')
print_in_rows(cell_cols, top_display)

print('Gene columns:')
print_in_rows(gene_cols, top_display)

Cell columns:
['c-0', 'c-1', 'c-2', 'c-3', 'c-4', 'c-5', 'c-6', 'c-7', 'c-8', 'c-9']
['c-10', 'c-11', 'c-12', 'c-13', 'c-14', 'c-15', 'c-16', 'c-17', 'c-18', 'c-19']
['c-20', 'c-21', 'c-22', 'c-23', 'c-24', 'c-25', 'c-26', 'c-27', 'c-28', 'c-29']
['c-30', 'c-31', 'c-32', 'c-33', 'c-34', 'c-35', 'c-36', 'c-37', 'c-38', 'c-39']
['c-40', 'c-41', 'c-42', 'c-43', 'c-44', 'c-45', 'c-46', 'c-47', 'c-48', 'c-49']
['c-50', 'c-51', 'c-52', 'c-53', 'c-54', 'c-55', 'c-56', 'c-57', 'c-58', 'c-59']
['c-60', 'c-61', 'c-62', 'c-63', 'c-64', 'c-65', 'c-66', 'c-67', 'c-68', 'c-69']
['c-70', 'c-71', 'c-72', 'c-73', 'c-74', 'c-75', 'c-76', 'c-77', 'c-78', 'c-79']
['c-80', 'c-81', 'c-82', 'c-83', 'c-84', 'c-85', 'c-86', 'c-87', 'c-88', 'c-89']
['c-90', 'c-91', 'c-92', 'c-93', 'c-94', 'c-95', 'c-96', 'c-97', 'c-98', 'c-99']
[]
Gene columns:
['g-0', 'g-1', 'g-2', 'g-3', 'g-4', 'g-5', 'g-6', 'g-7', 'g-8', 'g-9']
['g-10', 'g-11', 'g-12', 'g-13', 'g-14', 'g-15', 'g-16', 'g-17', 'g-18', 'g-19']
['g-20', 'g-21', 

## Exploring cell columns

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# cell viability column.
train_features_df.loc[:, cell_cols].describe()

Unnamed: 0,c-0,c-1,c-2,c-3,c-4,c-5,c-6,c-7,c-8,c-9,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
count,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,...,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0
mean,-0.355156,-0.455359,-0.480542,-0.377666,-0.493552,-0.446037,-0.574855,-0.266293,-0.451307,-0.458664,...,-0.469244,-0.461411,-0.513256,-0.500142,-0.507093,-0.353726,-0.463485,-0.378241,-0.470252,-0.301505
std,1.752565,2.004962,2.039346,1.554583,2.091888,1.893484,2.234152,1.465313,1.948829,1.646122,...,2.000488,2.042475,2.001714,2.107105,2.159589,1.629291,2.059725,1.703615,1.834828,1.407918
min,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,...,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0
25%,-0.5493,-0.538425,-0.550375,-0.601975,-0.559975,-0.578775,-0.57125,-0.524275,-0.5511,-0.654675,...,-0.566175,-0.565975,-0.589975,-0.5687,-0.563775,-0.567975,-0.552575,-0.561,-0.5926,-0.5629
50%,-0.009,0.0098,-0.004,-0.03905,0.0114,-0.0089,0.0074,0.00635,-0.00535,-0.02535,...,-0.0099,0.00325,-0.0091,-0.01375,-0.0033,-0.01025,-0.00125,-0.0068,0.014,-0.0195
75%,0.448975,0.476875,0.460575,0.4131,0.4703,0.442675,0.459475,0.469375,0.453175,0.4244,...,0.45775,0.4615,0.445675,0.4529,0.4709,0.44475,0.465225,0.4464,0.461275,0.43865
max,3.365,3.915,2.97,2.408,4.555,3.614,3.904,4.918,3.648,3.194,...,4.069,3.96,3.927,3.596,3.747,2.814,3.505,2.924,3.111,3.805


In [12]:
# Collect the distinct dtypes used across the cell columns.
set(train_features_df.loc[:, cell_cols].dtypes)

{dtype('float64')}

All cell columns are floats. Let's check whether their distributions are similar: