# Randomly Split Test Partition from Training

Randomly split out 10% of the training data into a testing partition to assess model performance during model development.

In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Fix the seed of NumPy's global random number generator so that steps drawing
# from it give the same results on every fresh top-to-bottom run.
SEED = 123
np.random.seed(SEED)

## Load data that was downloaded in the `1.download-data` module

In [3]:
# The downloaded files live in the sibling `1.download-data` module; the path
# is resolved relative to the notebook's working directory.
download_module_dir = os.path.join("..", "1.download-data")
data_dir = os.path.join(download_module_dir, "data")

# Show what is available in the download directory.
os.listdir(data_dir)

['validation_data.csv',
 'validation.tar.gz',
 'training.tar.gz',
 'training_data.csv']

In [4]:
# Read the complete training table from the download directory.
train_file = os.path.join(data_dir, "training_data.csv")
full_train_df = pd.read_csv(filepath_or_buffer=train_file)

# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved row index read back as data -- confirm whether downstream steps
# expect it before dropping it.
print(full_train_df.shape)
full_train_df.head(n=5)

(51267, 123)


Unnamed: 0,cell_code,cell_id,plate,replicate,well,field,actin.s.area,actin.s.perimeter,actin.s.radius.mean,actin.s.radius.sd,...,DNA.h.ent.s3,DNA.h.dva.s3,DNA.h.den.s3,DNA.h.f12.s3,DNA.h.f13.s3,dist.10.nn,dist.20.nn,dist.30.nn,nuclear.displacement,target
0,OaJHcDs2kh,1,P1,1,C10,1,579,90,13.942759,4.352958,...,0.0,0.0,0.0,0.0,0.0,128.889522,178.595764,207.18737,5.938882,adrenoceptor
1,nwLFF4l070,2,P1,1,C10,1,1132,136,20.179007,4.583255,...,0.0,0.0,0.0,0.0,0.0,79.885304,123.697473,190.59876,0.478024,adrenoceptor
2,JU4SIplWZ7,3,P1,1,C10,1,736,99,15.209404,2.408709,...,0.0,0.0,0.0,0.0,0.0,64.370636,88.931673,127.452918,3.928325,adrenoceptor
3,pqkTwaHa2L,4,P1,1,C10,1,611,92,14.398528,3.662234,...,0.0,0.0,0.0,0.0,0.0,87.536494,121.879169,160.752674,3.336081,adrenoceptor
4,pB2BlQoW94,5,P1,1,C10,1,585,93,14.030675,3.408844,...,0.0,0.0,0.0,0.0,0.0,105.866735,142.023858,182.990012,2.343776,adrenoceptor


## Split data and write to file

In [5]:
# Hold out 10% of the rows as a test partition, stratified on `target` so that
# both partitions keep the same class proportions.
#
# The seed is passed explicitly: relying only on the global NumPy seed set in
# an earlier cell makes the split depend on how many random draws happened
# before this cell ran, so re-running this cell on its own would produce a
# different partition.
# NOTE(review): 123 mirrors the global seed used earlier in the notebook; on a
# fresh run this is expected to reproduce the previous partition -- confirm.
train_df, test_df = train_test_split(
    full_train_df,
    test_size=0.1,
    stratify=full_train_df.target,
    random_state=123,
)

In [6]:
# Ensure the output directory exists so the writes below do not fail on a
# fresh checkout where it has not been created yet.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Write each partition as a tab-separated file with floats rounded to four
# decimals and without the DataFrame index. pandas infers gzip compression
# from the ".gz" suffix.
file = os.path.join(output_dir, "train.tsv.gz")
train_df.to_csv(file, sep='\t', float_format="%.4f", index=False)

file = os.path.join(output_dir, "test.tsv.gz")
test_df.to_csv(file, sep='\t', float_format="%.4f", index=False)

In [7]:
# Report (rows, columns) for each partition, training first.
for partition_df in (train_df, test_df):
    print(partition_df.shape)

(46140, 123)
(5127, 123)


In [8]:
# Count how many rows of each target class landed in each partition.
# Each column is named when it is built, so the final labels do not depend on
# the positional order of the merged columns.
train_counts = train_df.target.value_counts().to_frame("train_counts")
test_counts = test_df.target.value_counts().to_frame("test_counts")

# Index-on-index inner merge: only targets present in both partitions are kept.
target_count_df = train_counts.merge(
    test_counts,
    left_index=True,
    right_index=True,
)

# Ensure the results directory exists so the write does not fail on a fresh
# checkout where it has not been created yet.
os.makedirs("results", exist_ok=True)

file = os.path.join("results", "target_counts.tsv")
target_count_df.to_csv(file, sep='\t', index=True)

target_count_df

Unnamed: 0,train_counts,test_counts
dopaminereceptor,4620,513
EGFR,4563,507
ROCK,4437,493
adrenoceptor,3466,385
DNA_intercalation,3450,383
AMPA,3442,382
CDK,3288,365
TopoII,3199,356
Tubulin,2934,326
DNAMetabolism,2626,292
