In [6]:
from random import Random, choice

import pandas as pd

# Generate Random Explanatory Texts for Mitosis

In [10]:
# Mitotic phases and abnormal mitotic figures that an explanation can name.
structures = [
    'prometaphase',
    'metaphase',
    'anaphase',
    'telophase',
    'tripolar',
    'asymmetric anaphase',
    'chromosome bridging',
    'chromosome lagging',
]

In [34]:
# Candidate visual descriptions for each structure, keyed by structure name.
# Each value lists interchangeable phrasings that are substituted into the
# second placeholder of a sentence template.
structure_explanations = {
    'prometaphase': [
        'dark aggregate cluster with spikes',
        'dark aggregate ring shape with spikes',
        'dark aggregate cluster with projections',
        'dark aggregate ring shape with projections',
        'darken grouped cluster with projections',
        'darken ring shape blobs with projections',
        'clusters of spikey dark rings present',
        'clumps of dark projections are apparent'
    ],
    'metaphase': [
        'dark aggregate linear shape with spikes',
        'dark aggregate ring shape with spikes',
        'dark aggregate linear shape with projections',
        'dark aggregate ring shape with projections',
        'darken grouped linear clusters with projections',
        'darken ring shape blobs with projections',
        'clusters of linear spikey dark rings present',
        'linear clumps of dark projections are apparent'
    ],
    'anaphase': [
        'presence of two separated aggregates of variable distances apart',
        # Was "spikey figured" -- typo; the matching telophase phrase says "figures".
        'two separated spikey figures of variable distances apart',
        'presence of two separated linear figures with some distance between',
        'linear figured aggregates are separated with some distance apart',
        'dark spikey figures are separated with variable distances apart',
        'linear shape with projections are separated with little distance apart',
        'spikey shape with projections are separated with some distance between'
    ],
    'telophase': [
        'presence of two separated aggregates at opposite ends of cell',
        'two separated spikey figures at opposite ends of cell',
        'presence of two separated linear figures at opposite ends of cell',
        'linear figured aggregates are separated at opposite ends of cell',
        'dark spikey figures are separated at opposite ends of cell',
        'linear shape with projections located at opposite ends of cell',
        'spikey shape with projections located at opposite ends of cell'
    ],
    'tripolar': [
        'more than two chromosome clusters during anaphase with three linear plates',
        'more than two chromosome clusters during telophase with six linear plates',
        'more than two chromosome clusters during metaphase with a linear plate',
        'more than two chromosome clusters during prometaphase with nine linear plates'
    ],
    'asymmetric anaphase': [
        'unequal sizes of two chromosome clusters',
        'two chromosome clusters of unequal size are present',
        'there are clear depictions of unequal sized two chromosome clusters',
        'two chromosome clusters of unequal size are depicted',
    ],
    'chromosome bridging': [
        'chromosomes stretching from one pole to other'
    ],
    'chromosome lagging': [
        'chromosomes not in contact with larger linear cord'
    ]
}

In [35]:
# Sentence templates with two positional placeholders: the first receives the
# structure name, the second the explanation phrase.
structure_sentences = [
    'Classified {} structure because of {}.',
    'The image is classified as {}. Reasons for this are {}.',
    'Classification for this image is {} structure. This is explained by {}.',
    'Cell division phase {} present. Observations of {} are evident.',
    'Phase structure {} most evidently described by {}.',
]

In [36]:
# `structure_explanations` is rebound below from the per-structure dict to the
# flat list of sentences that later cells sample from. Stash the dict under its
# own name first so this cell can be re-run: without this, a second run would
# index the list with a structure name and raise TypeError.
if isinstance(structure_explanations, dict):
    structure_explanations_by_phase = structure_explanations

# Every template x every structure x every phrasing for that structure.
structure_explanations = [
    sentence.format(phase, explanation)
    for sentence in structure_sentences
    for phase in structures
    for explanation in structure_explanations_by_phase[phase]
]

In [37]:
# Sanity check: number of distinct generated sentences (templates x phrasings).
len(structure_explanations)

200

# Generate Nonmitotic Explanations

In [40]:
# Stock sentences used as the "explanation" for samples without mitotic figures.
# They are used verbatim (no placeholders).
nonmitotic_explanations = [
    # Slide- / cell-level statements
    'The whole slide is clean. No affected cells.',
    'All cells are healthy.',
    'No presence of increased mitotic count.',
    'No anomalies present.',
    # Terse "nothing to report" variants
    'Nothing to consider.',
    'Nothing here.',
    'Not available.',
    'Healthy-looking cells.',
    'No concerns.',
    'No concern here.',
    'Cells look fine.',
    # Structure-specific negatives
    'No structural anomalies.',
    'No phase structure problems.',
    'Nothing wrong.',
]

# Create Fake Text

In [38]:
n_train_mitosis = 10495
# Sample from a dedicated, seeded generator so Restart & Run All rebuilds the
# same texts. The seed is specific to this split so it never replays the
# random stream used for another split.
train_mitosis_rng = Random(0)
train_mitosis_explanations = [
    train_mitosis_rng.choice(structure_explanations) for _ in range(n_train_mitosis)
]

In [43]:
# Preview the first ten sampled mitosis sentences for the training split.
train_mitosis_explanations[:10]

['Cell division phase tripolar present. Observations of more than two chromosome clusters during prometaphase with nine linear plates are evident.',
 'Classification for this image is anaphase structure. This is explained by presence of two separated linear figures with some distance between.',
 'Phase structure anaphase most evidently described by presence of two separated aggregates of variable distances apart.',
 'Classification for this image is telophase structure. This is explained by presence of two separated linear figures at opposite ends of cell.',
 'Cell division phase prometaphase present. Observations of clumps of dark projections are apparent are evident.',
 'The image is classified as anaphase. Reasons for this are two separated spikey figured of variable distances apart.',
 'Cell division phase prometaphase present. Observations of dark aggregate cluster with projections are evident.',
 'Phase structure telophase most evidently described by two separated spikey figures 

In [41]:
n_train_nonmitosis = 27455
# Sample from a dedicated, seeded generator so Restart & Run All rebuilds the
# same texts. The seed is specific to this split so it never replays the
# random stream used for another split.
train_nonmitosis_rng = Random(1)
train_nonmitosis_explanations = [
    train_nonmitosis_rng.choice(nonmitotic_explanations) for _ in range(n_train_nonmitosis)
]

In [42]:
# Preview the first ten sampled non-mitosis sentences for the training split.
train_nonmitosis_explanations[:10]

['Healthy-looking cells.',
 'No presence of increased mitotic count.',
 'Cells look fine.',
 'Healthy-looking cells.',
 'Healthy-looking cells.',
 'Cells look fine.',
 'No phase structure problems.',
 'Cells look fine.',
 'The whole slide is clean. No affected cells.',
 'Nothing wrong.']

In [46]:
n_test_mitosis = 3412
# Sample from a dedicated, seeded generator so Restart & Run All rebuilds the
# same texts. The seed is specific to this split so it never replays the
# random stream used for another split.
test_mitosis_rng = Random(2)
test_mitosis_explanations = [
    test_mitosis_rng.choice(structure_explanations) for _ in range(n_test_mitosis)
]

In [47]:
# Preview the first ten sampled mitosis sentences for the test split.
test_mitosis_explanations[:10]

['Phase structure prometaphase most evidently described by dark aggregate cluster with projections.',
 'Classified tripolar structure because of more than two chromosome clusters during prometaphase with nine linear plates.',
 'Phase structure asymmetric anaphase most evidently described by unequal sizes of two chromosome clusters.',
 'The image is classified as asymmetric anaphase. Reasons for this are two chromosome clusters of unequal size are present.',
 'Classified chromosome lagging structure because of chromosomes not in contact with larger linear cord.',
 'Cell division phase anaphase present. Observations of presence of two separated aggregates of variable distances apart are evident.',
 'Cell division phase metaphase present. Observations of dark aggregate ring shape with spikes are evident.',
 'Classified metaphase structure because of dark aggregate ring shape with spikes.',
 'Phase structure prometaphase most evidently described by dark aggregate ring shape with spikes.',


In [50]:
n_test_nonmitosis = 8920
# Sample from a dedicated, seeded generator so Restart & Run All rebuilds the
# same texts. The seed is specific to this split so it never replays the
# random stream used for another split.
test_nonmitosis_rng = Random(3)
test_nonmitosis_explanations = [
    test_nonmitosis_rng.choice(nonmitotic_explanations) for _ in range(n_test_nonmitosis)
]

In [51]:
# Preview the first ten sampled non-mitosis sentences for the test split.
test_nonmitosis_explanations[:10]

['Healthy-looking cells.',
 'No concern here.',
 'All cells are healthy.',
 'All cells are healthy.',
 'No phase structure problems.',
 'No anomalies present.',
 'Nothing wrong.',
 'All cells are healthy.',
 'Nothing wrong.',
 'No concerns.']

# Save Individual Files to CSV, Then Upload to S3 Bucket

In [53]:
# Wrap each list of sentences in a single-column ('text') DataFrame.
# tme / tnme: train mitosis / non-mitosis; tsme / tsnme: test mitosis / non-mitosis.
tme, tnme, tsme, tsnme = (
    pd.DataFrame(texts, columns=['text'])
    for texts in (
        train_mitosis_explanations,
        train_nonmitosis_explanations,
        test_mitosis_explanations,
        test_nonmitosis_explanations,
    )
)

In [54]:
from sagemaker import get_execution_role
import boto3

In [65]:
def upload_to_s3(df, filename, bucket='sjargs-dev-mle9'):
    """Write ``df`` to ``<filename>.csv`` locally, then upload that file to S3.

    Args:
        df: Object exposing ``to_csv`` (this notebook passes pandas DataFrames).
        filename: Base name without extension; ``'.csv'`` is appended. The
            resulting name is used both as the local path and as the S3 key.
        bucket: Name of the destination S3 bucket.

    Returns:
        None. The local CSV file is left on disk after the upload.
    """
    csv_name = filename + '.csv'
    # index=False keeps the DataFrame's row index out of the written file.
    df.to_csv(csv_name, index=False)

    s3_client = boto3.resource('s3').meta.client
    # Arguments are (local path, bucket, object key); the key mirrors the local name.
    s3_client.upload_file(csv_name, bucket, csv_name)

In [63]:
# Writes train-mitosis-explanations.csv locally and uploads it to the default bucket.
upload_to_s3(tme, 'train-mitosis-explanations')

In [66]:
# Writes train-nonmitosis-explanations.csv locally and uploads it to the default bucket.
upload_to_s3(tnme, 'train-nonmitosis-explanations')

In [67]:
# Writes test-mitosis-explanations.csv locally and uploads it to the default bucket.
upload_to_s3(tsme, 'test-mitosis-explanations')

In [68]:
# Writes test-nonmitosis-explanations.csv locally and uploads it to the default bucket.
upload_to_s3(tsnme, 'test-nonmitosis-explanations')