# Data Processing with pandas
In this notebook, I create a pandas DataFrame from the raw JSON data obtained from the scraper. The DataFrame will subsequently be uploaded to an Amazon S3 bucket.

In [77]:
# import libraries
import os
import json

import numpy as np
import pandas as pd

In [66]:
# load the raw scraped records; the encoding is passed explicitly so that
# decoding does not depend on the platform's default text encoding
with open("raw_data/data.json", "r", encoding="utf-8") as raw_file:
    data = json.load(raw_file)

# show which fields are present in the loaded data
print(data.keys())

dict_keys(['uuids', 'website_ids', 'gene_function', 'spatial_expression_patterns', 'cellular_expression_patterns', 'begining', 'termination', 'detailed_expression_patterns', 'promoters', 'strain_information', 'strain_name', 'date_created', 'source', 'reporter', 'lineage', 'construct', 'created_by', 'construct_info', 'plasmid_name', 'gene', 'transcript', 'promoter_length', 'left', 'forward', 'right', 'reverse', 'vector', 'expressing_strains', 'image_urls'])


In [67]:
# pair every field with the number of values it holds, so that a field whose
# length differs from the others can be spotted in the printed list
print([(field, len(values)) for field, values in data.items()])

[('uuids', 75), ('website_ids', 75), ('gene_function', 75), ('spatial_expression_patterns', 75), ('cellular_expression_patterns', 75), ('begining', 75), ('termination', 75), ('detailed_expression_patterns', 75), ('promoters', 75), ('strain_information', 75), ('strain_name', 75), ('date_created', 75), ('source', 75), ('reporter', 75), ('lineage', 75), ('construct', 75), ('created_by', 75), ('construct_info', 75), ('plasmid_name', 75), ('gene', 75), ('transcript', 75), ('promoter_length', 75), ('left', 75), ('forward', 75), ('right', 75), ('reverse', 75), ('vector', 75), ('expressing_strains', 75), ('image_urls', 74)]


In [68]:
# collect the names of the entries found in the downloaded-images folder
images_dir = "raw_data/images"
image_filenames = os.listdir(images_dir)
print(image_filenames)

['b7d6bd10-f078-48d2-a8d7-feb6d240e3a4.gif', '08bdee22-4e00-454a-ab5e-86b7d4433e43.gif', '418f1fcb-65f2-40f6-890e-1005104e67da.gif', '6dac24d6-4578-419e-adc1-3bc34269ed7e.gif', '30b09323-d0ef-45a0-ae3a-499f40fa7303.gif', '775b26cd-62d8-4ad3-b2ad-55b745ff7aac.gif', 'bc2b2ee6-33f3-4243-bd5d-c1a990e145fa.gif', '083bbc71-23df-4e5a-b07e-2189282981e9.gif', 'b06d3b17-99f9-495f-bc91-a0c2e39a7015.gif', 'c4d619f9-a514-4d9b-a73e-e32dfc9b1898.gif', '18a14533-bbe2-4b05-abc6-52c835aefeda.gif', '613aaa2c-0167-4c40-b1a0-8854b2e0a9c7.gif', '35795291-7ade-40ed-91d4-cd3bc1066c72.gif', '4d29258f-6eb3-4872-a824-a3e5313bc377.gif', '6fa17b04-8db6-4394-bc58-73b74033cd80.gif', 'e0871642-0435-4dcf-b6ee-049a838d6075.gif', 'aa74eb54-a8b4-4e3e-81bc-58dba3b993a1.gif', 'fdf2b80d-7671-43f6-ba72-8fab10d6a0b4.gif', '693d30b7-7cec-49a8-a3b9-51cd7c80a612.gif', '6d5d3c5a-6f85-4efb-ae3a-b727558dd989.gif', 'f461388e-f781-489a-994f-cee7c5845579.gif', '871b0553-11d2-4ce8-a8b6-d2d59723a02d.gif', '39773b7f-3d28-4fdb-9a99-e06874

The image URLs list is one entry short of all the other values. We need to find the uuid corresponding to the missing URL. We can compare the uuids with the image filenames to find it.

In [69]:
# strip the file extension from each image filename to recover its uuid;
# os.path.splitext removes only the trailing extension, whereas
# str.split(".gif") would cut at the first ".gif" found anywhere in the name
ids = [os.path.splitext(filename)[0] for filename in image_filenames]
print(ids)

['b7d6bd10-f078-48d2-a8d7-feb6d240e3a4', '08bdee22-4e00-454a-ab5e-86b7d4433e43', '418f1fcb-65f2-40f6-890e-1005104e67da', '6dac24d6-4578-419e-adc1-3bc34269ed7e', '30b09323-d0ef-45a0-ae3a-499f40fa7303', '775b26cd-62d8-4ad3-b2ad-55b745ff7aac', 'bc2b2ee6-33f3-4243-bd5d-c1a990e145fa', '083bbc71-23df-4e5a-b07e-2189282981e9', 'b06d3b17-99f9-495f-bc91-a0c2e39a7015', 'c4d619f9-a514-4d9b-a73e-e32dfc9b1898', '18a14533-bbe2-4b05-abc6-52c835aefeda', '613aaa2c-0167-4c40-b1a0-8854b2e0a9c7', '35795291-7ade-40ed-91d4-cd3bc1066c72', '4d29258f-6eb3-4872-a824-a3e5313bc377', '6fa17b04-8db6-4394-bc58-73b74033cd80', 'e0871642-0435-4dcf-b6ee-049a838d6075', 'aa74eb54-a8b4-4e3e-81bc-58dba3b993a1', 'fdf2b80d-7671-43f6-ba72-8fab10d6a0b4', '693d30b7-7cec-49a8-a3b9-51cd7c80a612', '6d5d3c5a-6f85-4efb-ae3a-b727558dd989', 'f461388e-f781-489a-994f-cee7c5845579', '871b0553-11d2-4ce8-a8b6-d2d59723a02d', '39773b7f-3d28-4fdb-9a99-e06874415639', '35f8f228-4230-4468-a60e-52b5c44de89d', 'a591a34a-db51-4c7d-b8c4-2e8ed086863f',

In [70]:
# uuids present in the data that have no matching image file
set(data["uuids"]).difference(ids)

{'c1795df8-827d-4a17-8566-5a1f278ee95b'}

A quick check on the website shows that the promoter "test" is missing an image. Therefore, the missing image URL is the one corresponding to the promoter "test".

In [71]:
# locate the position of the image-less uuid within the uuids list
missing_uuid = "c1795df8-827d-4a17-8566-5a1f278ee95b"
data["uuids"].index(missing_uuid)

74

In [74]:
# take the image-less uuid out of the list and put it back at position 72
# NOTE(review): 72 is hard-coded; presumably the row of the record that has
# no image — confirm against the data
uuids = data["uuids"]
relocated_uuid = uuids.pop(uuids.index("c1795df8-827d-4a17-8566-5a1f278ee95b"))
uuids.insert(72, relocated_uuid)

In [78]:
# insert NA at position 72 for the missing image url; the length check keeps
# this cell idempotent, so re-running it cannot add a second placeholder and
# leave image_urls longer than the other fields
if len(data["image_urls"]) < len(data["uuids"]):
    data["image_urls"].insert(72, np.nan)

In [82]:
# build the table with one column per key of `data`
dataframe = pd.DataFrame.from_dict(data, orient="columns")

In [83]:
# persist the table to disk in pickle format
output_path = "processsed_data/table.pkl"
pd.to_pickle(dataframe, output_path)