## Purpose:

The purpose of this script is to return the GUIDs that exist in the eyeGENEDemographics data but not in the eyeGENEGenomics data. 

1. Input: Data from the API for eyeGeneGenomics and eyeGeneDemographics.

2. Output: A CSV file listing the GUIDs that are in the eyeGENEDemographics data but not in the eyeGENEGenomics data. 

In [12]:
import pandas as pd
import requests
import json
import getpass
from io import StringIO
import os
import datetime as dt
import time
import sys
import zipfile,io
import glob


#create your folder for storing data
def create_folder(folder_path):
    """Create a new, uniquely named folder and return its path.

    ``folder_path`` is tried first. If that name is already taken (by a
    folder or by a file), ``folder_path (1)``, ``folder_path (2)``, ... are
    tried in turn until a folder can be created.

    Args:
        folder_path: Desired path of the folder. Its parent directory must
            already exist.

    Returns:
        The path of the folder that was actually created.
    """
    adjusted_folder_path = folder_path
    counter = 0
    while True:
        try:
            # Attempt the creation directly instead of testing isdir() first:
            # the result of a separate check can be stale by the time mkdir()
            # runs, and a plain file with the same name would make it crash.
            os.mkdir(adjusted_folder_path)
        except FileExistsError:
            counter += 1
            adjusted_folder_path = f"{folder_path} ({counter})"
        else:
            return adjusted_folder_path

# Timestamp of this run; it is reused further down to name the output CSV.
x = dt.datetime.now()
# Dated download folder inside the current working directory. os.path.join
# uses the platform's separator; the previous hard-coded backslash produced a
# wrongly named folder on anything but Windows.
new_dir = os.path.join(os.getcwd(), "QueryDataFiles_" + x.strftime('%Y_%m_%d'))
created_dir = create_folder(new_dir)


def pulldatafile(filepath):
    """Resolve a glob pattern to the path of a single data file.

    Args:
        filepath: Glob pattern, as accepted by ``glob.glob``, that should
            match the wanted file.

    Returns:
        The matching path. When several paths match, the most recently
        modified one is returned.

    Raises:
        FileNotFoundError: If nothing matches the pattern.
    """
    matches = sorted(glob.glob(filepath))
    if not matches:
        # Previously an empty string was returned and the failure only
        # surfaced later, when the caller tried to open it.
        raise FileNotFoundError(f"No file matches the pattern {filepath!r}")
    # Concatenating all matches into one string (the earlier behaviour) gives
    # a path that does not exist as soon as more than one file matches, so a
    # single match is chosen instead.
    return max(matches, key=os.path.getmtime)

def read_data(file):
    """Load a CSV file into a pandas DataFrame.

    Args:
        file: Path or file-like object; handed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(file)

## Please enter your login information

In [2]:
## login
# Ask for the account name. Leading/trailing whitespace (easy to pick up when
# pasting) is removed so that it is not sent as part of the username.
print("Enter your username")
username = input().strip()

# getpass reads the password without echoing it to the screen.
password = getpass.getpass("Enter your password")

Enter your username


 hrodney1
Enter your password ·············


In [3]:
# Headers and form fields for the API login request.

# The credentials go out as a URL-encoded form and a plain-text reply is
# requested.
loginheaders = {
    'accept': 'text/plain',
    'Content-Type': 'application/x-www-form-urlencoded',
}

logindata = dict(password=password, username=username)

In [4]:
# Exchange the credentials for an API token.
response = requests.post(
    "https://brics.nei.nih.gov/gateway/authentication/user/login",
    headers=loginheaders,
    data=logindata,
)
if response.status_code == 200:
    print("Login Successful")
    # The response body is the bearer token used by the later requests. It is
    # a credential, so it is deliberately not printed into the notebook output.
    token = response.text
else:
    print(response.status_code)
    print("Login not Successful. Please check username and password. If error still occurs reach out to system adminstrator. THIS CODE WILL NOT PROCEED")
    # Actually stop here: without a token the following cells cannot work and
    # would only fail later with a less helpful error.
    raise RuntimeError(f"Login failed with HTTP status {response.status_code}")
    

Login Successful


## API Query to retrieve data. 

In [6]:
# Headers for the bulk query: JSON request body, zip archive requested back,
# authenticated with the token obtained at login.
multipleformsstudy = {
    'accept': 'application/zip',
    'Content-type': 'application/json',
    'Authorization': f'Bearer {token}',
}

# Endpoint of the bulk study/form query.
multiplformsstudyurl = "https://brics.nei.nih.gov/gateway/query-api/data/bulk/study/form"

In [7]:
# Query payload: CSV output for the Genomics and Demographics forms of one study.
genomicsfilter = dict(
    flattened="false",
    outputFormat="csv",
    studyForms=[
        dict(
            forms=["eyeGENEGenomics", "eyeGENEDemographics"],
            study="EYEGENE-STUDY0000203",
        ),
    ],
)

In [8]:
# Send the bulk query; the filter is passed as the JSON request body.
multipleformsstudyquery = requests.post(
    multiplformsstudyurl,
    headers=multipleformsstudy,
    json=genomicsfilter,
)
# Last expression of the cell, so the notebook displays the response object.
multipleformsstudyquery

<Response [200]>

In [9]:
# Display the response headers (e.g. to check the Content-Type and the
# attachment filename of the returned archive).
multipleformsstudyquery.headers

{'Date': 'Mon, 20 Apr 2020 10:31:03 GMT', 'Server': 'Apache', 'X-Content-Type-Options': 'nosniff, nosniff, nosniff', 'X-Frame-Options': 'sameorigin', 'X-Xss-Protection': '1; mode=block, 1; mode=block', 'Referrer-Policy': 'strict-origin-when-cross-origin', 'Strict-Transport-Security': 'max-age=63072000;includeSubdomains;preload, max-age=31536000 ; includeSubDomains', 'Set-Cookie': 'JSESSIONID=2A260AAF28F28D9B9216FA1B07FE92BB; Path=/; HttpOnly', 'Expires': '0', 'Keep-Alive': 'timeout=5, max=100', 'Content-Disposition': 'attachment;filename="bulkForm_2020-04-20T10-37-507528675509189170532.zip"', 'Cache-Control': 'no-cache, no-store, must-revalidate', 'Pragma': 'no-cache', 'Connection': 'Keep-Alive', 'Transfer-Encoding': 'chunked', 'Content-Type': 'application/zip'}

In [10]:
# Stop with a clear HTTP error if the query failed, rather than trying to
# unzip an error response.
multipleformsstudyquery.raise_for_status()
# Unpack the returned archive into the run folder that was already created at
# the top of the notebook. Creating another folder here left that first one
# empty. The with-block closes the archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(multipleformsstudyquery.content)) as a:
    a.extractall(created_dir)

In [13]:
# List the CSV files found in the run folder, in sorted order.
files = sorted(glob.glob(f"{created_dir}/*.csv"))

for file in files:
    print(
        "Here is the location of your files: " + file,
        "_____________________________________________",
        sep="\n",
    )

In [14]:
# Look only inside this run's folder. The previous "**/..." patterns matched
# the same file name in every sub-folder of the working directory, so results
# of earlier runs were picked up together with the current one.
# glob.escape keeps special characters in the folder path from being read as
# wildcards.
run_dir_pattern = glob.escape(created_dir)
demographicsfile = pulldatafile(os.path.join(run_dir_pattern, "query_result_eyeGENEDemographics*"))
demographicsdata = read_data(demographicsfile)
genomicsfile = pulldatafile(os.path.join(run_dir_pattern, "query_result_eyeGENEGenomics*"))
genomicsdata = read_data(genomicsfile)

  if (await self.run_code(code, result,  async_=asy)):


In [15]:
# Keep the demographics rows whose GUID does not appear in the genomics data.
GUIDS = demographicsdata.loc[
    ~demographicsdata["eyeGENEDemographics.Main.GUID"].isin(
        genomicsdata["eyeGENEGenomics.Main.GUID"]
    )
]

In [20]:
# One row per distinct GUID found in Demographics but not in Genomics.
GUIDSNOTINGENOMICS = pd.DataFrame(
    GUIDS["eyeGENEDemographics.Main.GUID"].unique(),
    columns=["GUIDS NOT IN GENOMICS"],
)
# Written in a separate step: to_csv() returns None when given a path, so
# chaining it onto the assignment left GUIDSNOTINGENOMICS set to None.
# os.path.join replaces the hard-coded backslash separator.
GUIDSNOTINGENOMICS.to_csv(
    os.path.join(created_dir, x.strftime('%Y_%m_%d') + "_GUIDSNOTINGENOMICS.csv"),
    index=False,
)

In [21]:
# Summary of the run: row counts of both forms, number of distinct GUIDs
# found, and where the output was written.
print(f"The number of rows of genomics data: {len(genomicsdata)}")
print("________________________________________________________")

print(f"The number of rows of demographics data: {len(demographicsdata)}")
print("________________________________________________________")

print(
    "The number of GUIDs that are not in Genomics but are in Demographics: "
    + str(len(GUIDS["eyeGENEDemographics.Main.GUID"].unique()))
)
print("________________________________________________________")

print(f"The files containg the list of GUIDs has been created in your folder  {created_dir}")





The number of rows of genomics data: 1477635
________________________________________________________
The number of rows of demographics data: 6417
________________________________________________________
The number of GUIDs not in Demographics but exist in Genomics Form: 874
________________________________________________________
The files have been created in your folder  C:\Users\hearodne\Desktop\NEI\GUIDs not in Genomics\API\QueryDataFiles_2020_04_20 (2)
