# Bioinformatics Modeling

## Synthetic Gene Sequence Data Builder - Tutorial 1

## Pre-requisites:
 
- Access to an IBM Cloud Object Storage instance

- The e2eai_credentials.json file (included in the repo clone) in your local directory, updated with the credentials for your Cloud Object Storage instance

- A copy of ICOS.py (included in the repo clone) in the local directory

- The ibm-cos-sdk Python package installed in your Python execution environment


### Contact: fjgreco@us.ibm.com

#### <font color=red>Optional installation of ibm-cos-sdk</font>

### Run the following if in CP4D

### Import packages

In [1]:
import sys
sys.path.append('.')

import ICOS as ICOS

In [2]:
import json
import os
import random
import string

import numpy as np
import pandas as pd

### Build gene sequence feature and label lists

In [3]:
def build_assay_data(numseq=200, seqlen=50,motif='CGACCGAACTCC', seed=None):
    """Generate random nucleotide sequences, some of which carry a motif.

    Each sequence is `seqlen` characters drawn uniformly from 'ATGC'. For
    each sequence a label of 0 or 1 is chosen at random; when the label is
    1, a stretch of the sequence is overwritten with `motif` at a random
    start position, so the sequence length is unchanged.

    Parameters
    ----------
    numseq : int
        Number of sequences to generate.
    seqlen : int
        Length of every generated sequence.
    motif : str
        Motif written into sequences labelled 1. Must not be longer than
        `seqlen`.
    seed : int or None
        When given, a private `random.Random(seed)` generator is used so the
        result is reproducible. When None (the default), the module-level
        `random` generator is used, as before.

    Returns
    -------
    (list of str, list of int)
        The sequences and their 0/1 labels, in matching order.

    Raises
    ------
    ValueError
        If `motif` is longer than `seqlen`.
    """
    len_motif = len(motif)
    if len_motif > seqlen:
        raise ValueError(
            "motif length ({}) exceeds seqlen ({})".format(len_motif, seqlen))

    # Last start index at which the whole motif still fits in the sequence.
    limit = seqlen - len_motif

    rng = random.Random(seed) if seed is not None else random

    seqf = []
    labf = []
    num_insertions = 0

    for _ in range(numseq):
        seqx = ''.join(rng.choices('ATGC', k=seqlen))
        bind = rng.choice((0, 1))
        if bind == 1:
            # Every valid start position 0..limit is eligible. The previous
            # randrange(1, limit) skipped both ends and raised ValueError
            # whenever limit was 0 or 1.
            index = rng.randrange(limit + 1)
            seqx = seqx[:index] + motif + seqx[index + len_motif:]
            num_insertions += 1
        seqf.append(seqx)
        labf.append(bind)

    print("numseq:",numseq,"seqlen:",seqlen,"number insertions:",num_insertions)

    return seqf,labf


### Save assay data

In [4]:
def save_assay_data(sequences,sequence_fn, labels, label_fn):
    """Write sequences and labels to two text files, one item per line.

    Parameters
    ----------
    sequences : iterable
        Items written to `sequence_fn`, each followed by a newline.
    sequence_fn : str
        Path of the sequence file (overwritten if it exists).
    labels : iterable
        Items written to `label_fn` via their string form, each followed by
        a newline.
    label_fn : str
        Path of the label file (overwritten if it exists).
    """
    for path, items in ((sequence_fn, sequences), (label_fn, labels)):
        print ("Saving file: {}".format(path))
        # The context manager closes the handle even if a write fails part-way.
        with open(path, 'w') as fd:
            for element in items:
                fd.write("{}\n".format(element))

### Build assay csv file

In [5]:
def build_assay_csv_file(sequences,labels, csv='assay_data.csv'):
    """Write sequence/label pairs to a CSV file and return them as a DataFrame.

    The literal row ['sequence', 'target'] is stored as the first data row,
    so it is both the first line of the written file and the first row of
    the returned frame. pandas' own header and index are not written.

    Parameters
    ----------
    sequences, labels : iterables
        Paired element-wise with zip(); extra items in the longer one are
        ignored.
    csv : str
        Destination path of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns 'Sequence' and 'BindProperty', indexed from 1, including the
        literal first row described above.
    """
    rows = [['sequence', 'target']]
    rows.extend([seq, lab] for seq, lab in zip(sequences, labels))

    row_numbers = np.arange(1, len(rows) + 1)
    frame = pd.DataFrame(rows, index=row_numbers,
                         columns=['Sequence', 'BindProperty'])

    frame.to_csv(csv, header=False, index=False)

    print("Saving file '{}'".format(csv))
    # The count includes the literal 'sequence,target' row.
    print("assay_data record count:", len(rows))

    return frame


### Assay data file reader

In [6]:
def read_assay_data_file(csvfile,splitfile=True):
    """Read a 'sequence,label' CSV file and optionally split it into two files.

    Each non-blank line is split on commas; field 0 is the sequence and
    field 1 is parsed as an integer label. If the first non-blank line has a
    non-integer label field (such as the 'sequence,target' line written by
    build_assay_csv_file), it is treated as a header and skipped.

    Parameters
    ----------
    csvfile : str
        Path of the CSV file to read.
    splitfile : bool
        When True, also write '<base>.seq' and '<base>.lbl' next to
        `csvfile`, one item per line, where '<base>' is `csvfile` without
        its final extension.

    Returns
    -------
    (list of str, list of int)
        Sequences and labels in file order.

    Raises
    ------
    ValueError
        If a label field other than the header's is not an integer.
    IndexError
        If a non-blank line contains no comma.
    """
    sequences=[]
    labels=[]
    first_row=True
    with open(csvfile, "r") as f:
        for x in f:
            line=x.rstrip('\r\n')
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            item=line.split(',')
            is_first, first_row = first_row, False
            try:
                label=int(item[1])
            except ValueError:
                if is_first:
                    # Header row: skip it rather than failing.
                    continue
                raise
            sequences.append(item[0])
            labels.append(label)

    if splitfile:
        # splitext strips only the final extension. The previous
        # csvfile.split('.')[0] returned '' for paths such as './DATA_DIR/x.csv'.
        prefix=os.path.splitext(csvfile)[0]

        sequence_fn=prefix+'.seq'
        label_fn=prefix+'.lbl'

        print ("Saving file: {}".format(sequence_fn))
        with open(sequence_fn,'w') as fd:
            for element in sequences:
                fd.write(element)
                fd.write('\n')

        print ("Saving file: {}".format(label_fn))
        with open(label_fn,'w') as fd:
            for element in labels:
                fd.write(str(element))
                fd.write('\n')

    return sequences, labels

# Build files...

### <font color=blue>The parameter values specified below must be carried through to the downstream build and training notebooks.</font>

In [7]:
# Generation parameters passed to build_assay_data() below.
numseq=200  # number of sequences to generate
seqlen=50      # length of each generated sequence
motif='CGACCGAACTCC'   # motif written into the sequences labelled 1
local_dir='./DATA_DIR'  # local directory prefix for the generated files

# File names start with '/' because they are concatenated directly onto
# local_dir (local paths) and onto the ICOS folder name (upload targets).
sequence_file='/assay_data_full.seq'
label_file='/assay_data_full.lbl'
csv_file='/assay_data_full.csv'

In [8]:
# Create the local output directory. exist_ok=True makes this cell safe to
# re-run, and using local_dir keeps it in sync with the configuration cell.
os.makedirs(local_dir, exist_ok=True)

In [9]:
# Local output paths: the directory prefix followed by each '/name' suffix.
local_sequence_file = "{}{}".format(local_dir, sequence_file)
local_label_file = "{}{}".format(local_dir, label_file)
local_csv_file = "{}{}".format(local_dir, csv_file)

# Generate the data, then write the .seq/.lbl pair and the combined CSV.
sequences, labels = build_assay_data(numseq=numseq, seqlen=seqlen, motif=motif)
save_assay_data(sequences, local_sequence_file, labels, local_label_file)
# Left as the last expression so the returned DataFrame is displayed.
build_assay_csv_file(sequences, labels, csv=local_csv_file)

numseq: 200 seqlen: 50 number insertions: 111
Saving file: ./DATA_DIR/assay_data_full.seq
Saving file: ./DATA_DIR/assay_data_full.lbl
Saving file './DATA_DIR/assay_data_full.csv'
assay_data record count: 201


Unnamed: 0,Sequence,BindProperty
1,sequence,target
2,CGAGCCAATCATTCCGGATAGGTCGGAATTGGGCCCGTTTTTGCGA...,0
3,CCGAAAGGGGCTCTAAGCAGATAATTCGGCGGGTGGATCGTTACAT...,0
4,GCATGTCGGCTCCGGTTTGGATCTACCTCTCACATACTGCGGGTGC...,0
5,CAACAATATGTCTCACGACCGAACTCCACAAAGAAGGCTGCGTGCG...,1
...,...,...
197,AAAGTCGTTTGTGACTAGTTGTTCCCGCAGTTCGGCGCGGCTAAAA...,0
198,CGGTGTGTGAAAACCGATCCCCTTGCTCAAGTCCATCGAGTTGGGG...,0
199,GAATCTACTCTTAAAACGCCCCCCAGGGACGAGGGCCACTTCTAGA...,0
200,CGGCCCAGCGACTGCGTCACCGGTATTGAAATTTCTATAGTTACAC...,0


## Review list of created assets

In [10]:
# Show the working directory and list the files generated in DATA_DIR.
!pwd
!ls -al DATA_DIR

/Users/fjgreco/GitHub/E2EAI-public/Code
total 56
drwxr-xr-x   5 fjgreco  staff    160 Jun  6 21:25 .
drwxr-xr-x  14 fjgreco  staff    448 Jun  6 21:25 ..
-rw-r--r--   1 fjgreco  staff  10616 Jun  6 21:25 assay_data_full.csv
-rw-r--r--   1 fjgreco  staff    400 Jun  6 21:25 assay_data_full.lbl
-rw-r--r--   1 fjgreco  staff  10200 Jun  6 21:25 assay_data_full.seq


## Save assets in ICOS

In [11]:
# Load the credentials JSON from the current working directory.
with open("e2eai_credentials.json") as json_file:
    credentials = json.load(json_file)

# Select the entry that is passed to the ICOS helper below.
icos_credentials=credentials['icos_credentials_e2eai']


### Connect to ICOS

In [12]:
# Instantiate the helper class from the local ICOS module with the loaded credentials.
icos=ICOS.ICOS(icos_credentials=icos_credentials)

### <font color=blue>Specify a bucket name in your ICOS instance. The name must be unique across the IBM Cloud system.</font>

<font color=red>Both the bucket and folder names are used in the build, training, and analysis notebooks.</font>

In [13]:
bucket='e2eai-training' # <== Make this unique to your experimental training run.
folder='assay' # <== This is the default folder name used in subsequent notebooks.

In [14]:
# Create the bucket configured above.
# NOTE(review): the recorded output suggests create_bucket prints a
# BucketAlreadyExists error instead of raising it; confirm in ICOS.py.
icos.create_bucket(bucket)

<class 'Exception'> An error occurred (BucketAlreadyExists) when calling the CreateBucket operation: The requested bucket name is not available. The bucket namespace is shared by all users of the system. Please select a different name and try again.


#### <font color=red>If you receive a BucketAlreadyExists exception message, you may still proceed assuming the bucket exists in your ICOS instance.</font>

In [15]:
# Upload the local CSV file. folder+csv_file is 'assay/assay_data_full.csv'
# with the values set above; presumably the object key (confirm in ICOS.py).
icos.upload_file(bucket,local_csv_file,folder+csv_file)

File Uploaded


In [16]:
# Upload the local label file. folder+label_file is 'assay/assay_data_full.lbl'
# with the values set above; presumably the object key (confirm in ICOS.py).
icos.upload_file(bucket,local_label_file,folder+label_file)

File Uploaded


In [17]:
# Upload the local sequence file. folder+sequence_file is 'assay/assay_data_full.seq'
# with the values set above; presumably the object key (confirm in ICOS.py).
icos.upload_file(bucket,local_sequence_file,folder+sequence_file)

File Uploaded


##  <font color=green>Proceed to running e2eai-bioinformatics-neural_network_build(tutorial).ipynb...</font>