## 1. Setup

In [1]:
# Load dependencies
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from project_lib import Project

In [2]:
# Load data
# Credentials handed to project_lib.Project when the result is exported.
# Values set in the environment take precedence so the token does not have to
# live in the notebook; the literals remain only as a fallback so existing
# runs keep working.
# NOTE(review): the fallback token is committed in plain text -- rotate it and
# remove the literal once the environment variables are configured.
PROJECT_ID = os.environ.get(
    "COVID_EPITOPE_PROJECT_ID", "be864dcc-5bf7-4978-b225-77c8e568a9bc"
)
PROJECT_TOKEN = os.environ.get(
    "COVID_EPITOPE_PROJECT_TOKEN", "p-4bfabbe1e029211e8cef6987a2e530ed6713ef9d"
)
INPUT_DIR = 'https://raw.githubusercontent.com/efejiroe/covid_epitope_prediction/master/data/'

# INPUT_DIR ends with '/'; strip it before joining so the URL does not
# contain a doubled slash ("data//input_bcell.csv").
input_base = INPUT_DIR.rstrip('/')
bcell = pd.read_csv(f'{input_base}/input_bcell.csv')
sars = pd.read_csv(f'{input_base}/input_sars.csv')

# Stack both inputs row-wise into a single frame with a fresh 0..n-1 index.
bsars = pd.concat([bcell, sars], axis=0, ignore_index=True)

## 2. Feature Transformations and Creation

### Strategy
1. Drop columns previously shown to have low feature importance
2. Create a new column "peptide_length" from the end_position and start_position columns

In [3]:
# Build the feature frame by removing the low-importance columns and the
# target column from the combined training and test data. `bsars` itself is
# left untouched, so the position columns remain available on it.
dropped_columns = [
    'parent_protein_id',
    'protein_seq',
    'start_position',
    'end_position',
    'peptide_seq',
    'chou_fasman',
    'emini',
    'kolaskar_tongaonkar',
    'parker',
    'target',
]
bsars_ft = bsars.drop(columns=dropped_columns)

In [4]:
# Add the peptide_length feature: end minus start, plus one so that both
# end positions are counted. The positions are read from `bsars` because
# they were dropped from `bsars_ft`.
position_span = bsars['end_position'].sub(bsars['start_position'])
bsars_ft['peptide_length'] = position_span.add(1)

In [5]:
# Min-max scale every feature column (MinMaxScaler's default range is 0 to 1).
scaler = preprocessing.MinMaxScaler()

# fit_transform returns a bare NumPy array. Rebuild the DataFrame with the
# original column labels and index; otherwise the columns are renamed 0..N
# and the exported CSV no longer says which feature is which.
bsars_fts = pd.DataFrame(
    scaler.fit_transform(bsars_ft),
    columns=bsars_ft.columns,
    index=bsars_ft.index,
)

In [7]:
# Save the scaled features to IBM cloud project storage. Files are named
# "df" followed by the strategy version number.
project = Project(project_id=PROJECT_ID, project_access_token=PROJECT_TOKEN)
df = bsars_fts

# Serialise without the index column, then upload; the call's return value
# is left as the cell's last expression so the save confirmation is displayed.
csv_payload = df.to_csv(index=False)
project.save_data(file_name="df01.csv", data=csv_payload)


{'file_name': 'df01.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'covid19epitopeprediction-donotdelete-pr-vjiedfg7ztqsrx',
 'asset_id': '27e000cb-d186-4d37-85e3-5eb669ff28dd'}