# notebook1-ncbi-ftp

In [1]:
from ibm_watson_studio_lib import access_project_or_space
wslib = access_project_or_space()

In [2]:
import pandas as pd
import json

## Review project's connected data assets

In [3]:
# Look up the project connection named "ncbi" and display the returned dict.
ncbi_credentials = wslib.get_connection("ncbi")

ncbi_credentials

{'port': '21',
 'host': 'ftp.ncbi.nih.gov',
 'connection_mode': 'basic',
 'username': 'anonymous',
 '.': {'name': 'ncbi',
  'description': '',
  'asset_id': '0d95880f-3c3e-44d4-bf8d-cd98787f3c38',
  'asset_type': 'connection',
  'personal_credentials': True}}

In [4]:
wslib.list_connected_data()

[{'name': 'refseq-genbank.csv',
  'description': 'ncbi : bioproject/refseq-genbank.csv',
  'asset_id': 'a7bc469f-82f3-466f-91cf-ef8391af1401',
  'asset_type': 'data_asset',
  'tags': ['connected-data']}]

In [5]:
# Fetch the details of the connected data asset 'refseq-genbank.csv'.
z=wslib.get_connected_data('refseq-genbank.csv')

# Pretty-print the returned dict as indented JSON.
print(json.dumps(z,indent=2))



{
  "port": "21",
  "host": "ftp.ncbi.nih.gov",
  "connection_mode": "basic",
  "username": "anonymous",
  "datapath": "bioproject/refseq-genbank.csv",
  ".": {
    "name": "refseq-genbank.csv",
    "description": "ncbi : bioproject/refseq-genbank.csv",
    "asset_id": "a7bc469f-82f3-466f-91cf-ef8391af1401",
    "asset_type": "data_asset",
    "asset_state": "available",
    "tags": [
      "connected-data"
    ],
    "connection": {
      "name": "ncbi",
      "description": "",
      "asset_id": "0d95880f-3c3e-44d4-bf8d-cd98787f3c38",
      "asset_type": "connection",
      "personal_credentials": true
    }
  }
}


## Pull from FTP site using project asset information

In [7]:
import ftplib

def setupFTP(x,cwd='.',listd=True):
    """Open an FTP session from a connection dictionary.

    Args:
        x: Connection details. ``x['host']`` and ``x['username']`` are
            required. ``x['port']`` is optional; when it is missing or empty
            the standard FTP port is used.
        cwd: Remote directory to change into after logging in.
        listd: When true, print a ``LIST`` of that directory to stdout.

    Returns:
        The connected, logged-in ``ftplib.FTP`` object. The caller owns it
        and must quit/close it.

    Raises:
        Whatever ``ftplib`` raises while connecting, logging in, changing
        directory or listing. The connection is closed before the error
        propagates.
    """
    ftp = ftplib.FTP()

    try:
        # Honour the port carried by the connection details instead of
        # silently assuming the default one.
        ftp.connect(x['host'], int(x.get('port') or ftplib.FTP_PORT))

        # Only the user name is sent; no password is read from ``x``.
        ftp.login(x['username'])

        ftp.cwd(cwd)

        if listd:
            ftp.retrlines("LIST")
    except BaseException:
        # Do not leave a half-initialised session (and its socket) behind.
        ftp.close()
        raise

    return ftp

def getFTPFile(ftp,source,target,quit=True):
    """Download one remote file over an open FTP session.

    Args:
        ftp: A connected FTP object providing ``retrbinary`` and ``quit``.
        source: Remote path of the file to retrieve.
        target: Local path to write; it is opened in binary write mode.
        quit: When true, end the FTP session after the transfer and print
            the value returned by ``ftp.quit()``.

    Raises:
        Any error from opening ``target`` or from the transfer itself. The
        local file handle is closed before the error propagates.
    """
    # The context manager guarantees the handle is released even when the
    # transfer fails part-way through.
    with open(target, 'wb') as localfile:
        ftp.retrbinary('RETR ' + source, localfile.write, 1024)

    if quit:
        print(ftp.quit())
        
def getFTPList(ftp,listd=True):
    """Collect the directory listing of the session's current directory.

    Args:
        ftp: A connected FTP object providing ``dir(callback)``.
        listd: When true, also print each listing line prefixed with "-".

    Returns:
        The listing lines, in the order ``ftp.dir`` delivered them.
    """
    entries = []
    ftp.dir(entries.append)

    # Nothing to echo: hand back the collected lines straight away.
    if not listd:
        return entries

    for entry in entries:
        print ("-", entry)
    return entries
        
    
def ftp_quit(ftp):
    """End an FTP session on a best-effort basis.

    Tries ``ftp.quit()``. If that fails for any ordinary reason, prints
    "Already quit" and falls back to ``ftp.close()`` so the socket is not
    left open. Ordinary errors are never propagated to the caller.

    Args:
        ftp: The FTP object to shut down.
    """
    try:
        ftp.quit()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still reach the user.
        print("Already quit")
        try:
            # A failed QUIT does not release the connection by itself.
            ftp.close()
        except Exception:
            # Best-effort cleanup: nothing more can be done here.
            pass
        
def downloadConnectedData(source,target):
    """Download a connected data asset from its FTP source to a local file.

    Looks up the connection details of the asset through ``wslib``, prints
    them, opens an FTP session with ``setupFTP`` and retrieves
    ``x['datapath']`` into ``target`` with ``getFTPFile``.

    Failures are reported on stdout rather than raised.

    Args:
        source: Name of the connected data asset in the project.
        target: Local path the downloaded file is written to.
    """
    # Bound before the ``try`` so the error handler can tell whether a
    # session was ever opened.
    ftp = None
    try:
        x=wslib.get_connected_data(source)

        # NOTE(review): this prints the full connection dict; confirm it can
        # never carry secrets before sharing the notebook output.
        print(json.dumps(x,indent=2))

        print(x['host'], x['username'],x['datapath'])

        ftp=setupFTP(x)

        # getFTPFile quits the session itself once the transfer succeeds.
        getFTPFile(ftp,x['datapath'],target)
    except Exception as exc:
        print('error')
        # Show what actually went wrong instead of hiding it.
        print(repr(exc))
        if ftp is not None:
            ftp_quit(ftp)

    

## Download asset from FTP site

In [8]:
# Download the connected asset "refseq-genbank.csv" into a local file.
downloadConnectedData("refseq-genbank.csv","refseq-genbank_local.csv")

{
  "port": "21",
  "host": "ftp.ncbi.nih.gov",
  "connection_mode": "basic",
  "username": "anonymous",
  "datapath": "bioproject/refseq-genbank.csv",
  ".": {
    "name": "refseq-genbank.csv",
    "description": "ncbi : bioproject/refseq-genbank.csv",
    "asset_id": "a7bc469f-82f3-466f-91cf-ef8391af1401",
    "asset_type": "data_asset",
    "asset_state": "available",
    "tags": [
      "connected-data"
    ],
    "connection": {
      "name": "ncbi",
      "description": "",
      "asset_id": "0d95880f-3c3e-44d4-bf8d-cd98787f3c38",
      "asset_type": "connection",
      "personal_credentials": true
    }
  }
}
ftp.ncbi.nih.gov anonymous bioproject/refseq-genbank.csv
dr-xr-xr-x   4 ftp      anonymous     4096 Mar 27 02:48 1000genomes
-r--r--r--   1 ftp      anonymous 10737418240 Sep 18  2019 10GB
-r--r--r--   1 ftp      anonymous 1073741824 Sep 18  2019 1GB
-r--r--r--   1 ftp      anonymous 53687091200 Sep 18  2019 50GB
-r--r--r--   1 ftp      anonymous 5368709120 Sep 18  2019 5GB

### List local storage contents

In [17]:
!ls -al

total 248
drwxr-x---. 3 1000690000 wscommon  4096 Mar 27 19:02 .
drwxrwx---. 1 wsuser     wsbuild   4096 Mar 27 15:51 ..
-rw-rw----. 1 1000690000 wscommon 34231 Mar 27 16:32 contacts_fjgreco_us_ibm_com.csv
-rw-rw----. 1 1000690000 wscommon 93506 Mar 27 17:20 refseq-genbank.csv
-rw-rw----. 1 1000690000 wscommon 93506 Mar 27 19:02 refseq-genbank_local.csv
drwxr-x---. 2 1000690000 wscommon  4096 Mar 27 15:50 .virtual_documents


## View recently downloaded file in pandas

In [16]:
# Load the downloaded CSV into a DataFrame and display it.
pd.read_csv("refseq-genbank_local.csv")

Unnamed: 0,Refseq accn,Genbank accn,Organism name,TaxID
0,PRJNA116,PRJNA10719,Arabidopsis thaliana,3702
1,PRJNA116,PRJNA11796,Arabidopsis thaliana,3702
2,PRJNA116,PRJNA13191,Arabidopsis thaliana,3702
3,PRJNA122,PRJNA12269,Oryza sativa Japonica Group,39947
4,PRJNA122,PRJDB1747,Oryza sativa Japonica Group,39947
...,...,...,...,...
1714,PRJNA756971,PRJNA682572,Prionailurus bengalensis,37029
1715,PRJNA758027,PRJDB3949,Aspergillus udagawae,91492
1716,PRJNA758049,PRJDB7449,Aspergillus pseudoviridinutans,1517512
1717,PRJNA759178,PRJNA597580,Colletes gigas,935657


### List the project's locally stored data assets

In [10]:
!ls /project_data/data_asset

contacts_fjgreco_us_ibm_com.csv


### List project data assets including connected assets

In [11]:
wslib.assets.list_assets('data_asset')

[{'name': 'contacts_fjgreco_us_ibm_com.csv',
  'description': '',
  'asset_id': 'fa0b7995-55bf-4530-9cc6-6b24f8837848',
  'asset_type': 'data_asset',
  'tags': []},
 {'name': 'refseq-genbank.csv',
  'description': 'ncbi : bioproject/refseq-genbank.csv',
  'asset_id': 'a7bc469f-82f3-466f-91cf-ef8391af1401',
  'asset_type': 'data_asset',
  'tags': ['connected-data']}]

### Upload locally saved file to project asset storage

In [12]:
# Upload the local copy to the project; the returned summary dict is displayed.
wslib.upload_file("refseq-genbank_local.csv")

{'name': 'refseq-genbank_local.csv',
 'asset_type': 'data_asset',
 'asset_id': '8a0b7b12-e040-4fb0-bf5a-d08c0b2e458f',
 'attachment_id': '3482b97b-3e83-43d5-8ad4-29bae3fc1814',
 'filepath': 'refseq-genbank_local.csv',
 'data_size': None,
 'mime': 'text/csv',
 'summary': ['created file in storage', 'created asset', 'created attachment'],
 'input_file_copied': True}

### List project data assets including connected assets and recently uploaded assets

In [14]:
wslib.assets.list_assets('data_asset')

[{'name': 'refseq-genbank.csv',
  'description': 'ncbi : bioproject/refseq-genbank.csv',
  'asset_id': 'a7bc469f-82f3-466f-91cf-ef8391af1401',
  'asset_type': 'data_asset',
  'tags': ['connected-data']},
 {'name': 'refseq-genbank_local.csv',
  'description': '',
  'asset_id': '8a0b7b12-e040-4fb0-bf5a-d08c0b2e458f',
  'asset_type': 'data_asset',
  'tags': []},
 {'name': 'contacts_fjgreco_us_ibm_com.csv',
  'description': '',
  'asset_id': 'fa0b7995-55bf-4530-9cc6-6b24f8837848',
  'asset_type': 'data_asset',
  'tags': []}]