In [1]:
# ----- SETUP (RUN ONCE) ------ Run second cell first
# Before running make sure you set your fork
# You can find this from your forked repo under the code button
# Only run once, or if you CANNOT cd (change directory)
# into the repository
# For this to work (in colab)
# you must have created a GITHUB token
# and added it to colab.

from google.colab import userdata
email = userdata.get('GITHUB_EMAIL')
username = userdata.get('GITHUB_USERNAME')

# Change this to your forked url!!
gh_repo_url = "https://github.com/guth-metzlerr/fungal-temp-analysis.git"

# NOTE: set your email as your email
!git config --global user.email {email}

# NOTE: Change your username
!git config --global user.name {username}

# NOTE: change the address to the address of your fork!
!git clone {gh_repo_url}

!pip install ncbi-genome-download
!pip install pybarrnap

fatal: destination path 'fungal-temp-analysis' already exists and is not an empty directory.


In [2]:
# ---- Change Directory to repository -----
# Run when first open jupyter notebook.
# If this fails, use above cell.
%cd fungal-temp-analysis

/content/fungal-temp-analysis


In [3]:
# make sure to run this cell to have access to this function below
def commit_and_push(repo_url, message):
    import os
    import subprocess
    from google.colab import userdata

    try:
        # change your username from mine to yours.
        username = userdata.get("GITHUB_USERNAME")
        token = userdata.get("GITHUB_TOKEN")
        !git add .
        !git commit -m "{message}"
        !git push "https://{username}:{token}@github.com/{username}/fungal-temp-analysis.git"
        print("Changes Saved to GitHub!")
    except Exception as e:
        print(e)

# Only call this function if you want to
def catch_up_to_main_repo():
    import os
    from google.colab import userdata
    try:
        username = "GITHUB_EMAIL"
        token = userdata.get("GITHUB_TOKEN")
        main_repo = "https://github.com/nkmwicz/fungal-temp-analysis.git"
        !git remote add upstream {main_repo}
        !git fetch upstream
        !git merge upstream/main
        !git push https://{username}:{token}@github.com/{username}/fungal-temp-analysis.git
    except Exception as e:
        print(e)

In [None]:
# SOME NOTES
# %ls shows the available files in the repo to get names of files
# to read a tsv, use pd.read_csv but pass in sep="\t"
# to reperesent tabs as separators
%ls

delete_me.csv                     FungiWork.ipynb  README.md
eukaryotes_ncbi_temperatures.csv  LICENSE          temperature_data.tsv


In [None]:
# columns needed
# remove rows that aren't fungi ()
# remove those without temp.
# name, assembly, temp

In [None]:
#import pandas as pd
#df = pd.read_csv("./delete_me.csv")
#df.loc[len(df)] = [11,12,13,14]
#df.to_csv("./delete_me.csv", index=False)

In [4]:
import pandas as pd

df = pd.read_csv("eukaryotes_ncbi_temperatures.csv")
df.head()

Unnamed: 0,#Organism Name,Organism Groups,Strain,BioSample,BioProject,Assembly,Level,Size(Mb),GC%,Replicons,WGS,Scaffolds,CDS,Release Date,GenBank FTP,RefSeq FTP,Temperature (°C)
0,Neopyropia yezoensis,Eukaryota;Other;Other,,SAMN13316713,PRJNA589917,GCA_009829735.1,Chromosome,107.591,64.8454,chromosome 1:CM020618.1; chromosome 2:CM020619...,WMLA01,28,0,2020-01-03T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009...,,
1,Emiliania huxleyi CCMP1516,Eukaryota;Protists;Other Protists,CCMP1516,SAMN02744062,PRJNA77753,GCA_000372725.1,Scaffold,167.676,64.5,,AHAL01,7795,38554,2013-04-19T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,
2,Glycine max,Eukaryota;Plants;Land Plants,,SAMN00002965,PRJNA19861,GCA_000004515.5,Chromosome,978.942,35.1221,chromosome 1:NC_016088.4/CM000834.4; chromosom...,ACUP04,347,74248,2010-01-05T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,
3,Medicago truncatula,Eukaryota;Plants;Land Plants,,SAMN08400029,PRJNA702529,GCA_003473485.2,Chromosome,430.008,33.4462,chromosome 1:NC_053042.1/CM010648.1; chromosom...,PSQE01,42,42683,2018-09-06T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003...,
4,Solanum lycopersicum,Eukaryota;Plants;Land Plants,,SAMN02981290,PRJNA119,GCA_000188115.3,Chromosome,828.349,35.6991,chromosome 1:NC_015438.3/CM001064.3; chromosom...,AEKE03,3150,37660,2010-12-10T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,


In [5]:
columns = ["#Organism Name", "Organism Groups", "Assembly", "Temperature (°C)"]
df = df[columns]
df.head()

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C)
0,Neopyropia yezoensis,Eukaryota;Other;Other,GCA_009829735.1,
1,Emiliania huxleyi CCMP1516,Eukaryota;Protists;Other Protists,GCA_000372725.1,
2,Glycine max,Eukaryota;Plants;Land Plants,GCA_000004515.5,
3,Medicago truncatula,Eukaryota;Plants;Land Plants,GCA_003473485.2,
4,Solanum lycopersicum,Eukaryota;Plants;Land Plants,GCA_000188115.3,


In [6]:
df = df.dropna(subset="Temperature (°C)")
df.shape

(972, 4)

In [7]:
df = df.loc[df['Organism Groups'].str.contains("Fung")]
df

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C)
10,Schizosaccharomyces pombe,Eukaryota;Fungi;Ascomycetes,GCA_000002945.2,25.0
11,Aspergillus nidulans FGSC A4,Eukaryota;Fungi;Ascomycetes,GCA_000149205.2,26.0
12,Aspergillus fumigatus Af293,Eukaryota;Fungi;Ascomycetes,GCA_000002655.1,28.0
13,Neurospora crassa OR74A,Eukaryota;Fungi;Ascomycetes,GCA_000182925.2,25.0
14,Candida albicans SC5314,Eukaryota;Fungi;Ascomycetes,GCA_000182965.3,25.0
...,...,...,...,...
7874,Hanseniaspora lindneri,Eukaryota;Fungi;Ascomycetes,GCA_019649525.1,25.0
7913,Cystobasidium slooffiae,Eukaryota;Fungi;Basidiomycetes,GCA_019775285.1,25.0
7949,[Candida] anglica,Eukaryota;Fungi;Ascomycetes,GCA_019775655.1,25.0
7961,Penicillium brevicompactum,Eukaryota;Fungi;Ascomycetes,GCA_019843585.1,24.0


In [8]:
df.drop_duplicates(subset="Assembly")
df

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C)
10,Schizosaccharomyces pombe,Eukaryota;Fungi;Ascomycetes,GCA_000002945.2,25.0
11,Aspergillus nidulans FGSC A4,Eukaryota;Fungi;Ascomycetes,GCA_000149205.2,26.0
12,Aspergillus fumigatus Af293,Eukaryota;Fungi;Ascomycetes,GCA_000002655.1,28.0
13,Neurospora crassa OR74A,Eukaryota;Fungi;Ascomycetes,GCA_000182925.2,25.0
14,Candida albicans SC5314,Eukaryota;Fungi;Ascomycetes,GCA_000182965.3,25.0
...,...,...,...,...
7874,Hanseniaspora lindneri,Eukaryota;Fungi;Ascomycetes,GCA_019649525.1,25.0
7913,Cystobasidium slooffiae,Eukaryota;Fungi;Basidiomycetes,GCA_019775285.1,25.0
7949,[Candida] anglica,Eukaryota;Fungi;Ascomycetes,GCA_019775655.1,25.0
7961,Penicillium brevicompactum,Eukaryota;Fungi;Ascomycetes,GCA_019843585.1,24.0


In [9]:
import re #re stands for regex
df['species_root_name'] = df['#Organism Name'].apply(lambda item: " ".join(item.split(" ")[:2]))
df['species_root_name'] = df['species_root_name'].apply(lambda item: re.sub(r"[\]\[\;'\",\(\)\-\:\.]", "", item)) #removes extra non-alphabetic characters, using the \ indicates the next thing is like a string and not a coding character
df

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C),species_root_name
10,Schizosaccharomyces pombe,Eukaryota;Fungi;Ascomycetes,GCA_000002945.2,25.0,Schizosaccharomyces pombe
11,Aspergillus nidulans FGSC A4,Eukaryota;Fungi;Ascomycetes,GCA_000149205.2,26.0,Aspergillus nidulans
12,Aspergillus fumigatus Af293,Eukaryota;Fungi;Ascomycetes,GCA_000002655.1,28.0,Aspergillus fumigatus
13,Neurospora crassa OR74A,Eukaryota;Fungi;Ascomycetes,GCA_000182925.2,25.0,Neurospora crassa
14,Candida albicans SC5314,Eukaryota;Fungi;Ascomycetes,GCA_000182965.3,25.0,Candida albicans
...,...,...,...,...,...
7874,Hanseniaspora lindneri,Eukaryota;Fungi;Ascomycetes,GCA_019649525.1,25.0,Hanseniaspora lindneri
7913,Cystobasidium slooffiae,Eukaryota;Fungi;Basidiomycetes,GCA_019775285.1,25.0,Cystobasidium slooffiae
7949,[Candida] anglica,Eukaryota;Fungi;Ascomycetes,GCA_019775655.1,25.0,Candida anglica
7961,Penicillium brevicompactum,Eukaryota;Fungi;Ascomycetes,GCA_019843585.1,24.0,Penicillium brevicompactum


In [10]:
len(df['species_root_name'].unique()) #len gets the length of the list and unique is inside of that. So can check if this list is shorter than the original list

946

In [11]:
df = df.loc[~df['#Organism Name'].str.contains(" cf. ")] #removes any entries that have cf. in their name
df.shape

(954, 5)

In [12]:
df['duplicate'] = df.duplicated(subset="species_root_name", keep=False) #creates a new column called duplicate
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['duplicate'] = df.duplicated(subset="species_root_name", keep=False) #creates a new column called duplicate


Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C),species_root_name,duplicate
10,Schizosaccharomyces pombe,Eukaryota;Fungi;Ascomycetes,GCA_000002945.2,25.0,Schizosaccharomyces pombe,False
11,Aspergillus nidulans FGSC A4,Eukaryota;Fungi;Ascomycetes,GCA_000149205.2,26.0,Aspergillus nidulans,False
12,Aspergillus fumigatus Af293,Eukaryota;Fungi;Ascomycetes,GCA_000002655.1,28.0,Aspergillus fumigatus,False
13,Neurospora crassa OR74A,Eukaryota;Fungi;Ascomycetes,GCA_000182925.2,25.0,Neurospora crassa,False
14,Candida albicans SC5314,Eukaryota;Fungi;Ascomycetes,GCA_000182965.3,25.0,Candida albicans,False


In [13]:
df = df.loc[~df['#Organism Name'].str.contains(" cf. ")]
df.shape

(954, 6)

In [14]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C),species_root_name,duplicate
0,Schizosaccharomyces pombe,Eukaryota;Fungi;Ascomycetes,GCA_000002945.2,25.0,Schizosaccharomyces pombe,False
1,Aspergillus nidulans FGSC A4,Eukaryota;Fungi;Ascomycetes,GCA_000149205.2,26.0,Aspergillus nidulans,False
2,Aspergillus fumigatus Af293,Eukaryota;Fungi;Ascomycetes,GCA_000002655.1,28.0,Aspergillus fumigatus,False
3,Neurospora crassa OR74A,Eukaryota;Fungi;Ascomycetes,GCA_000182925.2,25.0,Neurospora crassa,False
4,Candida albicans SC5314,Eukaryota;Fungi;Ascomycetes,GCA_000182965.3,25.0,Candida albicans,False


In [None]:
!ncbi-genome-download -h

usage: ncbi-genome-download [-h] [-s {refseq,genbank}] [-F FILE_FORMATS] [-l ASSEMBLY_LEVELS]
                            [-g GENERA] [--genus GENERA] [--fuzzy-genus] [-S STRAINS]
                            [-T SPECIES_TAXIDS] [-t TAXIDS] [-A ASSEMBLY_ACCESSIONS]
                            [--fuzzy-accessions] [-R REFSEQ_CATEGORIES]
                            [--refseq-category REFSEQ_CATEGORIES] [-o OUTPUT] [--flat-output] [-H]
                            [-P] [-u URI] [-p N] [-r N] [-m METADATA_TABLE] [-n] [-N] [-v] [-d]
                            [-V] [-M TYPE_MATERIALS]
                            groups

positional arguments:
  groups                The NCBI taxonomic groups to download (default: all). A comma-separated
                        list of taxonomic groups is also possible. For example:
                        "bacteria,viral"Choose from: ['all', 'archaea', 'bacteria', 'fungi',
                        'invertebrate', 'metagenomes', 'plant', 'protozoa',
            

In [15]:
!ncbi-genome-download --section genbank --assembly-accessions GCA_009665985.1 --formats fasta fungi

In [16]:
#for row in range(1):
#assembly = "GCA_009665985.1"
#!ncbi-genome-download --section genbank --assembly-accessions {assembly} --formats fasta fungi

In [22]:
import subprocess, requests, tarfile, os

# get fasta files
# unzip fasta
# barrnap fasta file for quality.
# get tRNA & save it in df
# scrape web for it????
for row in range(1):
    assembly = df.at[row, 'Assembly']
    url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{assembly}/download?include_annotation_type=GENOME_FASTA"
    res = requests.get(url)  # this returns a zip folder

    # Define the filename for the downloaded zip file
    zip_filename = f"{assembly}.zip"
    # Save the zip file
    with tarfile.open(zip_filename, 'w|gz') as f:
        for chunk in res.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"Downloaded: {zip_filename}")
    extract_dir = f"{assembly}_dir"
    os.mkdir(extract_dir)

    with tarfile.open(zip_filename, 'r:*') as tar: #with handles data without actually opening the data and keeping it open, and as tar now makes the unzipped thing an item called tar that we can call on
        tar.extractall(extract_dir)
        pring('done extracting')
    # !ncbi-genome-download --section genbank --assembly-accessions {assembly} --formats fasta fungi


%cd {extract_dir}
%ls

AttributeError: 'TarFile' object has no attribute 'write'

In [24]:
import subprocess, requests, tarfile, os

# get fasta files
# unzip fasta
# barrnap fasta file for quality.
# get tRNA & save it in df
# scrape web for it????
for row in range(1):
    assembly = df.at[row, 'Assembly']
    url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{assembly}/download?include_annotation_type=GENOME_FASTA"
    res = requests.get(url)  # this returns a zip folder

    # Define the filename for the downloaded zip file
    zip_filename = f"{assembly}.zip"
    # Save the zip file
    # Use 'wb' mode to write binary data to the zip file
    with open(zip_filename, 'wb') as f: #Changed from tarfile to open to write the file
        for chunk in res.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"Downloaded: {zip_filename}")
    extract_dir = f"{assembly}_dir"
    os.mkdir(extract_dir)

    with tarfile.open(zip_filename, 'r:*') as tar: #with handles data without actually opening the data and keeping it open, and as tar now makes the unzipped thing an item called tar that we can call on
        tar.extractall(extract_dir)
        print('done extracting') #changed pring to print
    # !ncbi-genome-download --section genbank --assembly-accessions {assembly} --formats fasta fungi


%cd {extract_dir}
%ls

Downloaded: GCA_000002945.2.zip


ReadError: file could not be opened successfully:
- method gz: ReadError('not a gzip file')
- method bz2: ReadError('not a bzip2 file')
- method xz: ReadError('not an lzma file')
- method tar: ReadError('invalid header')

In [33]:
import subprocess, requests, zipfile, os #Changed tarfile to zipfile to open a zipfile
from zipfile import ZipFile
# get fasta files
# unzip fasta
# barrnap fasta file for quality.
# get tRNA & save it in df
# scrape web for it????
for row in range(1):
    assembly = df.at[row, 'Assembly']
    url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{assembly}/download?include_annotation_type=GENOME_FASTA"
    res = requests.get(url)  # this returns a zip folder

    # Define the filename for the downloaded zip file
    zip_filename = f"{assembly}.zip"
    # Save the zip file
    # Use 'wb' mode to write binary data to the zip file
    with open(zip_filename, 'wb') as f: #Changed from tarfile to open to write the file
        for chunk in res.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"Downloaded: {zip_filename}")
    extract_dir = f"{assembly}_dir"
    os.mkdir(extract_dir)

    with ZipFile(zip_filename, 'r') as zip_ref: #Changed from tarfile to zipfile to open and extract the zipfile
        zip_ref.extractall(extract_dir) #Changed from tar.extractall to zip_ref.extractall to work with a zipfile
    %cd {extract_dir}ncbi_dataset/data/{assembly}
    fasta = os.listdir() #fasta is a list of the name of files it finds
    print(fasta)
    %cd ../../..
    %ls
    # !ncbi-genome-download --section genbank --assembly-accessions {assembly} --formats fasta fungi

Downloaded: GCA_000002945.2.zip
[Errno 2] No such file or directory: 'GCA_000002945.2_dirncbi_dataset/data/GCA_000002945.2'
/content/fungal-temp-analysis
['temperature_data.tsv', 'LICENSE', 'delete_me.csv', 'eukaryotes_ncbi_temperatures.csv', '.git', 'FungiWork.ipynb', 'genbank', 'README.md', 'GCA_000002945.2.zip', 'GCA_000002945.2_dir']
/
[0m[01;36mbin[0m@                        [01;34mdev[0m/     [01;36mlib32[0m@   NGC-DL-CONTAINER-LICENSE  [01;34mroot[0m/  [30;42mtmp[0m/
[01;34mboot[0m/                       [01;34metc[0m/     [01;36mlib64[0m@   [01;34mopt[0m/                      [01;34mrun[0m/   [01;34mtools[0m/
[01;34mcontent[0m/                    [01;34mhome[0m/    [01;36mlibx32[0m@  [01;34mproc[0m/                     [01;36msbin[0m@  [01;34musr[0m/
cuda-keyring_1.1-1_all.deb  [01;34mkaggle[0m/  [01;34mmedia[0m/   [01;34mpython-apt[0m/               [01;34msrv[0m/   [01;34mvar[0m/
[01;34mdatalab[0m/                    [01;36mlib

In [32]:
%rm -rf {extract_dir}

In [38]:
%

'/'

In [None]:
%cd ncbi_dataset/data/{assembly}
%ls

In [None]:
duplicated_species = df.loc[df['duplicate'] == True]
duplicated_species

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C),species_root_name,duplicate
47,Cryptococcus neoformans var. neoformans JEC21,Eukaryota;Fungi;Basidiomycetes,GCA_000091045.1,27.0,Cryptococcus neoformans,True
1069,Saccharomyces cerevisiae x Saccharomyces kudri...,Eukaryota;Fungi;Ascomycetes,GCA_009665985.1,25.0,Saccharomyces cerevisiae,True
1495,Ogataea polymorpha,Eukaryota;Fungi;Ascomycetes,GCA_001664045.1,25.0,Ogataea polymorpha,True
1541,Saccharomyces cerevisiae x Saccharomyces uvarum,Eukaryota;Fungi;Ascomycetes,GCA_013180185.1,25.0,Saccharomyces cerevisiae,True
2300,Magnusiomyces capitatus NRRL Y-17686,Eukaryota;Fungi;Ascomycetes,GCA_900497725.1,25.0,Magnusiomyces capitatus,True
4451,Magnusiomyces capitatus CNRMA 12.647,Eukaryota;Fungi;Ascomycetes,GCA_000817185.1,25.0,Magnusiomyces capitatus,True
4962,Cryptococcus neoformans AD hybrid,Eukaryota;Fungi;Basidiomycetes,GCA_006992865.1,27.0,Cryptococcus neoformans,True
5187,Saccharomyces cerevisiae x Saccharomyces eubay...,Eukaryota;Fungi;Ascomycetes,GCA_009665555.1,25.0,Saccharomyces cerevisiae,True
5188,Saccharomyces cerevisiae x Saccharomyces eubay...,Eukaryota;Fungi;Ascomycetes,GCA_009666275.1,25.0,Saccharomyces cerevisiae,True
5189,Saccharomyces cerevisiae x Saccharomyces eubay...,Eukaryota;Fungi;Ascomycetes,GCA_009667055.1,25.0,Saccharomyces cerevisiae,True


In [None]:
# ----Keep as last cell----
# Use to save changes in repo that are not in this file.
# To save this file, use ctrl + s
# Then set commit message location, branch, etc.

# Change repo url to your forked url
url = "https://github.com/guth-metzlerr/fungal-temp-analysis.git"
# Change commit message
commit_and_push(url, "Added deleteme file for test")

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
To https://github.com/guth-metzlerr/fungal-temp-analysis.git
 [31m! [rejected]       [m main -> main (fetch first)
[31merror: failed to push some refs to 'https://github.com/guth-metzlerr/fungal-temp-analysis.git'
[m[33mhint: Updates were rejected because the remote contains work that you do[m
[33mhint: not have locally. This is usually caused by another repository pushing[m
[33mhint: to the same ref. You may want to first integrate the remote changes[m
[33mhint: (e.g., 'git pull ...') before pushing again.[m
[33mhint: See the 'Note about fast-forwards' in 'git push --help' for details.[m
Changes Saved to GitHub!
