In [35]:
import pandas as pd
import numpy as np

# Column dtypes applied while parsing. The nullable "Int64" dtype keeps the
# gene/protein counts as integers even when some rows have no value.
my_types = {
    "Species": "string",
    "Kingdom": "string",
    "Class": "string",
    "Assembly status": "string",
    "Number of genes": "Int64",
    "Number of proteins": "Int64",
}

# Load the table exactly once. An earlier untyped read of "eukaryotes.tsv "
# (note the trailing space) was dropped: its result was overwritten right away
# and the padded path is a different file name on POSIX systems.
# na_values=["-"] makes a lone "-" in the file parse as missing.
euk = pd.read_csv("eukaryotes.tsv", sep="\t", dtype=my_types, na_values=["-"])
euk

Unnamed: 0,Species,Kingdom,Class,Size (Mb),GC%,Number of genes,Number of proteins,Publication year,Assembly status
0,Emiliania huxleyi CCMP1516,Protists,Other Protists,167.676000,64.5000,38549,38554,2013,Scaffold
1,Arabidopsis thaliana,Plants,Land Plants,119.669000,36.0529,38311,48265,2001,Chromosome
2,Glycine max,Plants,Land Plants,979.046000,35.1153,59847,71219,2010,Chromosome
3,Medicago truncatula,Plants,Land Plants,412.924000,34.0470,37603,41939,2011,Chromosome
4,Solanum lycopersicum,Plants,Land Plants,828.349000,35.6991,31200,37660,2010,Chromosome
...,...,...,...,...,...,...,...,...,...
8297,Saccharomyces cerevisiae,Fungi,Ascomycetes,3.993920,38.2000,,,2017,Scaffold
8298,Saccharomyces cerevisiae,Fungi,Ascomycetes,0.586761,38.5921,155,298,1992,Chromosome
8299,Saccharomyces cerevisiae,Fungi,Ascomycetes,12.020400,38.2971,,,2018,Chromosome
8300,Saccharomyces cerevisiae,Fungi,Ascomycetes,11.960900,38.2413,,,2018,Chromosome


In [22]:
# How many fungal species have genomes bigger than 100Mb? What are their names?
# The cell displays the first ten matching species names.

is_fungus = euk["Kingdom"] == "Fungi"
is_large = euk["Size (Mb)"] > 100
idx = is_fungus & is_large
euk.loc[idx, "Species"].to_list()[:10]

['Blumeria graminis f. sp. hordei DH14',
 'Puccinia triticina 1-1 BBBD Race 1',
 'Tuber melanosporum',
 'Puccinia striiformis f. sp. tritici',
 'Melampsora larici-populina 98AG31',
 'Ophiocordyceps sinensis',
 'Gigaspora rosea',
 'Leucoagaricus gongylophorus Ac12',
 'Hemileia vastatrix HvCat',
 'Cenococcum geophilum 1.58']

In [40]:
# How many genomes are there for each Kingdom (plants, animals, fungi, protists and other), and how many unique species names?
# This cell answers the first half: rows per kingdom, most frequent first.

euk.loc[:, "Kingdom"].value_counts()


Fungi       4494
Animals     2181
Plants       870
Protists     727
Other         30
Name: Kingdom, dtype: Int64

In [45]:
# Number of distinct species names among the rows whose kingdom is "Plants"
euk.loc[euk["Kingdom"] == "Plants", "Species"].nunique()

464

In [47]:
# Distinct species names per kingdom, using a hand-written kingdom order.
for kingdom in ("Plants", "Fungi", "Animals", "Protists", "Other"):
    in_kingdom = euk["Kingdom"] == kingdom
    print(kingdom, euk.loc[in_kingdom, "Species"].nunique())

Plants 464
Fungi 2554
Animals 1442
Protists 449
Other 27


In [84]:
# Same per-kingdom species count, with the kingdom names taken from the data
# instead of being typed by hand.
# NOTE(review): "list_kigdoms" is a misspelling of "kingdoms"; the name is kept
# as-is in case other cells reference it.
list_kigdoms = euk["Kingdom"].unique()

for kingdom in list_kigdoms:
    in_kingdom = euk["Kingdom"] == kingdom
    print(kingdom, euk.loc[in_kingdom, "Species"].nunique())

Protists 449
Plants 464
Fungi 2554
Animals 1442
Other 27


In [72]:
# Make a new dataframe containing just the rows for the Aquila genus

# Step 1: split every species name on spaces, giving a list of words per row.
# Step 2: take the first word of each list, which is the genus name.
# Step 3: compare against "Aquila" to obtain one boolean per row.
genus = euk["Species"].str.split(" ").str.get(0)
genus == "Aquila"

0       False
1       False
2       False
3       False
4       False
        ...  
8297    False
8298    False
8299    False
8300    False
8301    False
Name: Species, Length: 8302, dtype: bool

In [74]:
# Keep only the rows where the first word of the species name is exactly "Aquila"

is_aquila = euk["Species"].str.split(" ").str.get(0) == "Aquila"
euk[is_aquila]

Unnamed: 0,Species,Kingdom,Class,Size (Mb),GC%,Number of genes,Number of proteins,Publication year,Assembly status
1755,Aquila chrysaetos canadensis,Animals,Birds,1192.74,41.9001,17520.0,31284.0,2014,Scaffold
4388,Aquila chrysaetos canadensis,Animals,Birds,1548.48,43.5,,,2014,Scaffold
5342,Aquila chrysaetos chrysaetos,Animals,Birds,1228.51,42.2,,,2018,Scaffold


In [77]:
# Prefix match instead of splitting: the space after "Aquila" excludes things like "Aquilaria"
starts_with_aquila = euk["Species"].str.startswith("Aquila ")
euk[starts_with_aquila]

Unnamed: 0,Species,Kingdom,Class,Size (Mb),GC%,Number of genes,Number of proteins,Publication year,Assembly status
1755,Aquila chrysaetos canadensis,Animals,Birds,1192.74,41.9001,17520.0,31284.0,2014,Scaffold
4388,Aquila chrysaetos canadensis,Animals,Birds,1548.48,43.5,,,2014,Scaffold
5342,Aquila chrysaetos chrysaetos,Animals,Birds,1228.51,42.2,,,2018,Scaffold


In [93]:
# In which assembly status are the most insect genomes?

# For genomes of all classes together the same question would be answered by:
#     euk["Assembly status"].value_counts().head(1)
# where .head(1) keeps only the most frequent status.

# Restrict to insects first, then count the statuses.
insect_status = euk.loc[euk["Class"] == "Insects", "Assembly status"]
insect_status.value_counts().head(1)

Scaffold    497
Name: Assembly status, dtype: Int64

In [112]:
# Which is the most common status for every different class?

list_classes = euk["Class"].unique()

for class_name in list_classes:
    status_counts = euk.loc[euk["Class"] == class_name, "Assembly status"].value_counts()
    # value_counts() sorts by frequency, so the first index label is the top status
    most_common = status_counts.index[0]
    print(f"Most {class_name} genomes are assembled to {most_common} status ")

# A plain variant without the sentence would print just the class and the status.

Most Other Protists genomes are assembled to Scaffold status 
Most Land Plants genomes are assembled to Scaffold status 
Most Ascomycetes genomes are assembled to Scaffold status 
Most Basidiomycetes genomes are assembled to Scaffold status 
Most Kinetoplasts genomes are assembled to Scaffold status 
Most Apicomplexans genomes are assembled to Scaffold status 
Most Other Fungi genomes are assembled to Scaffold status 
Most Roundworms genomes are assembled to Scaffold status 
Most Insects genomes are assembled to Scaffold status 
Most Fishes genomes are assembled to Scaffold status 
Most Other Animals genomes are assembled to Scaffold status 
Most Mammals genomes are assembled to Scaffold status 
Most Other genomes are assembled to Scaffold status 
Most Amphibians genomes are assembled to Chromosome status 
Most Birds genomes are assembled to Scaffold status 
Most Green Algae genomes are assembled to Scaffold status 
Most Flatworms genomes are assembled to Scaffold status 
Most Reptiles

In [152]:
# Most common assembly status per class, printed as "<class> <status>".
list_classes = euk["Class"].unique()

for class_name in list_classes:
    status_counts = euk.loc[euk["Class"] == class_name, "Assembly status"].value_counts()
    # .index[0] gives only the label; .head(1) would show the count as well
    print(class_name, status_counts.index[0])

Other Protists Scaffold
Land Plants Scaffold
Ascomycetes Scaffold
Basidiomycetes Scaffold
Kinetoplasts Scaffold
Apicomplexans Scaffold
Other Fungi Scaffold
Roundworms Scaffold
Insects Scaffold
Fishes Scaffold
Other Animals Scaffold
Mammals Scaffold
Other Scaffold
Amphibians Chromosome
Birds Scaffold
Green Algae Scaffold
Flatworms Scaffold
Reptiles Scaffold
Other Plants Scaffold


In [150]:
# Number of genomes in each assembly status over the whole table, most
# frequent first. Append .head(1) to keep only the top row, or .index[0] to
# get just its label.
# (A bare column-selection expression that used to precede this line was
# removed: it was not the cell's last expression, so its result was discarded.)
euk["Assembly status"].value_counts()

Scaffold    5437
Name: Assembly status, dtype: Int64

In [118]:
# Which genomes have at least 10% more proteins than genes?
# i.e. proteins / genes >= 1.1

protein_gene_ratio = euk["Number of proteins"] / euk["Number of genes"]
euk[protein_gene_ratio >= 1.1]

Unnamed: 0,Species,Kingdom,Class,Size (Mb),GC%,Number of genes,Number of proteins,Publication year,Assembly status
1,Arabidopsis thaliana,Plants,Land Plants,119.669000,36.0529,38311,48265,2001,Chromosome
2,Glycine max,Plants,Land Plants,979.046000,35.1153,59847,71219,2010,Chromosome
3,Medicago truncatula,Plants,Land Plants,412.924000,34.0470,37603,41939,2011,Chromosome
4,Solanum lycopersicum,Plants,Land Plants,828.349000,35.6991,31200,37660,2010,Chromosome
6,Oryza sativa Japonica Group,Plants,Land Plants,374.423000,43.5769,35219,42580,2015,Chromosome
...,...,...,...,...,...,...,...,...,...
6487,Fusarium oxysporum f. sp. melonis 26406,Fungi,Ascomycetes,54.034300,47.5000,20030,26719,2012,Scaffold
6523,Fusarium oxysporum Fo47,Fungi,Ascomycetes,49.664600,47.7000,18553,24818,2012,Scaffold
6626,Arabidopsis thaliana,Plants,Land Plants,93.654500,36.0433,16842,20111,2000,Chromosome
6781,Mus musculus,Animals,Mammals,3251.250000,41.8306,31682,45437,2005,Chromosome


In [71]:
# Scratch cell: the different ways of quoting string literals.
fname = '123123.txt'

# Double quotes can appear unescaped inside a single-quoted literal ...
print('asd "asd"  asd')
# ... and single quotes inside a double-quoted one.
print("asd 'asd'  asd")

# A triple-quoted literal may span several lines; the line breaks are part of the string.
print("""test
test
tssss
""")

# An f-string substitutes the value of fname into the text.
print(f'my file: {fname}')

asd "asd"  asd
asd 'asd'  asd
test
test
tssss

my file: 123123.txt
