In [1]:
import numpy as np
import pandas as pd

# Load the tab-separated table of eukaryote genome records into a DataFrame.
# The file name must match the file on disk exactly (no trailing whitespace).
euk = pd.read_csv("eukaryotes.tsv", sep="\t")
euk

Unnamed: 0,Species,Kingdom,Class,Size (Mb),GC%,Number of genes,Number of proteins,Publication year,Assembly status
0,Emiliania huxleyi CCMP1516,Protists,Other Protists,167.676000,64.5,38549,38554,2013,Scaffold
1,Arabidopsis thaliana,Plants,Land Plants,119.669000,36.0529,38311,48265,2001,Chromosome
2,Glycine max,Plants,Land Plants,979.046000,35.1153,59847,71219,2010,Chromosome
3,Medicago truncatula,Plants,Land Plants,412.924000,34.047,37603,41939,2011,Chromosome
4,Solanum lycopersicum,Plants,Land Plants,828.349000,35.6991,31200,37660,2010,Chromosome
...,...,...,...,...,...,...,...,...,...
8297,Saccharomyces cerevisiae,Fungi,Ascomycetes,3.993920,38.2,-,-,2017,Scaffold
8298,Saccharomyces cerevisiae,Fungi,Ascomycetes,0.586761,38.5921,155,298,1992,Chromosome
8299,Saccharomyces cerevisiae,Fungi,Ascomycetes,12.020400,38.2971,-,-,2018,Chromosome
8300,Saccharomyces cerevisiae,Fungi,Ascomycetes,11.960900,38.2413,-,-,2018,Chromosome


In [2]:
# Explicit dtypes for read_csv: text columns use the pandas "string" dtype and
# the count columns use the nullable "Int64" dtype so they can hold missing
# values.
my_types = {
    **dict.fromkeys(["Species", "Kingdom", "Class", "Assembly status"], "string"),
    **dict.fromkeys(["Number of genes", "Number of proteins"], "Int64"),
}

# A "-" entry in the file stands for a missing value, so parse it as NA.
euk = pd.read_csv(
    "eukaryotes.tsv",
    sep="\t",
    dtype=my_types,
    na_values=["-"],
)
euk.dtypes

Species                string
Kingdom                string
Class                  string
Size (Mb)             float64
GC%                   float64
Number of genes         Int64
Number of proteins      Int64
Publication year        int64
Assembly status        string
dtype: object

In [3]:
# Indexing the DataFrame with a single column label returns that column
# as a pandas Series.

euk["Size (Mb)"]

0       167.676000
1       119.669000
2       979.046000
3       412.924000
4       828.349000
           ...    
8297      3.993920
8298      0.586761
8299     12.020400
8300     11.960900
8301     11.820700
Name: Size (Mb), Length: 8302, dtype: float64

In [6]:
sizes = euk["Size (Mb)"]

# Iterating over a Series yields its values one at a time; only the first
# ten are printed here.

for s in sizes.head(10):
    print(f"one size is {s} Megabases")

# What is the biggest value?
# Use the Series method rather than the Python builtin max(): it is
# vectorised and skips missing values instead of depending on their position.

sizes.max()

one size is 167.676 Megabases
one size is 119.669 Megabases
one size is 979.046 Megabases
one size is 412.924 Megabases
one size is 828.3489999999998 Megabases
one size is 4006.12 Megabases
one size is 374.423 Megabases
one size is 14547.3 Megabases
one size is 12.1571 Megabases
one size is 2135.08 Megabases


32396.4

In [11]:
# Descriptive statistics for a Series, printed in this order:
# minimum, maximum, mean, median, standard deviation, skewness.

print(sizes.min(), sizes.max(), sizes.mean(), sizes.median(), sizes.std(), sizes.skew())

# The 90th percentile
print(sizes.quantile(0.9))

# Series of the ten largest values (the index shows the original row labels).
print(sizes.nlargest(10))

# Randomly sample a number of values. A fixed random_state makes the sample
# identical on every run, so the notebook is reproducible.
print(sizes.sample(5, random_state=42))

0.011236 32396.4 401.91843726053935 39.5597 1111.5382890873323 11.096216999207364
1204.6580000000004
210     32396.4
957     27602.7
4670    26936.2
820     24633.1
798     22103.6
5022    15344.7
940     14673.2
7       14547.3
5992    13916.9
5693    13427.4
Name: Size (Mb), dtype: float64
1951     42.4870
4607    861.6290
7957     11.7185
4866     40.9266
7264     36.7167
Name: Size (Mb), dtype: float64


In [12]:
# Count how many rows carry each distinct species name.
# The result is a Series indexed by species, sorted from most to least frequent.

euk["Species"].value_counts()

Saccharomyces cerevisiae                           576
Homo sapiens                                       210
Pyricularia oryzae                                 199
Venturia inaequalis                                 85
Oryza rufipogon                                     62
                                                  ... 
Cryptococcus neoformans var. neoformans B-3501A      1
Rhizopus oryzae HUMC 02                              1
Coccidioides immitis H538.4                          1
Phytophthora infestans                               1
Saccharomyces cerevisiae CLIB382                     1
Name: Species, Length: 4936, dtype: Int64

In [13]:
# Broadcasting: the scalar is applied to every element of the Series.

sizes * 100_000

0       16767600.0
1       11966900.0
2       97904600.0
3       41292400.0
4       82834900.0
           ...    
8297      399392.0
8298       58676.1
8299     1202040.0
8300     1196090.0
8301     1182070.0
Name: Size (Mb), Length: 8302, dtype: float64

In [14]:
# Broadcasting: turn each GC percentage into a fraction, then subtract it
# from 1 to get the AT content (as a fraction).

1 - euk["GC%"] / 100

0       0.355000
1       0.639471
2       0.648847
3       0.659530
4       0.643009
          ...   
8297    0.618000
8298    0.614079
8299    0.617029
8300    0.617587
8301    0.617464
Name: GC%, Length: 8302, dtype: float64

In [15]:
# Arithmetic can also combine several columns element-wise.
# Gene density of each genome in genes per kilobase: "Size (Mb)" * 1000 is the
# genome size in kb. Rows with a missing gene count produce <NA>.

euk["Number of genes"] / (euk["Size (Mb)"] * 1000)

0       0.229902
1       0.320141
2       0.061128
3       0.091065
4       0.037665
          ...   
8297        <NA>
8298    0.264162
8299        <NA>
8300        <NA>
8301        <NA>
Length: 8302, dtype: Float64

In [16]:
# NumPy universal functions (ufuncs) operate element-wise on a whole Series
# and return a Series with the same index. Here: the natural log of each size.

np.log(sizes)

0       5.122034
1       4.784730
2       6.886579
3       6.023264
4       6.719435
          ...   
8297    1.384773
8298   -0.533138
8299    2.486605
8300    2.481643
8301    2.469852
Name: Size (Mb), Length: 8302, dtype: float64

In [17]:
# The + operator concatenates string columns element-wise; the plain string
# literals are broadcast to every row. Builds labels like "Species (Class)".

euk["Species"] + " (" + euk["Class"] + ")"

0       Emiliania huxleyi CCMP1516 (Other Protists)
1                Arabidopsis thaliana (Land Plants)
2                         Glycine max (Land Plants)
3                 Medicago truncatula (Land Plants)
4                Solanum lycopersicum (Land Plants)
                           ...                     
8297         Saccharomyces cerevisiae (Ascomycetes)
8298         Saccharomyces cerevisiae (Ascomycetes)
8299         Saccharomyces cerevisiae (Ascomycetes)
8300         Saccharomyces cerevisiae (Ascomycetes)
8301         Saccharomyces cerevisiae (Ascomycetes)
Length: 8302, dtype: string

In [21]:
# The .str accessor applies a string method to every element of the Series;
# here it upper-cases each species name.

print(euk["Species"].str.upper())

# Calls can be chained: take the natural log of every genome size, then
# average the logged values.

np.log(euk["Size (Mb)"]).mean()

0       EMILIANIA HUXLEYI CCMP1516
1             ARABIDOPSIS THALIANA
2                      GLYCINE MAX
3              MEDICAGO TRUNCATULA
4             SOLANUM LYCOPERSICUM
                   ...            
8297      SACCHAROMYCES CEREVISIAE
8298      SACCHAROMYCES CEREVISIAE
8299      SACCHAROMYCES CEREVISIAE
8300      SACCHAROMYCES CEREVISIAE
8301      SACCHAROMYCES CEREVISIAE
Name: Species, Length: 8302, dtype: string


4.201547332780901

In [24]:
# Median gene density across genomes.
# Density is genes per kilobase: "Size (Mb)" times 1000 is the size in kb.

density = euk["Number of genes"].div(euk["Size (Mb)"].mul(1000))
density.median()

0.2883939418821724

In [26]:
# Selecting multiple columns returns a DataFrame rather than a Series.
# The outer pair of [] does the column selection; the inner pair is an
# ordinary Python list holding the column names to keep.

euk[["Species", "Size (Mb)", "Number of genes"]].head()

Unnamed: 0,Species,Size (Mb),Number of genes
0,Emiliania huxleyi CCMP1516,167.676,38549
1,Arabidopsis thaliana,119.669,38311
2,Glycine max,979.046,59847
3,Medicago truncatula,412.924,37603
4,Solanum lycopersicum,828.349,31200


In [27]:
# Chaining again: natural log of each genome size, then the three largest
# logged values (the index keeps the original row labels).
np.log(euk["Size (Mb)"]).nlargest(3)

210     10.385803
957     10.225669
4670    10.201226
Name: Size (Mb), dtype: float64

In [28]:
# Use the "Species" column as the row labels (the index).
# set_index() returns a new DataFrame; `euk` itself is left unchanged, so the
# result has to be assigned or chained to be used further.

euk.set_index("Species").head()

Unnamed: 0_level_0,Kingdom,Class,Size (Mb),GC%,Number of genes,Number of proteins,Publication year,Assembly status
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Emiliania huxleyi CCMP1516,Protists,Other Protists,167.676,64.5,38549,38554,2013,Scaffold
Arabidopsis thaliana,Plants,Land Plants,119.669,36.0529,38311,48265,2001,Chromosome
Glycine max,Plants,Land Plants,979.046,35.1153,59847,71219,2010,Chromosome
Medicago truncatula,Plants,Land Plants,412.924,34.047,37603,41939,2011,Chromosome
Solanum lycopersicum,Plants,Land Plants,828.349,35.6991,31200,37660,2010,Chromosome


In [30]:
# The five largest gene counts, labelled by species name.
(
    euk
    .set_index("Species")
    ["Number of genes"]
    .nlargest(n=5)
)

Species
Brassica napus        119453
Vitis vinifera        112321
Arachis hypogaea      111051
Camelina sativa        97832
Papaver somniferum     91114
Name: Number of genes, dtype: Int64

In [34]:
# Order every gene count from smallest to largest; missing values are placed
# at the end.

(
    euk
    .set_index("Species")
    ["Number of genes"]
    .sort_values()
)

Species
Trichomalopsis sarcophagae       3
Brachionus plicatilis            4
Enteromyxum leei                 5
Kudoa iwatai                     7
Leucoraja erinacea              13
                              ... 
Saccharomyces cerevisiae      <NA>
Saccharomyces cerevisiae      <NA>
Saccharomyces cerevisiae      <NA>
Saccharomyces cerevisiae      <NA>
Saccharomyces cerevisiae      <NA>
Name: Number of genes, Length: 8302, dtype: Int64

In [35]:
# Order by the index labels (species names) instead of by the values.
(
    euk
    .set_index("Species")
    ["Number of genes"]
    .sort_index()
)

Species
Abeoforma whisleri          <NA>
Abrus precatorius          28735
Absidia glauca             15117
Absidia repens             15151
Acanthamoeba astronyxis     <NA>
                           ...  
fungal sp. EF0021           <NA>
fungal sp. Mo6-1            <NA>
fungal sp. No.11243         9730
fungal sp. No.14919        14606
uncultured Bathycoccus      <NA>
Name: Number of genes, Length: 8302, dtype: Int64

In [37]:
# Sort by species name, then discard the entries whose gene count is missing.

(
    euk
    .set_index("Species")
    ["Number of genes"]
    .sort_index()
    .dropna()
)

Species
Abrus precatorius                      28735
Absidia glauca                         15117
Absidia repens                         15151
Acanthamoeba castellanii str. Neff     15655
Acanthaster planci                     18187
                                       ...  
[Candida] intermedia                    6073
[Candida] pseudohaemulonis              5284
[Nectria] haematococca mpVI 77-13-4    15708
fungal sp. No.11243                     9730
fungal sp. No.14919                    14606
Name: Number of genes, Length: 2372, dtype: Int64

In [39]:
# Multi-line method chains: wrapping the whole expression in parentheses lets
# each step sit on its own line with its own comment.

(
    euk # start with the dataframe
    .set_index("Species") # use the species name as the index
    ["Number of genes"] # select the gene-count column as a Series
    .sort_index() # sort the Series by its index labels
    .dropna() # remove entries with missing data
)

Species
Abrus precatorius                      28735
Absidia glauca                         15117
Absidia repens                         15151
Acanthamoeba castellanii str. Neff     15655
Acanthaster planci                     18187
                                       ...  
[Candida] intermedia                    6073
[Candida] pseudohaemulonis              5284
[Nectria] haematococca mpVI 77-13-4    15708
fungal sp. No.11243                     9730
fungal sp. No.14919                    14606
Name: Number of genes, Length: 2372, dtype: Int64

In [44]:
# Element-wise comparison: True where the genome is larger than 500 Mb,
# False otherwise, labelled by species name.

euk.set_index("Species")["Size (Mb)"].gt(500)

Species
Emiliania huxleyi CCMP1516    False
Arabidopsis thaliana          False
Glycine max                    True
Medicago truncatula           False
Solanum lycopersicum           True
                              ...  
Saccharomyces cerevisiae      False
Saccharomyces cerevisiae      False
Saccharomyces cerevisiae      False
Saccharomyces cerevisiae      False
Saccharomyces cerevisiae      False
Name: Size (Mb), Length: 8302, dtype: bool