### Data 88: Data Science for Genetics and Genomics - 09/17/19

# *Lab 1 - Comparative Genomics with EDA*

### by Jonathan Fischer (adapted in part from work by Shishi Luo) 

In [None]:
# Import the necessary modules
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as sp
plt.style.use('fivethirtyeight')

## Let's explore genome sizes for some commonly studied organisms

In [None]:
# Load the table of model organisms into a table named model_species.
#
# Hints:
# table_name = Table.read_table('filename')
# filename = https://raw.githubusercontent.com/ds-connectors/Data88-Genetics_and_Genomics/master/Lab01/model_species.csv

# Keep the address in its own name so the read_table call stays short.
model_species_url = 'https://raw.githubusercontent.com/ds-connectors/Data88-Genetics_and_Genomics/master/Lab01/model_species.csv'
model_species = Table.read_table(model_species_url)


In [None]:
# Display the whole table with the table_name.show() command.
# Evaluating table_name on its own shows only the first 10 rows. Try it out!
# table_name.show()

# Alternative (first 10 rows only): model_species
model_species.show()

In [None]:
# Rank the organisms by their genome size.
# Ascending order:  table_name.sort('Column_name').show()
# Descending order: table_name.sort('Column_name', descending = True).show()
# Try both ways out.

# Ascending sort by genome size; swap in the commented line for descending.
species_by_size = model_species.sort('Size')
# species_by_size = model_species.sort('Size', descending = True)
species_by_size.show()

In [None]:
# Can we extract just the organisms with more than 60,000 genes? This is the rough number that humans have.
# Rows are selected with table_name.where(table_name['Column_name'] ~ condition),
# where ~ stands for the comparison operator. In this case, it's > for greater than.

# One True/False entry per row: True where the gene count exceeds 60,000.
exceeds_gene_threshold = model_species['Genes'] > 60000
model_species.where(exceeds_gene_threshold)

## Let's compare the genome sizes of some pathogens

In [None]:
# Load the pathogen information into a table named pathogens.
# See Cell 2 for a hint
# filename is https://raw.githubusercontent.com/ds-connectors/Data88-Genetics_and_Genomics/master/Lab01/pathogens.csv

# Keep the address in its own name so the read_table call stays short.
pathogens_url = 'https://raw.githubusercontent.com/ds-connectors/Data88-Genetics_and_Genomics/master/Lab01/pathogens.csv'
pathogens = Table.read_table(pathogens_url)


In [None]:
# Display the full table. How many organisms does it have? Hint: Table.num_rows
# See Cell 3 if you need a hint

# pathogens.num_rows
# Sort by gene count (ascending) first, then display every row.
pathogens_by_genes = pathogens.sort('Genes', descending=False)
pathogens_by_genes.show()

With so many organisms, it can be hard to interpret the table. Histograms are a great way to visualize the distribution of a quantity of interest.

In [None]:
# Histogram of genome sizes (in megabases)
# Syntax: object.hist('column_name', bins = b, normed = n)
#   b is the number of bins in the histogram
#   n is True or False: whether bin heights are normalized by the number of observations
# Choose 20 bins and normed = False
# NOTE(review): recent matplotlib releases renamed `normed` to `density`;
# confirm which keyword the installed datascience version expects.

pathogens.hist('Size', bins = 20, normed = False)

In [None]:
# Histograms of genome sizes split by subgroup (aka pivot histograms)
# Let's normalize by frequency here because each group may have a different number of organisms
# Syntax: object.hist('column_name', group = variable_to_group_by, bins = b, normed = n)

pathogens.hist('Size', group = 'Subgroup', 
                   normed=True, orientation='vertical', bins = 100)

In [None]:
# Scatterplot of the number of genes (Y axis) against the genome size (X axis) in pathogens,
# followed by the Pearson and Spearman correlations of the same two columns.
# Plot syntax: object.scatter('X_column', 'Y_column')
# Correlation syntax: print(sp.pearsonr(table['Variable1'], table['Variable2'])) or
#                     print(sp.spearmanr(table['Variable1'], table['Variable2']))

# Pull the two columns out once so both correlations use the same inputs.
pathogen_sizes = pathogens['Size']
pathogen_genes = pathogens['Genes']

pathogens.scatter('Size', 'Genes')
# Pearson first, then Spearman.
for correlation in (sp.pearsonr, sp.spearmanr):
    print(correlation(pathogen_sizes, pathogen_genes))

In [None]:
# Scatterplot of the number of proteins (Y axis) against the number of genes (X axis) in pathogens,
# followed by the respective Pearson and Spearman correlations.
# Plot syntax: object.scatter('X_column', 'Y_column')

# Pull the two columns out once so both correlations use the same inputs.
pathogen_genes = pathogens['Genes']
pathogen_proteins = pathogens['Proteins']

pathogens.scatter('Genes', 'Proteins')
# Pearson first, then Spearman.
for correlation in (sp.pearsonr, sp.spearmanr):
    print(correlation(pathogen_genes, pathogen_proteins))

In [None]:
# Compute the mean, median, standard deviation, and interquartile range for
# the genome sizes, number of genes, and number of proteins.
# mean: np.mean, median: np.median, standard deviation: np.std, IQR: sp.iqr
# Put these into a table named pathogen_summary with columns Size, Genes, and Proteins.

# Example of how to construct a table
#t = Table().with_columns([
#    'letter', ['a', 'b', 'c', 'z'],
#    'count',  [  9,   3,   3,   1],
#    'points', [  1,   2,   2,  10],
#])

# Row order of the summary table: mean, median, standard deviation, IQR.
summary_statistics = (np.mean, np.median, np.std, sp.iqr)

# Build the alternating [label, values, label, values, ...] list that with_columns expects.
pathogen_summary_columns = []
for column_label in ('Size', 'Genes', 'Proteins'):
    column_values = [statistic(pathogens[column_label]) for statistic in summary_statistics]
    pathogen_summary_columns.extend([column_label, column_values])

pathogen_summary = Table().with_columns(pathogen_summary_columns)

pathogen_summary
                                                                                                                  

## Let's repeat this analysis but for animals

In [None]:
# Load the animal information into a table named animals.
# See Cell 2 for a hint
# filename is https://raw.githubusercontent.com/ds-connectors/Data88-Genetics_and_Genomics/master/Lab01/animals.csv

# Keep the address in its own name so the read_table call stays short.
animals_url = 'https://raw.githubusercontent.com/ds-connectors/Data88-Genetics_and_Genomics/master/Lab01/animals.csv'
animals = Table.read_table(animals_url)

In [None]:
# Display the full table. How many organisms does it have? Hint: Table.num_rows
# See Cell 3 if you need a hint

# animals.num_rows
# Sort by gene count (ascending) first, then display every row.
animals_by_genes = animals.sort('Genes', descending=False)
animals_by_genes.show()

In [None]:
# Histogram of animal genome sizes (in megabases), using 20 bins and
# raw (un-normalized) bin heights.

animals.hist('Size', bins = 20, normed = False)

In [None]:
# Histograms of genome sizes split by Subgroup (aka pivot histograms)
# Let's normalize by frequency here because each group may have a different number of organisms

animals.hist('Size', group = 'Subgroup', 
                   normed=True, orientation='vertical')

In [None]:
# Scatterplot of the number of genes (Y axis) against the genome size (X axis) in animals,
# followed by the Pearson and Spearman correlations of the same two columns.

# Pull the two columns out once so both correlations use the same inputs.
animal_sizes = animals['Size']
animal_genes = animals['Genes']

animals.scatter('Size', 'Genes')
# Pearson first, then Spearman.
for correlation in (sp.pearsonr, sp.spearmanr):
    print(correlation(animal_sizes, animal_genes))

In [None]:
# Scatterplot of the number of proteins (Y axis) against the number of genes (X axis) in animals,
# followed by the Pearson and Spearman correlations of the same two columns.

# Pull the two columns out once so both correlations use the same inputs.
animal_genes = animals['Genes']
animal_proteins = animals['Proteins']

animals.scatter('Genes', 'Proteins')
# Pearson first, then Spearman.
for correlation in (sp.pearsonr, sp.spearmanr):
    print(correlation(animal_genes, animal_proteins))

In [None]:
# Table of summary statistics for the animals (named animal_summary).
# Columns are Size, Genes, and Proteins; the rows are, in order,
# the mean, median, standard deviation, and interquartile range of each column.

# The table must be bound to the exact name animal_summary: the last line of
# this cell displays it by that name.
animal_summary = Table().with_columns([
    'Size', [np.mean(animals['Size']), np.median(animals['Size']), np.std(animals['Size']), sp.iqr(animals['Size'])],
    'Genes', [np.mean(animals['Genes']), np.median(animals['Genes']), np.std(animals['Genes']), sp.iqr(animals['Genes'])],
    'Proteins', [np.mean(animals['Proteins']), np.median(animals['Proteins']), np.std(animals['Proteins']), sp.iqr(animals['Proteins'])],
])

animal_summary

# Bonus exercises

## Comparing pathogens and animals

In [None]:
# We need to merge the tables before comparing. Name the new table merged

# First, add a new column named 'Type' to each table which says either 'Pathogens' or 'Animals'
# Hint: table.append_column(column_name, word)

pathogens.append_column('Type', 'Pathogens')
animals.append_column('Type', 'Animals')

# Start merged as a COPY of pathogens, then use the table.append(table2) syntax to add the animals.
# append works in place, so appending to pathogens itself (or to an alias of it)
# would add the animal rows to the pathogens table and add them again on every re-run of this cell.
merged = pathogens.copy()
merged.append(animals)

merged.show()

In [None]:
# Make the normalized pivot histogram of genome sizes with 50 bins,
# drawing one histogram per value of the 'Type' column.
merged.hist('Size', group = 'Type', 
                   normed=True, orientation='vertical', bins = 50)

### The scaling is kind of distorted because of the wide range of values. Perhaps a log transformation would make things look better?

In [None]:
# Append a new column named 'Log size' holding the log10 of the 'Size' column (np.log10).
# Remember that these are in Mb, so you should add 6 after taking the log!

# Adding 6 on the log10 scale is the same as multiplying the size by 10**6.
log_genome_size = np.log10(merged['Size']) + 6
merged.append_column('Log size', log_genome_size)

In [None]:
# Make the normalized pivot histogram of log genome sizes ('Log size') with 50 bins,
# drawing one histogram per value of the 'Type' column.
merged.hist('Log size', group = 'Type', 
                   normed=True, orientation='vertical', bins = 50)