# Learning objectives

1. Introduce pandas
1. More matplotlib

# Load packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load K4me3.bed

In [None]:
# names= supplies the column labels, so pandas reads every line of the file as data.
bed_columns = ["chr", "start", "end"]
df_k4 = pd.read_table(
    "https://github.com/bxlab/qbb2020/raw/master/data/K4me3.bed",
    names=bed_columns,
)
df_k4

In [None]:
# Show what kind of object pd.read_table() returned
type( df_k4 )

# Work with data frames

In [None]:
# Column labels (the names passed to read_table via names=)
df_k4.columns

In [None]:
# Row labels
df_k4.index

In [None]:
# The underlying data as a NumPy array, without the row/column labels
df_k4.values

In [None]:
# (number of rows, number of columns)
df_k4.shape

In [None]:
# .loc selects by label: the row labelled 0 and the column labelled "chr" -> a single value
df_k4.loc[0,"chr"]

In [None]:
# ":" keeps every row, so this returns the whole "chr" column
df_k4.loc[:,"chr"]

# Chromosomes

## Demo

In [None]:
# Pull out the "chr" column and tally how many rows carry each distinct value.
chr_k4 = df_k4.loc[:, "chr"].value_counts()
chr_k4

In [None]:
# Show what kind of object value_counts() returned
type( chr_k4 )

In [None]:
# One entry per distinct value found in the "chr" column
chr_k4.shape

In [None]:
# The counts are labelled by the "chr" values, so .loc looks one up by name
chr_k4.loc["2L"]

In [None]:
# Passing a list of labels to .loc selects several entries at once, in list order
coi = ["2L", "2R"]
chr_k4.loc[ coi ]

In [None]:
# The index of the counts holds the distinct "chr" values
chr_k4.index

In [None]:
# One bar per entry in chr_k4: the label gives the x position, the count gives the height.
fig, ax = plt.subplots()
ax.bar(x=chr_k4.index, height=chr_k4)
plt.show()

## Question 1: Tidy up K4

In [None]:
# Labels used to subset (and order) the per-chromosome counts in the plots that follow
coi = [ "2L", "2R", "3L", "3R", "4", "X" ]

In [None]:
# Bar chart of the K4 counts restricted to the labels in coi, with title and axis labels.
fig, ax = plt.subplots()
k4_counts = chr_k4.loc[coi]
ax.bar(k4_counts.index, k4_counts)
ax.set(
    title="K4 Modifications",
    xlabel="Chromosome",
    ylabel="Number of Regions",
)
plt.show()

## Question 2: Subplot K9 K27

In [None]:
# K9me3: read the table with the same column names as K4, then count rows per "chr" value.
df_k9 = pd.read_table(
    "https://github.com/bxlab/qbb2020/raw/master/data/K9me3.bed",
    names=bed_columns,
)
chr_k9 = df_k9.loc[:, "chr"].value_counts()

# K27me3: same two steps.
df_k27 = pd.read_table(
    "https://github.com/bxlab/qbb2020/raw/master/data/K27me3.bed",
    names=bed_columns,
)
chr_k27 = df_k27.loc[:, "chr"].value_counts()

In [None]:
# Three side-by-side bar charts, one per modification, each restricted to the labels in coi.
fig, ax = plt.subplots(ncols=3)
fig.suptitle("Histone Modifications")

panels = [("K4", chr_k4), ("K9", chr_k9), ("K27", chr_k27)]
for axis, (title, counts) in zip(ax, panels):
    selected = counts.loc[coi]
    axis.bar(selected.index, selected)
    axis.set_title(title)

# Only the left panel gets a y label and only the middle panel an x label.
ax[0].set_ylabel("Number of Regions")
ax[1].set_xlabel("Chromosome")
plt.show()

# Widths

## Demo

In [None]:
# Vectorized subtraction: one "end - start" per row, stored as a new "width" column.
k4_start = df_k4.loc[:, "start"]
k4_end = df_k4.loc[:, "end"]
df_k4.loc[:, "width"] = k4_end - k4_start
df_k4

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_k4.describe()

In [None]:
# Histogram of the K4 "width" column using matplotlib's default binning.
fig, ax = plt.subplots()
k4_widths = df_k4.loc[:, "width"]
ax.hist(k4_widths)
plt.show()

## Question 1: Subplot K9 K27

In [None]:
# Add a "width" column (end - start) to the K9 and K27 frames, as was done for K4.
for frame in (df_k9, df_k27):
    frame.loc[:, "width"] = frame.loc[:, "end"] - frame.loc[:, "start"]

In [None]:
# Three side-by-side histograms of the "width" column, one per modification.
fig, ax = plt.subplots(ncols=3)
fig.suptitle("Histone Modifications")

titles = ("K4", "K9", "K27")
frames = (df_k4, df_k9, df_k27)
for axis, title, frame in zip(ax, titles, frames):
    axis.hist(frame.loc[:, "width"])
    axis.set_title(title)

ax[0].set_ylabel("Number of Features")
# style="plain" turns off scientific notation on the middle panel's tick labels.
ax[1].ticklabel_format(style="plain")
ax[1].set_xlabel("Nucleotides")
plt.show()

## Advanced: One plot

Can you find one or more ways to make a single, informative plot that shows all three modifications? Some suggestions:
- `np.log2()`
- `ax.hist( range, density, cumulative, histtype )`

In [None]:
# Overlay the three width distributions on a single axis.
# - np.log2 compresses the widths so all three distributions fit on one readable x axis.
# - density=True normalizes each histogram to unit area, so modifications with
#   different numbers of rows can be compared directly.
# - histtype="step" draws outlines only, so the overlaid histograms do not hide each other.
fig, ax = plt.subplots()
for label, frame in (("K4", df_k4), ("K9", df_k9), ("K27", df_k27)):
    ax.hist( np.log2(frame.loc[:,"width"]), density=True, histtype="step", label=label )
ax.set_title( "Histone Modifications" )
ax.set_xlabel( "log2(nucleotides)" )
# These histograms are not cumulative (no cumulative=True), so the y axis is a
# plain probability density, not a cumulative one.
ax.set_ylabel( "Density" )
ax.legend()
plt.show()

In [None]:
# Overlay cumulative, normalized step histograms of the widths for all three modifications.
# NOTE(review): range=(0,1e5) makes hist ignore widths above 1e5, so each curve is
# normalized over the widths inside that range only - confirm this is intended.
fig, ax = plt.subplots()
cdf_options = dict(range=(0,1e5), density=True, cumulative=True, histtype="step")
for label, frame in (("K4", df_k4), ("K9", df_k9), ("K27", df_k27)):
    ax.hist(frame.loc[:, "width"], label=label, **cdf_options)
ax.set(
    title="Histone Modifications",
    xlabel="Nucleotides",
    ylabel="Cumulative density",
)
ax.legend(loc="lower right")
plt.show()

# Summary

1. Introduce pandas
    - `read_table()`
    - `.columns` `.index` `.values`
    - `.shape`
    - `.loc[]`
    - `.value_counts()`
    - Vectorized operations
    - `.describe()`
1. More matplotlib
    - `ax.bar()`
    - `ax.hist()`