In [None]:
# https://pandas.pydata.org/pandas-docs/stable/index.html
# https://pandas.pydata.org/pandas-docs/stable/install.html#recommended-dependencies

# 1) Import libraries

In [None]:
%matplotlib inline
import pandas as pd 
import matplotlib.pyplot as plt 

# 2) Read in data

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/io.html
# https://pandas.pydata.org/pandas-docs/stable/10min.html#viewing-data

In [None]:
# Fetch the carnivora dataset from its remote CSV (network access required).
df = pd.read_csv(filepath_or_buffer="https://roualdes.us/data/carnivora.csv")
# Preview the final five rows as a quick check that the load worked.
df.tail(n=5)

In [None]:
# Enumerate every attribute and method name available on the 'LY' column object.
dir(df['LY'])

# 3) Visualization

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/visualization.html

In [None]:
# Scatter plot with 'LY' on the horizontal axis and 'SB' on the vertical axis.
df.plot.scatter(x='LY', y='SB')

In [None]:
# Histogram of the 'LY' column.
df.hist(column='LY')

In [None]:
# One box of 'LY' per 'Family'; tick labels are rotated 30 degrees.
df.boxplot(column='LY', by='Family', rot=30)

# 4) Data...

## 4.x) Select (columns)

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/dsintro.html#indexing-selection
# The "Chapter" notes below refer to this pandas cookbook:
# https://nbviewer.jupyter.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/

In [None]:
# Select one column by label and show its first five entries (cookbook Chapter 2).
df['LY'].head(n=5)

In [None]:
# Print the Python type of the object returned when selecting a single column.
print(type(df['LY'])) # Chapter 3.2

In [None]:
# Index with a list of labels to pull several columns at once; show five rows.
df[['Family', 'LY', 'AI']].head(n=5)

In [None]:
# Print the type of the full table object, for comparison with a single column.
print(type(df))

## 4.x) Filter

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/indexing.html
# https://pandas.pydata.org/pandas-docs/stable/missing_data.html

In [None]:
# Keep only the 'LY' values greater than 108: a boolean row mask and the column
# label are passed to .loc in a single lookup.
df.loc[df['LY'] > 108, 'LY'].head()

In [None]:
# Keep only the non-missing 'LY' values, using a boolean row mask with .loc.
df.loc[df['LY'].notna(), 'LY'].head()

## 4.x) Mutate

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/dsintro.html#data-alignment-and-arithmetic

In [None]:
# Add a derived column: the element-wise ratio of the 'SB' column to 'SW'.
df['brbo'] = df['SB'].div(df['SW'])
# Show the first rows of the new column.
df['brbo'].head()

## 4.x) Summarise

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/basics.html#descriptive-statistics
# https://pandas.pydata.org/pandas-docs/stable/groupby.html#aggregation

In [None]:
print(df['LY'].size)          # number of rows, missing values included
print(df['LY'].isna().sum())  # number of missing values
print(df['LY'].count())       # number of non-missing values
# Mean of the column, shown as the cell's result.
df['LY'].mean()

In [None]:
# Table of descriptive statistics for the single column 'LY'.
df['LY'].describe()

In [None]:
# The same descriptive statistics, computed for two columns side by side.
df[['LY', 'SB']].describe()

In [None]:
def third_value(x):
    """Return the third-smallest non-missing value of ``x``.

    Missing values (NaN / None) are dropped before sorting. NaN compares
    False against every value, so leaving it in would make the order produced
    by ``sorted`` -- and therefore the result -- depend on where the NaNs sit
    in the input.

    Parameters
    ----------
    x : iterable of mutually comparable scalars (e.g. a list or pandas Series)

    Returns
    -------
    The element at position 2 of the ascending-sorted non-missing values.

    Raises
    ------
    IndexError
        If ``x`` holds fewer than three non-missing values.
    """
    z = sorted(v for v in x if pd.notna(v))
    return z[2]
# Use the custom function as an aggregator over the two selected columns.
df[['LY', 'SB']].agg(third_value)

## 4.2) Group By

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/groupby.html
# https://pandas.pydata.org/pandas-docs/stable/groupby.html#iterating-through-groups

In [None]:
# Descriptive statistics of 'LY' computed separately within each 'Family' group.
df.groupby('Family')['LY'].describe()

In [None]:
import numpy as np
def confidence_int(x, z=1.96):
    """Normal-approximation confidence interval for the mean of ``x``.

    The interval is ``mean +/- z * std / sqrt(count)``, where ``count``,
    ``mean`` and ``std`` are taken from the corresponding methods of ``x``.

    Parameters
    ----------
    x : pandas Series (or any object providing ``count``, ``mean``, ``std``)
    z : float, default 1.96
        Critical value multiplying the standard error. The default is the
        usual two-sided 95% value of the standard normal distribution.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with columns ``'lower'`` and ``'upper'``.
    """
    n, xbar, sd = x.count(), x.mean(), x.std()
    half_width = z * sd / np.sqrt(n)
    # create a dataframe with a dict (Python's map)
    return pd.DataFrame({'lower': [xbar - half_width], 'upper': [xbar + half_width]})

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/groupby.html#flexible-apply
# Only a cell's final expression is rendered automatically, so the interval
# table is passed to display() explicitly; otherwise it would be computed and
# silently dropped. display() is provided by the IPython/Jupyter kernel.
display(df.groupby('Family')['LY'].apply(confidence_int).reset_index())
# Per-family mean of the 'brbo' column, shown as the cell's result.
df.groupby('Family')['brbo'].mean()

## 4.3) Transform

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/groupby.html#transformation

In [None]:
def z_score(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation.

    ``x`` must provide ``mean()`` and ``std()`` and support element-wise
    arithmetic (e.g. a pandas Series); the result has the same shape as ``x``.
    """
    centered = x - x.mean()
    return centered / x.std()

def scale(x):
    """Min-max scale ``x`` onto the interval [0, 1].

    Each value is mapped to ``(value - min) / (max - min)``, so the smallest
    value becomes 0 and the largest becomes 1. Dividing by the range, rather
    than by the maximum alone, is what makes the upper end land on 1.

    ``x`` must provide ``min()`` and ``max()`` and support element-wise
    arithmetic (e.g. a pandas Series). If every value is equal the range is
    zero and the division is 0/0.
    """
    lo = x.min()
    return (x - lo) / (x.max() - lo)

In [None]:
# Render the z-scores explicitly: a bare expression that is not the last line
# of a cell is evaluated and then discarded. display() comes from the
# IPython/Jupyter kernel.
display(df['LY'].transform(z_score).head())
# Store the rescaled column, then show its extremes as a quick sanity check.
df['LY_scaled'] = df['LY'].transform(scale)
df['LY_scaled'].min(), df['LY_scaled'].max()