In [2]:
import pandas as pd
import numpy as np

### Read in some practice data

In [3]:
## Titanic data
from sklearn.datasets import fetch_openml
# Fetch OpenML dataset id 40945 (the Titanic data referred to above).
dat = fetch_openml(data_id=40945, parser='auto')
# Keep the dataset's DataFrame under the name `boat`, which later cells use.
boat = dat.frame

In [4]:
## Seaborn example datasets: iris, tips and penguins
import seaborn as sns
iris = sns.load_dataset('iris')
tips = sns.load_dataset('tips')
pen = sns.load_dataset('penguins')

## Mapping

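`Series.map` looks each value up in a dictionary (or passes it to a function) and returns the result. Because several keys can share the same target value, this method can also be used to combine categories.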

In [53]:
# Toy table of deli items: one row per item with its weight in ounces.
data = pd.DataFrame(
    {
        "food": [
            "bacon", "pulled pork", "bacon",
            "pastrami", "corned beef", "bacon",
            "pastrami", "honey ham", "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)


In [54]:
# Lookup table from each food item to the animal it comes from.
# Several foods share one animal, so mapping through it merges categories.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

In [55]:
# Look up every food in meat_to_animal and store the result as a new "animal"
# column; a food missing from the dictionary would come back as NaN.
data["animal"] = data["food"].map(meat_to_animal)

## Binning

In [None]:
# Inspect the age column before binning it.
boat.age

In [10]:
# Explicit age bin edges. pd.cut uses right-closed intervals by default, so
# these give (0, 5], (5, 50] and (50, 100]; an age of exactly 0 would fall
# outside the first bin.
bins = [0,5,50,100]
# Pass the edges defined above. The original call used bins=3, which ignores
# `bins` and instead splits the observed age range into three equal-width
# intervals. One label is supplied per interval.
age_cats = pd.cut(boat.age, bins=bins, labels=['young','medium','old'])

In [None]:
# Count the rows in each age category (missing ages are not counted by default).
age_cats.value_counts()

In [None]:
# qcut chooses bin edges from the sample quantiles rather than fixed values;
# asking for 4 bins yields quartiles.
quartiles = pd.qcut(boat.age, 4)
# Count the rows that fall in each quartile bin.
quartiles.value_counts()

## Hierarchical Indexing

In [13]:
# 4x3 frame holding the integers 0..11 row by row, with columns A, B, C.
df = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    columns=["A", "B", "C"],
)

In [None]:
# Display the frame as constructed.
df

In [15]:
# Row and column labels can be replaced in place by assigning a list of the
# right length to .index / .columns.
df.index = ['a','b','c','d']
df.columns = ['one','two','three']

In [None]:
# Display the frame to see the new row and column labels.
df

In [17]:
# Assigning two equal-length lists builds a two-level (hierarchical) index:
# the first list is the outer level (level 0), the second the inner level
# (level 1). Note that the key ('b', 'g3') appears twice.
df.index = [["a", "a", "b", "b"], ['g1', 'g2', 'g3', 'g3']]

In [None]:
# Display the frame with its two-level row index.
df

In [None]:
# reset_index() returns a new frame with the index levels moved into ordinary
# columns; df itself is left unchanged because the result is not assigned.
df.reset_index()

In [None]:
# Select all rows whose outer-level label is 'a', keeping every column.
df.loc['a',:]

In [None]:
# pd.IndexSlice lets each index level be sliced separately:
# any outer-level label, inner-level label 'g3'.
df.loc[pd.IndexSlice[:, 'g3'], :]

In [None]:
# A tuple addresses one complete (outer, inner) key: outer 'a', inner 'g2'.
df.loc[('a','g2'),:]

In [None]:
# Sum the rows that share the same inner-level (level 1) label.
df.groupby(level=1).sum()

In [None]:
# Sum the rows that share the same outer-level (level 0) label.
df.groupby(level=0).sum()

In [None]:
# drop=True discards the index levels instead of turning them into columns,
# leaving a default integer index in the returned frame.
df.reset_index(drop=True)

## Filling with group means

In [35]:
# Small example table: eight states, each tagged with a region, where some
# `value` entries are deliberately missing (NaN).
state = [
    "Ohio", "New York", "Vermont", "Florida",
    "Oregon", "Nevada", "California", "Idaho",
]

# The first four states are labelled "East", the last four "West".
region = ["East"] * 4 + ["West"] * 4

value = [1, 2, 3, np.nan, np.nan, 6, 7, np.nan]

data = pd.DataFrame(dict(state=state, region=region, value=value))

In [None]:
# Display the table, including the missing entries in `value`.
data

In [37]:
# Fill every missing `value` with the overall column mean (mean() skips NaN).
overall_mean = data['value'].mean()
data['fill_1'] = data['value'].fillna(overall_mean)

In [None]:
# Display the table with the new fill_1 column.
data

In [None]:
# Mean of `value` within each region; missing entries are skipped by mean().
data.groupby('region')['value'].mean()

In [46]:
# Fill each missing `value` with the mean of its own region.
# transform('mean') returns the region mean aligned to every original row,
# so fillna can take it directly; this uses pandas' built-in group mean
# instead of calling a Python lambda once per group.
data['fill_2'] = data['value'].fillna(
    data.groupby('region')['value'].transform('mean')
)

In [None]:
# Display the table to compare fill_1 (overall mean) with fill_2 (region mean).
data

### Element-wise Operations

In [None]:
# A NumPy ufunc such as np.exp is applied element-wise to the whole Series
# and returns a Series.
np.exp(data['fill_2'])

In [None]:
# apply() calls a non-ufunc function once per element, so np.mean sees a
# single number each time and should return it unchanged. For the mean of the
# whole column use data['fill_2'].mean() instead.
data['fill_2'].apply(np.mean)

In [None]:
# apply() with a Python function calls it once per element; here each state
# name (a Python str) is lower-cased.
data['state'].apply(lambda x: x.lower())

In [None]:
# Same lower-casing through the vectorized .str accessor; no lambda needed.
data['state'].str.lower()

In [None]:
## This won't work: the column holds strings, but a Series itself has no
## .lower() method -- string methods are reached through the .str accessor.
## data['state'].lower()