# `pandas`

In [None]:
%pylab inline
plt.style.use('ggplot')

In [None]:
import numpy as np
import pandas as pd

## Creating `pd.Series`

In [None]:
s = pd.Series(np.random.randn(10))
s

In [None]:
s = pd.Series(np.random.randn(10), name="random_series")
s

In [None]:
s = pd.Series(np.random.randn(10), name="random_series",
              index=np.random.randint(23, size=(10,)))
s

In [None]:
s.index

In [None]:
# Same kind of random series, but the index object now carries its own name.
# The values are drawn before the labels to keep the random draws in order.
values = np.random.randn(10)
labels = pd.Index(np.random.randint(23, size=(10,)), name="main_index")
s = pd.Series(values, index=labels, name="random_series")
s

In [None]:
s.index

In [None]:
s

In [None]:
# Index with a single occurrence
# NOTE(review): the index labels are drawn at random and no seed is set in the
# visible cells, so label 8 may be missing or repeated on a fresh run — confirm.
s[8]

In [None]:
# Index with multiple occurrences (passing a list of labels returns a Series)
s[[21]]

In [None]:
s

In [None]:
s[21].iloc[0]

In [None]:
s.iloc[1]

In [None]:
# Build a Series from a mapping: keys become the index labels, values the data.
s = pd.Series(data=dict(a=3, c=6, b=2), name="dict_series")
s

In [None]:
s.index

In [None]:
# Positional access goes through `.iloc`, label access through `[]` or `.loc`.
# A bare `s[0]` on a string-labelled Series relies on a deprecated integer
# fallback (FutureWarning in recent pandas), so it is not used here.
s.iloc[0], s['a'], s.loc['a']

In [None]:
s['a':'c']

In [None]:
s.sort_index()['a':'c']

## Creating `pd.DataFrame`

In [None]:
df = pd.DataFrame(np.arange(20).reshape((5,4)))
df

In [None]:
df = pd.DataFrame(np.arange(20).reshape((5,4)),
                  columns=['a', 'b', 'c', 'd'])
df

In [None]:
import string
df = pd.DataFrame(np.arange(20).reshape((5,4)),
                  columns=['a', 'b', 'c', 'd'],
                  index=np.random.choice(list(string.ascii_lowercase), 5, replace=False))
df

In [None]:
df.columns

In [None]:
df.index

In [None]:
df['a']

In [None]:
# `[]` with a single label looks up a column, and the columns are a-d, so 'z'
# raises KeyError. Catch it so the notebook still runs from top to bottom.
try:
    df['z']
except KeyError as err:
    print("KeyError:", err)

In [None]:
type(df['a'])

In [None]:
df.loc[["z"]]

In [None]:
df.loc[["z"]]

In [None]:
df.loc[["z"], "a"]

In [None]:
df

In [None]:
# Indexing with right and wrong order

# df['n':'r']
df['r':'z']

# Indexing `pd.Series`

In [None]:
# Random integers in [0, 20) labelled with the first N_ELEMS lowercase letters.
N_ELEMS = 20
letter_labels = list(string.ascii_lowercase[:N_ELEMS])
s = pd.Series(data=np.random.randint(20, size=(N_ELEMS,)),
              name='randint_series',
              index=letter_labels)
s

## Indexing with `[]`

In [None]:
s

In [None]:
s['i'] # Caveat: the result is a scalar for a unique label, but a Series if the label repeats

In [None]:
s[['i']]

In [None]:
s['a':'z']

In [None]:
s[['k', 'q', 'a', 'r']]

In [None]:
s[5:3:-1]

## Indexing with `.loc`

In [None]:
s_int_idx = pd.Series(np.random.randint(20, size=(N_ELEMS,)),
                      index=np.random.choice(N_ELEMS, N_ELEMS, replace=False),
                      name='randint_series')
s_int_idx

In [None]:
s_int_idx[2:15]

In [None]:
s_int_idx[2]

In [None]:
s_int_idx[2:5]

In [None]:
s_int_idx[s_int_idx.index.isin(range(2,6))]

In [None]:
s_int_idx.loc[2:5]

In [None]:
s_int_idx.iloc[2:5]

In [None]:
s_int_idx.loc[2:456]

In [None]:
s_int_idx.mean()

In [None]:
s_sorted = s_int_idx.sort_index()

In [None]:
# `Index.is_monotonic` was removed in pandas 2.0; the explicit
# `is_monotonic_increasing` is the same check under its current name.
s_sorted.index.is_monotonic_increasing

In [None]:
s_sorted.loc[2:5]

In [None]:
# Sorted index with a repeated label: the label slice still includes both 4s.
s_trick = pd.Series(range(1, 5), index=pd.Index([2, 3, 4, 4]))
s_trick.loc[2:4]

In [None]:
s_trick

In [None]:
# `Index.is_monotonic` was removed in pandas 2.0; use the explicit variant.
# Repeated labels still count as monotonically (non-strictly) increasing.
s_trick.index.is_monotonic_increasing

In [None]:
s_int_idx.sort_index().loc[2:6]

In [None]:
s_int_idx

In [None]:
s_int_idx[s_int_idx.index!=11]

In [None]:
s_int_idx[(s_int_idx>15) | (s_int_idx<5)]

In [None]:
s_int_idx.loc[s_int_idx!=14]

In [None]:
s_int_idx

In [None]:
s_int_idx[s_int_idx.index!=11]

## Indexing with `.iloc`

In [None]:
s

In [None]:
s.iloc[5:8]

In [None]:
s.shape

In [None]:
s.iloc[15:20]

In [None]:
s.iloc[[3, 9, 8]]

In [None]:
s

In [None]:
s.iloc[[3, 9]]

In [None]:
s.loc[["d", "j"]]

# Indexing `pd.DataFrame`

In [None]:
df = pd.DataFrame(np.arange(20).reshape((5,4)),
                  columns=['d', 'c', 'b', 'a'],
                  index=np.random.choice(list(string.ascii_lowercase), 5, replace=False))
df

In [None]:
df[2:5]

In [None]:
# Nope, it doesn't work that way: `[]` with a single label selects a column,
# and the columns are d, c, b, a. Show the error instead of aborting the run.
try:
    df['e']
except KeyError as err:
    print("KeyError:", err)

In [None]:
df['a']

In [None]:
df[['b']]

In [None]:
df

In [None]:
df.columns

In [None]:
df[df.columns[2:]]

In [None]:
df.iloc[:, 2:]

In [None]:
df

In [None]:
df[:'e'] # Surprising!

In [None]:
df['s':'g'] # Not really surprising

In [None]:
df.sort_index()['s':'g']

In [None]:
df['g':]

In [None]:
df.sort_index()['g':]

In [None]:
df

In [None]:
# No, that won't work on an unsorted index: label slicing needs both bounds to
# be present unless the index is monotonic. The row labels are random, so show
# the slice when it happens to succeed and the error message otherwise.
try:
    sliced = df["s":'z']
except KeyError as err:
    sliced = "KeyError: %s" % err
sliced

In [None]:
df.sort_index()['k':'zjyyf']

In [None]:
df.index.to_series().rank()

In [None]:
df.sort_index().index.to_series().rank()

In [None]:
df

In [None]:
df['e':'s']['c'] # Not a very good idea

In [None]:
df[(df['a']>12) | (df['b']<3)]

In [None]:
df

## Indexing with `.loc`

In [None]:
df

In [None]:
df.loc['s']

In [None]:
df.loc['s', 'b']

In [None]:
df.loc['g':, 'b']

In [None]:
df.loc['e':, 'b':]

In [None]:
df

In [None]:
df.loc['e':, 'c':'d'].shape

In [None]:
df

In [None]:
df.columns.to_series().rank()

In [None]:
df.loc['e':, 'a'::-2]

In [None]:
df

In [None]:
df.sort_index(axis='columns').loc['e':, 'c':'d']

In [None]:
df.loc[:, ["a", "b"]]

In [None]:
df.loc["x", "b"] = 20
df.loc["h", "c"] = 30

In [None]:
df

In [None]:
df[["a", "b"]].rank()

In [None]:
df.loc[["e", "g", "s"]]

In [None]:
df.loc[["e", "g", "s"]].rank()

In [None]:
df = pd.DataFrame(np.arange(20).reshape((5,4)),
                  columns=['d', 'c', 'a', 'b'],
                  index=np.random.choice(list(string.ascii_lowercase), 5, replace=False))
df

In [None]:
df

In [None]:
df.loc['v':, 'b':'d']

In [None]:
df.columns.to_series().rank()

In [None]:
df.sort_index(axis='columns').loc['v':, 'c':'sflng']

In [None]:
df.sort_index(axis='columns').columns.to_series().rank()

In [None]:
df

In [None]:
df.loc[df['c']>10, 'c']

In [None]:
# This won't work: `.loc` is label-based and the row labels are letters, so the
# integers 1 and 2 are not found. Show the error instead of aborting the run.
try:
    df.loc[[1,2], 'c']
except KeyError as err:
    print("KeyError:", err)

## Indexing with `.iloc`

In [None]:
df.iloc[:3]

In [None]:
df.iloc[1:3]

In [None]:
# This won't work: `.iloc` is purely positional and rejects the column label.
# Show the error instead of aborting the run.
try:
    df.iloc[1:3, 'b']
except ValueError as err:
    print("ValueError:", err)

In [None]:
df.iloc[1:3, 2:]

In [None]:
df

In [None]:
df.values

In [None]:
df.iloc[(df['a']>10).values, 2:]

## `SettingWithCopyWarning`

In [None]:
df

In [None]:
df.loc[df['a']>10, 'c'] = 10

In [None]:
df

In [None]:
# Chained indexing on purpose: `.loc[...]` builds a new object and the
# assignment targets that temporary, so `df` is presumably left unchanged and
# pandas may emit SettingWithCopyWarning — the pitfall this section is about.
df.loc[df['a']>5]['c'] = 20.

In [None]:
df

In [None]:
df_1 = df.loc[df['a']>10]

In [None]:
# The public `is_copy` attribute was removed in pandas 1.0. The private
# `_is_copy` weak reference carries the same information where it still
# exists; fall back to None on versions that no longer track it.
getattr(df_1, "_is_copy", None)

In [None]:
df_1['c'] = 25

In [None]:
df_1

# Dataframe arithmetic

In [None]:
# 10x4 frame of consecutive integers, rows labelled by ten distinct random letters.
values_1 = np.arange(40).reshape(10, 4)
row_labels_1 = np.random.choice(list(string.ascii_lowercase), 10, replace=False)
df_1 = pd.DataFrame(data=values_1, index=row_labels_1, columns=list('abcd'))
df_1

In [None]:
# Same values as df_1 but with column 'e' instead of 'b' and a fresh random
# set of row labels, so the two frames only partially overlap.
values_2 = np.arange(40).reshape(10, 4)
row_labels_2 = np.random.choice(list(string.ascii_lowercase), 10, replace=False)
df_2 = pd.DataFrame(data=values_2, index=row_labels_2, columns=list('aecd'))
df_2

In [None]:
# A lot of missing values
df_1 + df_2

In [None]:
df_1.add(df_2, fill_value=0)

In [None]:
s_1 = pd.Series(np.arange(10),
                name='f',
                index=np.random.choice(list(string.ascii_lowercase), 10, replace=False))

In [None]:
s_1

In [None]:
df_1 + s_1

In [None]:
s_1 + df_1

In [None]:
s_1.add(df_1, axis='columns')

In [None]:
# `np.float` was removed in NumPy 1.24; the builtin `float` is what it aliased.
# `astype` already returns a new frame, so no extra `.copy()` is needed.
df1 = df.astype(float)

In [None]:
df1.loc["h", "b"] = np.nan

In [None]:
df1

In [None]:
(df1 - df1.mean()) / df1.std()

# Applying functions to dataframes

In [None]:
df

In [None]:
df.iloc[0]

In [None]:
df.apply(lambda x: np.sqrt(x.d), axis=1)

In [None]:
%timeit df['d'].apply(lambda x: np.sqrt(x))

In [None]:
np.sqrt(df['d'])

In [None]:
%timeit np.sqrt(df['d'])

In [None]:
# Better way
%timeit np.sqrt(df['d'].values)

In [None]:
df

In [None]:
df.values

In [None]:
df.values.sum(axis=1)

In [None]:
df.apply(lambda x: x.sum(), axis=1)

In [None]:
np.sum(df, axis=1)

In [None]:
df.apply(lambda x: [x.d, x.b], axis=1)

In [None]:
# Row-wise apply returning a Series per row: each returned label becomes a
# column of the result ('sum' of the row, square root of its 'd' entry).
dfm = df.apply(
    lambda row: pd.Series([row.sum(), np.sqrt(row['d'])], index=['sum', 'sqrt']),
    axis=1,
)

In [None]:
dfm

In [None]:
# `np.int` was removed in NumPy 1.24. Request a fixed-width 64-bit integer so
# the resulting dtype is the same on every platform.
dfm["sum"] = dfm["sum"].astype(np.int64)

In [None]:
dfm.dtypes

In [None]:
# Select integer columns by kind rather than comparing against one exact
# dtype, which would miss e.g. int32 columns.
dfm.select_dtypes(include=np.integer)

# Reading CSV files

We will use the [Titanic dataset](https://www.kaggle.com/c/titanic/data), which is located in the `data` directory.

In [None]:
titanic_train = pd.read_csv('data/train.csv')

In [None]:
titanic_train.head()

In [None]:
titanic_train.tail()

In [None]:
titanic_train.info()

In [None]:
titanic_train.describe()

In [None]:
%timeit titanic_train[titanic_train.PassengerId==400]

In [None]:
titanic_train = pd.read_csv('data/train.csv', index_col='PassengerId')
titanic_test = pd.read_csv('data/test.csv', index_col='PassengerId')

In [None]:
%timeit titanic_train.loc[400]

In [None]:
titanic = pd.concat([titanic_train, titanic_test])

In [None]:
titanic

In [None]:
titanic = pd.concat([titanic_train, titanic_test], sort=False)

In [None]:
titanic.head()

In [None]:
titanic.tail()

# Dataframe statistics

In [None]:
# `Index.is_monotonic` was removed in pandas 2.0; use the explicit variant.
titanic.index.is_unique, titanic.index.is_monotonic_increasing

In [None]:
titanic['Pclass'].value_counts()

In [None]:
titanic[titanic.Ticket=="CA 2144"]

In [None]:
titanic[titanic.SibSp==5]

In [None]:
titanic.SibSp.value_counts()

In [None]:
titanic.Embarked.value_counts() # S = Southampton, C = Cherbourg, Q = Queenstown

In [None]:
titanic.Sex.value_counts()

In [None]:
# Summary statistics of the Age column, one formatted line per statistic.
age = titanic['Age']
age_report = [
    ("Average age: %2.2f", age.mean()),
    ("STD of age: %2.2f", age.std()),
    ("Minimum age: %2.2f", age.min()),
    ("Maximum age: %2.2f", age.max()),
]
for template, value in age_report:
    print(template % value)

In [None]:
# Mean SibSp overall, then separately for each passenger class.
print("Average number of siblings/spouse: %2.2f" % titanic['SibSp'].mean())
per_class_templates = [
    "Average number of siblings/spouse in class 1: %2.2f",
    "Average number of siblings/spouse in class 2: %2.2f",
    "Average number of siblings/spouse in class 3: %2.2f",
]
for pclass, template in enumerate(per_class_templates, start=1):
    print(template % titanic.loc[titanic.Pclass == pclass, 'SibSp'].mean())

In [None]:
print("Minimum age (not survived): %2.2f" % titanic.loc[titanic.Survived==0, 'Age'].min())
print("Maximum age (not survived): %2.2f" % titanic.loc[titanic.Survived==0, 'Age'].max())

In [None]:
# Age range among survivors: the filter is Survived == 1, so the labels must
# say "survived" rather than "not survived".
survivor_age = titanic.loc[titanic.Survived==1, 'Age']
print("Minimum age (survived): %2.2f" % survivor_age.min())
print("Maximum age (survived): %2.2f" % survivor_age.max())

# Announcing the future

In [None]:
titanic.groupby('Pclass')

In [None]:
titanic.groupby('Pclass').size()

In [None]:
titanic.Pclass.value_counts()

In [None]:
# Bucket ages into decades and label each bucket by its midpoint
# (e.g. ages from 20 up to 30 map to 25.0); a missing Age stays NaN.
titanic["AgeGroup"] = 5 + 10*(titanic.Age//10)

In [None]:
# Size of every (Pclass, AgeGroup, Sex) cell divided by the size of its class;
# the division aligns the two results on the shared 'Pclass' level.
cell_sizes = titanic.groupby(['Pclass', 'AgeGroup', 'Sex']).size()
class_sizes = titanic.groupby('Pclass').size()
group_counts = cell_sizes / class_sizes

In [None]:
group_counts[[(1, 5.0, "female"), (1, 15.0, "female")]]

In [None]:
100*group_counts

In [None]:
group_counts = group_counts.unstack()

In [None]:
group_counts.unstack(level=1)

In [None]:
group_counts

In [None]:
group_counts.loc[1]

In [None]:
# One panel per passenger class, drawn on explicitly created axes.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

for ax, pclass in zip(axes, (1, 2, 3)):
    group_counts.loc[pclass].plot(ax=ax)
    ax.set_ylim(0, 0.25)
    ax.set_title("Age distribution for Class %i" % pclass, fontsize=12)
fig.tight_layout()

In [None]:
survival_groups = titanic.groupby(['Pclass', 'AgeGroup', 'Sex']).Survived.mean()

In [None]:
survival_groups = survival_groups.unstack()

In [None]:
# Mean of Survived per age group and sex, one panel per passenger class.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

for ax, pclass in zip(axes, (1, 2, 3)):
    survival_groups.loc[pclass].plot(ax=ax)
    ax.set_ylim(0, 1)
    ax.set_title("Survived in class %i" % pclass, fontsize=12)
fig.tight_layout()

In [None]:
siblings_groups = titanic.groupby(['Pclass', 'AgeGroup', 'Sex']).SibSp.mean()

In [None]:
siblings_groups = siblings_groups.unstack()

In [None]:
# Mean SibSp per age group and sex, one panel per passenger class.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

for ax, pclass in zip(axes, (1, 2, 3)):
    siblings_groups.loc[pclass].plot(ax=ax)
    ax.set_ylim(0, 5)
    ax.set_title("Siblings in class %i" % pclass, fontsize=12)
fig.tight_layout()

In [None]:
embark_counts = titanic.groupby(['Pclass', 'AgeGroup', 'Sex', 'Embarked']).size()/titanic.groupby('Pclass').size()

In [None]:
embark_counts = embark_counts.unstack([-1, -2])

In [None]:
embark_counts

In [None]:
embark_counts.loc[3, (('S', 'male'))]

# Replacing and renaming

In [None]:
titanic.replace?

In [None]:
titanic.replace(22, 122).head()

In [None]:
import re
titanic.replace(re.compile(r'\(.*\)'), '').head()

In [None]:
titanic.rename(lambda x: x.lower(), axis=1).head()

In [None]:
titanic.rename({'SibSp':'siblings_spouses'}, axis=1).head()

## String operations

In [None]:
titanic.head()

In [None]:
# Work on the Name column only (a frame-wide regex replace would also scan
# every other string column), drop the parenthesised part, split on the first
# comma, and strip the whitespace that the split and the removal leave behind.
(titanic['Name']
 .str.replace(r'\(.*\)', '', regex=True)
 .str.split(',', n=1, expand=True)
 .rename({0:'family_name', 1:'first_name'}, axis=1)
 .apply(lambda col: col.str.strip())
 .head())

# Cleaning data

In [None]:
titanic.isnull().head()

In [None]:
titanic.isnull().any()

In [None]:
titanic.isnull().any(axis=1).head()

In [None]:
titanic.isnull().sum()

In [None]:
titanic.head(15)

In [None]:
fill_values = titanic[['Age', 'Fare']].mean()

In [None]:
titanic[titanic.Fare.isnull()]

In [None]:
titanic.fillna(fill_values).fillna({'AgeGroup':25.}).head(15)

# Getting indicators and dummy variables

In [None]:
pd.get_dummies(titanic, columns=['Pclass', 'Sex', 'Embarked']).head()