# Brief pandas reference

In [1]:
import numpy as np
import pandas as pd

## Series

In [2]:
# Build a Series from a plain list; no labels are given, so the default 0..4 range is used.
ser = pd.Series(data=[1, 2, 3, 4, 5])
ser

0    1
1    2
2    3
3    4
4    5
dtype: int64

### Index

A Series exposes its labels through `index` and its values through `array`.

In [3]:
# The labels: a RangeIndex running from 0 up to (but not including) 5.
ser.index

RangeIndex(start=0, stop=5, step=1)

In [4]:
# The values, wrapped in a pandas extension array.
ser.array

<NumpyExtensionArray>
[1, 2, 3, 4, 5]
Length: 5, dtype: int64

An index can be assigned explicitly. A custom index is used the same way as the default one.

In [5]:
# Labels are supplied explicitly instead of relying on the default integer range.
ser2 = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
ser2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [6]:
# Look up a single value by its label.
ser2['a']

1

In [7]:
# Assigning through a label overwrites that value in the existing series.
ser2['a'] = 0
ser2

a    0
b    2
c    3
d    4
dtype: int64

In [8]:
# NumPy functions apply element-wise and the result keeps the labels.
np.log1p(ser2)

a    0.000000
b    1.098612
c    1.386294
d    1.609438
dtype: float64

Index can be reassigned as well.

In [9]:
# Replace every label at once; the values stay in their positions.
ser2.index = ['z', 'y', 'x', 'w']
ser2

z    0
y    2
x    3
w    4
dtype: int64

### Dictionary

A Series is almost like a dictionary: the index labels serve as keys.

In [10]:
# Membership is tested against the labels: 0 is a label of ser (its values are 1..5).
0 in ser

True

In [11]:
# 'd' was a label only until the index was reassigned to z, y, x, w.
'd' in ser2

False

In [12]:
# 4 is a value of ser2, not a label, so the membership test is False.
4 in ser2

False

In [13]:
# Dictionary keys become the labels and the prices become the values.
breakfasts = dict(eggs=1.49, bacons=1.99, pancakes=0.99)
ser3 = pd.Series(data=breakfasts)
ser3

eggs        1.49
bacons      1.99
pancakes    0.99
dtype: float64

In [14]:
# Convert back to a plain dictionary keyed by label.
ser3.to_dict()

{'eggs': 1.49, 'bacons': 1.99, 'pancakes': 0.99}

Changing the order of the series is easy: pass the desired labels as the index. Labels that are not in the dictionary get NaN.

In [15]:
# The labels listed here decide both the order and which dictionary entries are used.
favorites = ['bacons', 'eggs', 'potatoes']
ser4 = pd.Series(data=breakfasts, index=favorites)
ser4

bacons      1.99
eggs        1.49
potatoes     NaN
dtype: float64

### Checking NA

In [16]:
# True where the value is missing.
ser4.isna()

bacons      False
eggs        False
potatoes     True
dtype: bool

In [17]:
# The complement of isna(): True where a value is present.
ser4.notna()

bacons       True
eggs         True
potatoes    False
dtype: bool

### Operations

Arithmetic operations are aligned on index labels. A label that is missing from either operand, or whose value is NaN, produces NaN in the result.

In [18]:
# Values are matched up by label before being added.
ser3 + ser4

bacons      3.98
eggs        2.98
pancakes     NaN
potatoes     NaN
dtype: float64

### Name

A Series and its index can both have names.

In [19]:
# Name the series itself and, separately, its index; both names appear in the display.
ser3.name = 'breakfasts'
ser3.index.name = 'menu'
ser3

menu
eggs        1.49
bacons      1.99
pancakes    0.99
Name: breakfasts, dtype: float64

## DataFrame

A DataFrame is basically a collection of Series that share one index.

In [20]:
# Three parallel lists of equal length; each dictionary key becomes a column.
data = dict(
    customer=['Puppy', 'Piggy', 'Kitty', 'Puppy', 'Kitty', 'Piggy'],
    order=['Lunch', 'Dinner', 'Dinner', 'Breakfast', 'Breakfast', 'Lunch'],
    total=[1.49, 2.49, 2.99, 0.99, 0.99, 2.99],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,customer,order,total
0,Puppy,Lunch,1.49
1,Piggy,Dinner,2.49
2,Kitty,Dinner,2.99
3,Puppy,Breakfast,0.99
4,Kitty,Breakfast,0.99
5,Piggy,Lunch,2.99


In [21]:
# head() shows the first five rows by default.
df.head()

Unnamed: 0,customer,order,total
0,Puppy,Lunch,1.49
1,Piggy,Dinner,2.49
2,Kitty,Dinner,2.99
3,Puppy,Breakfast,0.99
4,Kitty,Breakfast,0.99


In [22]:
# `columns` picks which dictionary keys to use and in what order; 'order' is left out here.
pd.DataFrame(data, columns=['total', 'customer'])

Unnamed: 0,total,customer
0,1.49,Puppy
1,2.49,Piggy
2,2.99,Kitty
3,0.99,Puppy
4,0.99,Kitty
5,2.99,Piggy


In [23]:
# tips only has labels 1 and 4; rows of df without a matching label get no value.
tips = pd.Series(data=[0.5, 0.2], index=[1, 4])
df['tips'] = tips
df

Unnamed: 0,customer,order,total,tips
0,Puppy,Lunch,1.49,
1,Piggy,Dinner,2.49,0.5
2,Kitty,Dinner,2.99,
3,Puppy,Breakfast,0.99,
4,Kitty,Breakfast,0.99,0.2
5,Piggy,Lunch,2.99,


In [24]:
# `del` removes the column from df, just like deleting a dictionary key.
del df['tips']
df

Unnamed: 0,customer,order,total
0,Puppy,Lunch,1.49
1,Piggy,Dinner,2.49
2,Kitty,Dinner,2.99
3,Puppy,Breakfast,0.99
4,Kitty,Breakfast,0.99
5,Piggy,Lunch,2.99


### Dictionary within dictionary

Outer keys become columns and inner keys become rows. Since a dictionary only allows unique keys, only the last value given for a duplicated key is kept. An index, however, can hold duplicate values.

In [25]:
# 'Dinner' is deliberately written twice in each inner dictionary: Python keeps
# only the last value for a repeated key, so the Piggy / 2.49 entries are lost.
data2 = {
    'customer': {
        'Lunch': 'Puppy',
        'Dinner': 'Piggy',
        'Dinner': 'Kitty',
        'Breakfast': 'Puppy',
    },
    'total': {
        'Lunch': 1.49,
        'Dinner': 2.49,
        'Dinner': 2.99,
        'Breakfast': 0.99,
    },
}
df2 = pd.DataFrame(data=data2)
df2

Unnamed: 0,customer,total
Lunch,Puppy,1.49
Dinner,Kitty,2.99
Breakfast,Puppy,0.99


In [26]:
# Transpose: rows and columns swap places.
df2.T

Unnamed: 0,Lunch,Dinner,Breakfast
customer,Puppy,Kitty,Puppy
total,1.49,2.99,0.99


Transposing can lose information about columns' data type if they have different types.

### Specifying index

In [27]:
# Keep only the listed row labels; 'Snacks' is not in df2, so its cells come back as missing values.
pd.DataFrame(df2, index=['Lunch', 'Dinner', 'Snacks'])

Unnamed: 0,customer,total
Lunch,Puppy,1.49
Dinner,Kitty,2.99
Snacks,,


Series can be passed for each column.

In [28]:
# Build a one-column frame from an existing column; its row labels carry over.
pd.DataFrame({'customer': df2['customer']})

Unnamed: 0,customer
Lunch,Puppy
Dinner,Kitty
Breakfast,Puppy


In [29]:
# Both axes can be named; the names show up in the header when the frame is displayed.
df2.index.name = 'meals'
df2.columns.name = 'customer_info'
df2

customer_info,customer,total
meals,Unnamed: 1_level_1,Unnamed: 2_level_1
Lunch,Puppy,1.49
Dinner,Kitty,2.99
Breakfast,Puppy,0.99


Only the data (not the index or the column labels) is converted to a NumPy array, and the whole array gets a single dtype.

In [30]:
# The columns hold strings and floats, so the combined array ends up with dtype=object.
df2.to_numpy()

array([['Puppy', 1.49],
       ['Kitty', 2.99],
       ['Puppy', 0.99]], dtype=object)

## Index

In [31]:
# An Index can be built directly from an array of labels.
index = pd.Index(data=np.arange(100, 103))
index

Index([100, 101, 102], dtype='int64')

In [32]:
# Every Series carries an Index; here it is pulled off a throwaway series.
index2 = pd.Series(data=[1, 2, 3], index=[11, 12, 13]).index
index2

Index([11, 12, 13], dtype='int64')

Index is immutable. Therefore, it is safe to pass around index objects to other series or dataframes.

In [33]:
# Item assignment on an Index is rejected by pandas. Run the assignment and
# show the resulting error instead of leaving the demonstration commented out.
try:
    index2[1] = 3
except TypeError as err:
    print(err)

In [34]:
# Reuse the existing Index object as the labels of a new series.
ser5 = pd.Series(['a', 'b', 'c'], index=index)
ser5

100    a
101    b
102    c
dtype: object

In [35]:
# Identity check: the series holds the very same Index object, not a copy.
ser5.index is index

True

As mentioned above, index can have duplicate values.

In [36]:
# Repeated labels ('a' and 'b' each appear twice) are accepted by an Index.
index2 = pd.Index(list('aabcb'))
index2

Index(['a', 'a', 'b', 'c', 'b'], dtype='object')

Interestingly, an index supports set-like methods such as `intersection`; `append`, by contrast, simply concatenates two indexes.

In [37]:
# Concatenate a second index onto the end of the first.
index.append(pd.Index([200, 201, 202]))

Index([100, 101, 102, 200, 201, 202], dtype='int64')

In [38]:
# Only labels present in both indexes survive: here just 101.
index.intersection(pd.Index([101, 204, 211]))

Index([101], dtype='int64')

### Reindex

Reindex does not actually modify the original object; it returns a new one.

In [39]:
# 103 has no matching label in ser5, so it comes back as NaN.
ser5.reindex([103, 102, 101, 100])

103    NaN
102      c
101      b
100      a
dtype: object

Missing values can be filled with a method, such as bfill (backfill), ffill (forward-fill), and nearest.

In [40]:
# Each new label takes the value of the closest existing label (98 and 99 -> 100, 103 and 104 -> 102).
ser5.reindex(np.arange(98, 105), method='nearest')

98     a
99     a
100    a
101    b
102    c
103    c
104    c
dtype: object

Similarly, `reindex` works on a DataFrame as well.

In [41]:
# Redisplay df for comparison with the reindexed versions below.
df

Unnamed: 0,customer,order,total
0,Puppy,Lunch,1.49
1,Piggy,Dinner,2.49
2,Kitty,Dinner,2.99
3,Puppy,Breakfast,0.99
4,Kitty,Breakfast,0.99
5,Piggy,Lunch,2.99


In [44]:
# Labels 6 and 7 are not in df, so those rows are added with missing values.
df.reindex(np.arange(8))

Unnamed: 0,customer,order,total
0,Puppy,Lunch,1.49
1,Piggy,Dinner,2.49
2,Kitty,Dinner,2.99
3,Puppy,Breakfast,0.99
4,Kitty,Breakfast,0.99
5,Piggy,Lunch,2.99
6,,,
7,,,


In [50]:
# Extend the existing column list with a label that df does not have yet.
df.reindex(columns=[*df.columns, 'rating'])

Unnamed: 0,customer,order,total,rating
0,Puppy,Lunch,1.49,
1,Piggy,Dinner,2.49,
2,Kitty,Dinner,2.99,
3,Puppy,Breakfast,0.99,
4,Kitty,Breakfast,0.99,
5,Piggy,Lunch,2.99,


### `loc` and `iloc`

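When selecting, `loc` only works if the requested index and column labels exist: unlike `reindex`, it raises a `KeyError` for missing labels instead of filling them with NaN.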

Also, `loc` and `iloc` are preferred over plain `[]` indexing because they are explicit: `loc` selects by label and `iloc` selects by integer position.

In [57]:
# Select rows by index label and columns by name.
df.loc[[1, 3, 5], ['customer', 'total']]

Unnamed: 0,customer,total
1,Piggy,2.49
3,Puppy,0.99
5,Piggy,2.99


In [71]:
# Select by integer position: rows 2 and 4, and the first two columns.
df.iloc[[2,4], [0,1]]

Unnamed: 0,customer,order
2,Kitty,Dinner
4,Kitty,Breakfast


Slicing with `loc` is inclusive: both the start and the end labels are part of the selection.

In [75]:
# Label slices include both endpoints, so rows 1, 2 and 3 are overwritten.
# The assignment is done on a copy so that `df` itself stays intact: earlier
# displays of `df` remain accurate and this cell can be re-run in any order.
df_overwritten = df.copy()
df_overwritten.loc[1:3] = 5
df_overwritten

Unnamed: 0,customer,order,total
0,Puppy,Lunch,1.49
1,5,5,5.0
2,5,5,5.0
3,5,5,5.0
4,Kitty,Breakfast,0.99
5,Piggy,Lunch,2.99


Filtering rows with a boolean mask is convenient.

In [77]:
# Start again from a fresh DataFrame built from the original dictionary.
df = pd.DataFrame(data)
df

Unnamed: 0,customer,order,total
0,Puppy,Lunch,1.49
1,Piggy,Dinner,2.49
2,Kitty,Dinner,2.99
3,Puppy,Breakfast,0.99
4,Kitty,Breakfast,0.99
5,Piggy,Lunch,2.99


In [79]:
# Keep only the rows whose total is greater than 1.
df[df['total'] > 1]

Unnamed: 0,customer,order,total
0,Puppy,Lunch,1.49
1,Piggy,Dinner,2.49
2,Kitty,Dinner,2.99
5,Piggy,Lunch,2.99


In [90]:
# Positional slices exclude the end: rows 1 and 2, and every column from position 1 onward.
df.iloc[1:3, 1:]

Unnamed: 0,order,total
1,Dinner,2.49
2,Dinner,2.99


Selecting rows from a DataFrame works much like selecting elements from a Series.

### Drop

In [67]:
# Redisplay ser3 before dropping a label from it.
ser3

menu
eggs        1.49
bacons      1.99
pancakes    0.99
Name: breakfasts, dtype: float64

In [68]:
# The result omits the 'eggs' entry.
ser3.drop('eggs')

menu
bacons      1.99
pancakes    0.99
Name: breakfasts, dtype: float64

In [70]:
# Rows (by label) and columns (by name) can be dropped in a single call.
df.drop(index=[1, 2], columns=['total'])

Unnamed: 0,customer,order
0,Puppy,Lunch
3,Puppy,Breakfast
4,Kitty,Breakfast
5,Piggy,Lunch
