# Examples in pandas for basic operations on dataframes

## Setup lists

In [1]:
# Setup
import pandas as pd
from collections import OrderedDict

# Build the example lists; the numeric ones come from list comprehensions
classes = ['a', 'b', 'b', 'a', 'a']
weights = [v * 10 for v in range(1, 6)]
heights = [v * 100 for v in range(1, 6)]

# One print call, newline-separated, produces the header plus one list per line
print("Initial lists:", classes, weights, heights, sep="\n")

Initial lists:
['a', 'b', 'b', 'a', 'a']
[10, 20, 30, 40, 50]
[100, 200, 300, 400, 500]


## Build a dataframe from the lists and sort by 'class'

In [2]:
# Build the column mapping from (name, values) pairs so the column order
# class -> weight -> height is fixed at construction time. Wrapping an
# already-built plain dict in OrderedDict cannot restore an order the dict
# did not keep.
di_all = OrderedDict([
    ('class', classes),
    ('weight', weights),
    ('height', heights),
])

df = pd.DataFrame(di_all)
# Sort by class in descending order. mergesort is a stable algorithm, so rows
# sharing the same class keep their original relative order and the printed
# result is reproducible.
df = df.sort_values('class', ascending=False, kind='mergesort')
print(df)

  class  weight  height
1     b      20     200
2     b      30     300
0     a      10     100
3     a      40     400
4     a      50     500


## Basic column info and stats

In [3]:
# Dimensions of the dataframe as (number of rows, number of columns)
print(df.shape)

(5, 3)


In [4]:
# Data type of each column
print(df.dtypes)

class     object
weight     int64
height     int64
dtype: object


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
print(df.describe())

          weight      height
count   5.000000    5.000000
mean   30.000000  300.000000
std    15.811388  158.113883
min    10.000000  100.000000
25%    20.000000  200.000000
50%    30.000000  300.000000
75%    40.000000  400.000000
max    50.000000  500.000000


In [6]:
# Number of non-null values in each column
print(df.count())

class     5
weight    5
height    5
dtype: int64


In [7]:
# info() writes its summary directly to stdout and returns None, so it is
# called on its own; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 1 to 4
Data columns (total 3 columns):
class     5 non-null object
weight    5 non-null int64
height    5 non-null int64
dtypes: int64(2), object(1)
memory usage: 160.0+ bytes
None


## Change dataframe column order

In [8]:
# Show the dataframe before reordering (columns: class, weight, height)
print(df)

  class  weight  height
1     b      20     200
2     b      30     300
0     a      10     100
3     a      40     400
4     a      50     500


In [9]:
# Desired left-to-right column order
NEW_COLS = ['class', 'height', 'weight']
# Select every row and the columns in the new order
df = df.loc[:, NEW_COLS]
print(df)

  class  height  weight
1     b     200      20
2     b     300      30
0     a     100      10
3     a     400      40
4     a     500      50


## Access elements in the dataframe

### Access value at index position:

In [10]:
print(df.iloc[4,1])  # zero-based positions: [row position 4 (the 5th row), column position 1 (the 2nd column)]

500


In [11]:
print(df.iloc[:,:2])  # [all rows, the first two columns (positions 0 and 1)]

  class  height
1     b     200
2     b     300
0     a     100
3     a     400
4     a     500


In [12]:
print(df.iloc[:-1,-1])  # [every row except the last one, the last column]

1    20
2    30
0    10
3    40
Name: weight, dtype: int64


### Access value by name:

In [13]:
# Label-based lookup in a single .loc call (row label first, then column name):
# the height in the row whose index label is the integer 4, not the row at position 4.
print(df.loc[4, 'height'])

500


## Reset index in the dataframe

In [14]:
# Renumber the rows 0..n-1; drop=True discards the old index labels rather than
# inserting them as a new column
df = df.reset_index(drop=True)
print(df)

  class  height  weight
0     b     200      20
1     b     300      30
2     a     100      10
3     a     400      40
4     a     500      50


## Boolean filter

In [15]:
# Keep only the rows whose class equals 'a' by passing a boolean mask to .loc
print(df.loc[df['class'].eq('a')])

  class  height  weight
2     a     100      10
3     a     400      40
4     a     500      50


## Boolean filter with multiple criteria

In [16]:
# Boolean masks are combined element-wise with & (AND), | (OR) and ~ (NOT).
F1 = df['class'].eq('a') & df['weight'].gt(25)
F2 = df['class'].eq('b') & df['height'].lt(250)
OR_FILTER = F1 | F2
NOT_FILTER = ~OR_FILTER

# Each print emits its items newline-separated, giving the same stdout as one
# print call per item.
print("Logical AND examples:", df[F1], "---", df[F2], "---", sep="\n")

print("Logical OR example:", df[OR_FILTER], "---", sep="\n")

print("Logical NOT example:", df[NOT_FILTER], sep="\n")

Logical AND examples:
  class  height  weight
3     a     400      40
4     a     500      50
---
  class  height  weight
0     b     200      20
---
Logical OR example:
  class  height  weight
0     b     200      20
3     a     400      40
4     a     500      50
---
Logical NOT example:
  class  height  weight
1     b     300      30
2     a     100      10
