# Examples in pandas for basic operations on dataframes

## Setup lists

In [39]:
# Setup
import pandas as pd
import os
from collections import OrderedDict

# Source columns: one class label per row, plus weight and height values
# generated with list comprehensions over the steps 1..5
li_class = list('abbaa')
li_weight = [step * 10 for step in range(1, 6)]
li_height = [step * 100 for step in range(1, 6)]

# Echo the heading and the three lists, one item per line
print("Initial lists:", li_class, li_weight, li_height, sep="\n")

Initial lists:
['a', 'b', 'b', 'a', 'a']
[10, 20, 30, 40, 50]
[100, 200, 300, 400, 500]


## Build a dataframe from the lists and sort by 'class'

In [40]:
# Column data keyed by column name
di_all = {
    'class': li_class,
    'weight': li_weight,
    'height': li_height
}

# Pin the column order explicitly: wrapping an already-built plain dict in
# OrderedDict cannot restore an ordering the dict itself does not guarantee
# on older Python versions.
df = pd.DataFrame(di_all, columns=['class', 'weight', 'height'])

# Use a stable sort so rows sharing a class keep their original relative
# order; the positional lookups further down rely on this exact row order.
df = df.sort_values('class', kind='mergesort')
print(df)

  class  weight  height
0     a      10     100
3     a      40     400
4     a      50     500
1     b      20     200
2     b      30     300


## Basic column info and stats

In [41]:
# Dimensions of the frame as a (rows, columns) tuple
print(df.shape)

(5, 3)


In [42]:
# Data type of each column (string columns are reported as 'object')
print(df.dtypes)

class     object
weight     int64
height     int64
dtype: object


In [43]:
# Summary statistics; only the numeric columns appear, so 'class' is left out
print(df.describe())

          weight      height
count   5.000000    5.000000
mean   30.000000  300.000000
std    15.811388  158.113883
min    10.000000  100.000000
25%    20.000000  200.000000
50%    30.000000  300.000000
75%    40.000000  400.000000
max    50.000000  500.000000


In [44]:
# Number of non-null values in each column
print(df.count())

class     5
weight    5
height    5
dtype: int64


In [45]:
# DataFrame.info() prints its summary directly and returns None, so it is
# called on its own; wrapping it in print() only adds a trailing "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 2
Data columns (total 3 columns):
class     5 non-null object
weight    5 non-null int64
height    5 non-null int64
dtypes: int64(2), object(1)
memory usage: 160.0+ bytes
None


## Change dataframe column order

In [46]:
# Current column order (class, weight, height), shown before it is changed
print(df)

  class  weight  height
0     a      10     100
3     a      40     400
4     a      50     500
1     b      20     200
2     b      30     300


In [47]:
# Selecting with a list of labels returns the columns in the listed order
NEW_COLS = ['class', 'height', 'weight']
df = df.loc[:, NEW_COLS]
print(df)

  class  height  weight
0     a     100      10
3     a     400      40
4     a     500      50
1     b     200      20
2     b     300      30


## Access elements in the dataframe

### Access value at index position:

In [48]:
print(df.iloc[4,1])  # positions are zero-based: 5th row, 2nd column ('height')

300


In [49]:
print(df.iloc[:,:2])  # all rows, first two columns (positions 0 and 1)

  class  height
0     a     100
3     a     400
4     a     500
1     b     200
2     b     300


In [50]:
print(df.iloc[:-1,-1])  # every row except the last one, taken from the last column

0    10
3    40
4    50
1    20
Name: weight, dtype: int64


### Access value by name:

In [51]:
# Label-based lookup in a single .loc call (row label, column label) instead
# of chained indexing. 4 is the index label, not the position: after sorting,
# the row labelled 4 sits at position 2.
print(df.loc[4, 'height'])

500


## Reset index in the dataframe

In [52]:
# Replace the shuffled index labels with a fresh 0..n-1 range;
# drop=True discards the old labels instead of keeping them as a new column
df = df.reset_index(drop=True)
print(df)

  class  height  weight
0     a     100      10
1     a     400      40
2     a     500      50
3     b     200      20
4     b     300      30


## Boolean filter

In [53]:
# Element-wise comparison gives a True/False mask; .loc keeps the True rows
class_a_mask = df['class'] == 'a'
print(df.loc[class_a_mask])

  class  height  weight
0     a     100      10
1     a     400      40
2     a     500      50


## Boolean filter with multiple criteria

In [54]:
# Build each compound condition from element-wise comparisons
F1 = df['class'].eq('a') & df['weight'].gt(25)   # class 'a' with weight above 25
F2 = df['class'].eq('b') & df['height'].lt(250)  # class 'b' with height below 250

# Keep rows matching either condition. Element-wise logic uses '&', '|' and
# '~' (negation) in place of the Python keywords 'and', 'or' and 'not'.
FILTER = F1 | F2
print(df.loc[FILTER])

  class  height  weight
1     a     400      40
2     a     500      50
3     b     200      20
