# Examples in pandas for basic operations on dataframes

## Setup lists

In [1]:
# Setup
import pandas as pd
from collections import OrderedDict

# Sample data, built with list comprehensions.
# `classes` deliberately contains None entries to stand in for missing string classes.
classes = [None, 'aa', 'bb', 'bb', 'aa', 'aa', None, 'cc']
# One weight and one height per entry, scaled from the entry's position in `classes`.
weights = [position * 10 for position in range(len(classes))]
heights = [position * 100 for position in range(len(classes))]

# Print a header, then each list on its own line.
print("Lists:")
for sample_list in (classes, weights, heights):
    print(sample_list)

Lists:
[None, 'aa', 'bb', 'bb', 'aa', 'aa', None, 'cc']
[0, 10, 20, 30, 40, 50, 60, 70]
[0, 100, 200, 300, 400, 500, 600, 700]


## Build a dataframe from the lists

In [2]:
# Build the column mapping as an OrderedDict from (name, values) pairs so the
# column order is fixed by construction. Wrapping a plain dict literal in
# OrderedDict only records whatever order that dict happens to iterate in,
# which is not guaranteed to be the literal's order before Python 3.7.
di_all = OrderedDict([
    ('class', classes),
    ('weight', weights),
    ('height', heights),
])

# Columns appear in the order given above: class, weight, height.
df = pd.DataFrame(di_all)
print(df)

  class  weight  height
0  None       0       0
1    aa      10     100
2    bb      20     200
3    bb      30     300
4    aa      40     400
5    aa      50     500
6  None      60     600
7    cc      70     700


## Identify distinct values and add column to dataframe

In [3]:
# factorize() encodes each distinct value of the column as an integer code and
# returns the codes together with the distinct values.
labels, uniques = pd.factorize(df['class'])
# Format each line into a single string so it prints the same way whether
# `print` is a statement (Python 2, where print(a, b) shows a tuple) or a
# function (Python 3).
print("labels: {}".format(labels))
print("uniques: {}".format(uniques))
print("---")
# Store the integer codes next to the original values as a new column.
df['label'] = labels
print(df)

('labels:', array([-1,  0,  1,  1,  0,  0, -1,  2]))
('uniques:', Index([u'aa', u'bb', u'cc'], dtype='object'))
---
  class  weight  height  label
0  None       0       0     -1
1    aa      10     100      0
2    bb      20     200      1
3    bb      30     300      1
4    aa      40     400      0
5    aa      50     500      0
6  None      60     600     -1
7    cc      70     700      2


Note that the text categories are converted to unique numeric labels, while `None` items are labeled as -1.

## Filter out rows

In [4]:
# Keep every row whose class is not 'cc'. Rows with a missing class (None)
# compare as "not equal" and are therefore kept as well.
df = df.loc[df['class'] != 'cc']
print(df)

  class  weight  height  label
0  None       0       0     -1
1    aa      10     100      0
2    bb      20     200      1
3    bb      30     300      1
4    aa      40     400      0
5    aa      50     500      0
6  None      60     600     -1


In [5]:
# Missing values (None) cannot be matched with ==/!=; use the null-aware
# helpers instead. notnull() is the complement of isnull().
df = df[df['class'].notnull()]
print(df)

  class  weight  height  label
1    aa      10     100      0
2    bb      20     200      1
3    bb      30     300      1
4    aa      40     400      0
5    aa      50     500      0


## Delete a column

In [6]:
# Remove the 'class' column from df in place.
# Not idempotent: running this cell a second time fails because the column
# no longer exists.
del df['class']

## Sort values by column

In [7]:
# Order the rows by the 'label' column, largest label first.
# The original index labels travel with their rows.
df = df.sort_values(by='label', ascending=False)
print(df)

   weight  height  label
2      20     200      1
3      30     300      1
1      10     100      0
4      40     400      0
5      50     500      0


## Basic column info and stats

In [8]:
# shape is a (number of rows, number of columns) tuple.
print(df.shape)

(5, 3)


In [9]:
# dtypes lists the data type of each column.
print(df.dtypes)

weight    int64
height    int64
label     int64
dtype: object


In [10]:
# describe() reports per-column summary statistics:
# count, mean, std, min, quartiles and max.
print(df.describe())

          weight      height     label
count   5.000000    5.000000  5.000000
mean   30.000000  300.000000  0.400000
std    15.811388  158.113883  0.547723
min    10.000000  100.000000  0.000000
25%    20.000000  200.000000  0.000000
50%    30.000000  300.000000  0.000000
75%    40.000000  400.000000  1.000000
max    50.000000  500.000000  1.000000


In [11]:
# count() gives the number of non-null values in each column.
print(df.count())

weight    5
height    5
label     5
dtype: int64


In [12]:
# info() writes its report (index range, per-column non-null counts and dtypes,
# memory usage) straight to stdout and returns None, so it is called directly:
# wrapping it in print() would add a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 2 to 5
Data columns (total 3 columns):
weight    5 non-null int64
height    5 non-null int64
label     5 non-null int64
dtypes: int64(3)
memory usage: 160.0 bytes
None


## Change dataframe column order

In [13]:
# Show the dataframe with its current column order (weight, height, label)
# for comparison with the reordered result.
print(df)

   weight  height  label
2      20     200      1
3      30     300      1
1      10     100      0
4      40     400      0
5      50     500      0


In [14]:
# Desired left-to-right column order.
NEW_COLS = ['label', 'height', 'weight']
# Select all rows and the columns in the order listed; the data is unchanged,
# only the column arrangement differs.
df = df.loc[:, NEW_COLS]
print(df)

   label  height  weight
2      1     200      20
3      1     300      30
1      0     100      10
4      0     400      40
5      0     500      50


## Access elements in the dataframe

### Access value at index position:

In [15]:
# iloc is position-based and zero-indexed: row position 4 is the fifth row,
# column position 1 is the second column ('height').
print(df.iloc[4, 1])

500


In [16]:
# Position-based slice: every row, column positions 0 and 1 (the first two columns).
print(df.iloc[:, :2])

   label  height
2      1     200
3      1     300
1      0     100
4      0     400
5      0     500


In [17]:
# Position-based slice: every row except the last one, taken from the last column.
print(df.iloc[:-1, -1])

2    20
3    30
1    10
4    40
Name: weight, dtype: int64


### Access value by name:

In [18]:
# loc is label-based: this reads the 'height' value of the row whose index
# label is 4 (a label lookup, not a position). Row and column are given in a
# single .loc call rather than chaining df['height'].loc[4].
print(df.loc[4, 'height'])

400


## Reset index in the dataframe

In [19]:
# Renumber the rows 0..n-1 in their current order. drop=True discards the old
# index labels instead of keeping them as a new column.
df = df.reset_index(drop=True)
print(df)

   label  height  weight
0      1     200      20
1      1     300      30
2      0     100      10
3      0     400      40
4      0     500      50


## Boolean filter with multiple criteria

In [20]:
# Build all boolean masks first. Each comparison is wrapped in parentheses
# because & and | bind more tightly than ==, > and <.
F1 = (df['label'] == 0) & (df['weight'] > 25)
F2 = (df['label'] == 1) & (df['height'] < 250)
OR_FILTER = F1 | F2       # rows matching either AND-filter
NOT_FILTER = ~OR_FILTER   # rows matching neither

# Apply each mask in turn and show the selected rows.
print("Logical AND examples:")
print(df.loc[F1])
print("---")
print(df.loc[F2])
print("---")

print("Logical OR example:")
print(df.loc[OR_FILTER])
print("---")

print("Logical NOT example:")
print(df.loc[NOT_FILTER])

Logical AND examples:
   label  height  weight
3      0     400      40
4      0     500      50
---
   label  height  weight
0      1     200      20
---
Logical OR example:
   label  height  weight
0      1     200      20
3      0     400      40
4      0     500      50
---
Logical NOT example:
   label  height  weight
1      1     300      30
2      0     100      10
