# Examples in pandas for basic operations on dataframes

## Setup lists

In [1]:
# Setup
import pandas as pd
from collections import OrderedDict

# Sample data built with list comprehensions
# Class labels; the two None entries stand in for missing values.
classes = [None, 'a', 'b', 'b', 'a', 'a', None, 'c']
# One weight and one height per class entry, scaled from the entry's position.
weights = [position * 10 for position, _ in enumerate(classes)]
heights = [position * 100 for position, _ in enumerate(classes)]

print("Lists:")
for values in (classes, weights, heights):
    print(values)

Lists:
[None, 'a', 'b', 'b', 'a', 'a', None, 'c']
[0, 10, 20, 30, 40, 50, 60, 70]
[0, 100, 200, 300, 400, 500, 600, 700]


## Build a dataframe from the lists

In [2]:
# Map each column name to its list of values.
# The mapping is built from (name, values) pairs rather than by wrapping a
# plain dict literal: a plain dict only keeps the literal's order on
# interpreters whose dicts are insertion-ordered, whereas an OrderedDict built
# from pairs pins the column order (class, weight, height) everywhere.
di_all = OrderedDict([
    ('class', classes),
    ('weight', weights),
    ('height', heights),
])

df = pd.DataFrame(di_all)
print(df)

  class  weight  height
0  None       0       0
1     a      10     100
2     b      20     200
3     b      30     300
4     a      40     400
5     a      50     500
6  None      60     600
7     c      70     700


## Identify distinct values

In [3]:
# factorize() returns an integer code per row (labels) together with the
# distinct values those codes refer to (uniques).
labels, uniques = pd.factorize(df['class'])
# Each line is printed as one pre-formatted string: passing several arguments
# to print(...) is rendered as a tuple when print is a statement (Python 2).
print("labels: {}".format(labels))
print("---")
print("uniques: {}".format(uniques))

('labels:', array([-1,  0,  1,  1,  0,  0, -1,  2]))
---
('uniques:', Index([u'a', u'b', u'c'], dtype='object'))


Note that each text category is converted to a unique numeric label, and that `None` items are labeled as -1.

## Filter out rows

In [4]:
# Keep every row whose class differs from 'c'.
# Rows whose class is None also satisfy this test, so they are kept here.
is_not_c = df['class'] != 'c'
df = df.loc[is_not_c]
print(df)

  class  weight  height
0  None       0       0
1     a      10     100
2     b      20     200
3     b      30     300
4     a      40     400
5     a      50     500
6  None      60     600


In [5]:
# None entries cannot be matched with an ordinary comparison; select the rows
# whose class is present using the null-aware test instead.
has_class = df['class'].notnull()
df = df[has_class]
print(df)

  class  weight  height
1     a      10     100
2     b      20     200
3     b      30     300
4     a      40     400
5     a      50     500


## Sort values by column

In [6]:
# Sort rows by class in descending order.
# A stable algorithm is requested explicitly so that rows sharing the same
# class keep their existing relative order; the default sort kind gives no
# such guarantee, and the positional lookups further down depend on it.
df = df.sort_values('class', ascending=False, kind='mergesort')
print(df)

  class  weight  height
2     b      20     200
3     b      30     300
1     a      10     100
4     a      40     400
5     a      50     500


## Basic column info and stats

In [7]:
# (number of rows, number of columns)
frame_dims = df.shape
print(frame_dims)

(5, 3)


In [8]:
# Data type of each column
column_types = df.dtypes
print(column_types)

class     object
weight     int64
height     int64
dtype: object


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = df.describe()
print(summary_stats)

          weight      height
count   5.000000    5.000000
mean   30.000000  300.000000
std    15.811388  158.113883
min    10.000000  100.000000
25%    20.000000  200.000000
50%    30.000000  300.000000
75%    40.000000  400.000000
max    50.000000  500.000000


In [10]:
# Number of non-missing entries in each column
non_null_counts = df.count()
print(non_null_counts)

class     5
weight    5
height    5
dtype: int64


In [11]:
# info() writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in print() would add a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 2 to 5
Data columns (total 3 columns):
class     5 non-null object
weight    5 non-null int64
height    5 non-null int64
dtypes: int64(2), object(1)
memory usage: 160.0+ bytes
None


## Change dataframe column order

In [12]:
# Show the frame with its current column order (class, weight, height)
# so it can be compared with the reordered frame in the next cell.
print(df)

  class  weight  height
2     b      20     200
3     b      30     300
1     a      10     100
4     a      40     400
5     a      50     500


In [13]:
# Desired left-to-right column order
NEW_COLS = ['class', 'height', 'weight']
# Selecting with a list of names returns the columns in the order listed.
df = df.loc[:, NEW_COLS]
print(df)

  class  height  weight
2     b     200      20
3     b     300      30
1     a     100      10
4     a     400      40
5     a     500      50


## Access elements in the dataframe

### Access value at index position:

In [14]:
# iloc is purely positional and zero-based:
# position 4 is the fifth row, position 1 is the second column.
row_pos, col_pos = 4, 1
print(df.iloc[row_pos, col_pos])

500


In [15]:
# Positional slice: every row, columns at positions 0 and 1
first_two_cols = df.iloc[:, :2]
print(first_two_cols)

  class  height
2     b     200
3     b     300
1     a     100
4     a     400
5     a     500


In [16]:
# Positional slice: every row except the final one, taken from the last column
last_col_without_final_row = df.iloc[:-1, -1]
print(last_col_without_final_row)

2    20
3    30
1    10
4    40
Name: weight, dtype: int64


### Access value by name:

In [17]:
# loc looks up by label, not position: this is the 'height' value of the row
# whose index label is 4, wherever that row currently sits in the frame.
print(df.loc[4, 'height'])

400


## Reset index in the dataframe

In [18]:
# Replace the row labels with a fresh 0..n-1 numbering; drop=True discards the
# old labels instead of keeping them as an extra column.
renumbered = df.reset_index(drop=True)
df = renumbered
print(df)

  class  height  weight
0     b     200      20
1     b     300      30
2     a     100      10
3     a     400      40
4     a     500      50


## Boolean filter with multiple criteria

In [19]:
# Boolean masks are combined element-wise with & (and), | (or) and ~ (not).
print("Logical AND examples:")
# Class 'a' rows that weigh more than 25
is_class_a = df['class'] == 'a'
is_heavy = df['weight'] > 25
F1 = is_class_a & is_heavy
# Class 'b' rows shorter than 250
is_class_b = df['class'] == 'b'
is_short = df['height'] < 250
F2 = is_class_b & is_short
for and_mask in (F1, F2):
    print(df[and_mask])
    print("---")

print("Logical OR example:")
# Rows matching either of the two AND filters
OR_FILTER = F1 | F2
print(df[OR_FILTER])
print("---")

print("Logical NOT example:")
# Rows matching neither of the two AND filters
NOT_FILTER = ~OR_FILTER
print(df[NOT_FILTER])

Logical AND examples:
  class  height  weight
3     a     400      40
4     a     500      50
---
  class  height  weight
0     b     200      20
---
Logical OR example:
  class  height  weight
0     b     200      20
3     a     400      40
4     a     500      50
---
Logical NOT example:
  class  height  weight
1     b     300      30
2     a     100      10
