In [1]:
import numpy as np
import pandas as pd

# Cleaning tools

Some data cleaning tools for analysis.

## Missing data

`isna`, `dropna`, and `fillna`.

In [2]:
# Two real strings followed by two flavours of "missing": None and NaN.
ser = pd.Series(
    ["Piggy", "Is", None, np.nan]
)
ser

0    Piggy
1       Is
2     None
3      NaN
dtype: object

In [3]:
# Flag missing entries; None and NaN are both reported as NA.
ser.isna()

0    False
1    False
2     True
3     True
dtype: bool

In [19]:
# Fill every missing entry with a placeholder string.
ser.fillna('Napoleon')

0       Piggy
1          Is
2    Napoleon
3    Napoleon
dtype: object

In [6]:
# Drop the missing entries, keeping only the rows that hold real values.
ser.dropna()

0    Piggy
1       Is
dtype: object

## Duplicate data

In [12]:
# Small two-column frame whose last two rows repeat earlier rows,
# used by the duplicate-handling examples.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two", "one"],
        "k2": [1, 1, 2, 4, 3, 3, 4, 1],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,4
4,one,3
5,two,3
6,two,4
7,one,1


In [13]:
# Mark rows that repeat an earlier row across all columns (rows 6 and 7 here).
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
dtype: bool

In [15]:
# De-duplicate on `k2` only: the first row seen for each `k2` value is kept.
data.drop_duplicates(subset='k2')

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,4
4,one,3


## `map`

`map` accepts either a dictionary or a function and applies it element-wise.

In [21]:
# English -> Spanish lookup table used to translate the `k1` labels.
eng_to_esp = dict(one="uno", two="dos")
data["k1"].map(eng_to_esp)

0    uno
1    dos
2    uno
3    dos
4    uno
5    dos
6    dos
7    uno
Name: k1, dtype: object

In [22]:
# Same translation, this time passing a function instead of the dict itself.
# Note: `eng_to_esp[x]` raises KeyError for any value missing from the dict.
data['k1'].map(lambda x: eng_to_esp[x])

0    uno
1    dos
2    uno
3    dos
4    uno
5    dos
6    dos
7    uno
Name: k1, dtype: object

## `replace`

Sometimes data uses sentinel values other than NA to mark missing values, such as -inf. `isna` cannot detect them; `replace` comes to the rescue.

The `to_replace` argument also accepts a regex, list, Series, dict, etc.

In [26]:
# Ten standard-normal draws, with positions 2-4 overwritten by the
# sentinel -inf to mimic "missing" values that are not NA.
ser2 = pd.Series(np.random.standard_normal(10))
ser2.iloc[2:5] = -np.inf
ser2

0   -0.252969
1   -1.342290
2        -inf
3        -inf
4        -inf
5   -0.900209
6   -0.036915
7   -0.580921
8    0.346100
9   -0.534716
dtype: float64

In [27]:
# -inf is an ordinary float value, so `isna` reports nothing as missing here.
ser2.isna()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool

In [28]:
# Swap the -inf sentinel for 0; every other value is left untouched.
ser2.replace(to_replace=-np.inf, value=0)

0   -0.252969
1   -1.342290
2    0.000000
3    0.000000
4    0.000000
5   -0.900209
6   -0.036915
7   -0.580921
8    0.346100
9   -0.534716
dtype: float64

## Rename

Renaming the index and columns is easy.

In [34]:
# `index` takes a label -> new-label mapping (labels not listed are kept as-is);
# `columns` takes a function that is applied to every column label.
data.rename(index={0: 'first', 1: 'second'},
            columns=str.capitalize)

Unnamed: 0,K1,K2
first,one,1
second,two,1
2,one,2
3,two,4
4,one,3
5,two,3
6,two,4
7,one,1


## Bin

Binning continuous variables into categories.

More on https://wesmckinney.com/book/data-cleaning#prep_discretization

In [40]:
# 100 synthetic ages drawn from a normal distribution (mean 40, std 10),
# cast to whole years (float -> int truncates toward zero).
ages = np.random.normal(loc=40, scale=10, size=100).astype(int)
ages[:5]

array([45, 22, 39, 36, 33])

In [41]:
# Sanity check: the sample mean and std should land near the 40 and 10
# that were used to generate `ages`.
ages.mean(), ages.std()

(39.56, 10.053178601815446)

In [42]:
# Bin edges in years; as the output shows, each interval is right-closed,
# e.g. (18, 25] contains 25 but not 18.
bins = [10, 18, 25, 40, 60, 100]
age_cat = pd.cut(x=ages, bins=bins)
age_cat

[(40, 60], (18, 25], (25, 40], (25, 40], (25, 40], ..., (25, 40], (40, 60], (25, 40], (25, 40], (25, 40]]
Length: 100
Categories (5, interval[int64, right]): [(10, 18] < (18, 25] < (25, 40] < (40, 60] < (60, 100]]

`qcut` bins by sample quantiles instead; here, into 4 quartiles.

In [45]:
# Quantile-based binning: q=4 places the bin edges at the sample quartiles.
pd.qcut(x=ages, q=4)

[(39.5, 47.25], (17.999, 33.0], (33.0, 39.5], (33.0, 39.5], (17.999, 33.0], ..., (17.999, 33.0], (47.25, 67.0], (39.5, 47.25], (17.999, 33.0], (39.5, 47.25]]
Length: 100
Categories (4, interval[float64, right]): [(17.999, 33.0] < (33.0, 39.5] < (39.5, 47.25] < (47.25, 67.0]]

## Outliers

How to cap outliers? NumPy has `clip`, and PyTorch has `clamp`.

In [48]:
# Cap values to the range [20, 40]: anything below 20 becomes 20,
# anything above 40 becomes 40.
ages.clip(min=20, max=40)

array([40, 22, 39, 36, 33, 40, 40, 34, 36, 40, 40, 38, 40, 40, 23, 23, 40,
       40, 40, 40, 40, 32, 37, 28, 40, 28, 31, 32, 34, 21, 40, 35, 29, 38,
       40, 40, 40, 40, 36, 40, 37, 20, 40, 30, 38, 40, 25, 30, 36, 40, 33,
       40, 33, 40, 20, 29, 40, 38, 33, 24, 40, 29, 40, 40, 40, 40, 20, 40,
       40, 40, 40, 39, 40, 40, 40, 30, 40, 40, 40, 40, 40, 37, 35, 40, 38,
       40, 40, 36, 25, 38, 40, 34, 38, 40, 24, 30, 40, 40, 30, 40])

## Random sampling

Maybe useful for data augmentation purposes?

In [53]:
# Draw 3 rows at random from the frame.
data.sample(n=3)

Unnamed: 0,k1,k2
4,one,3
2,one,2
5,two,3


In [54]:
# Manual alternative to `data.sample(3)`: shuffle ALL row positions, then keep
# the first 3, so every row of the frame can be drawn.
# (`np.random.permutation(3)` on its own only reorders rows 0-2 and never
# touches the rest of the frame.)
data.iloc[np.random.permutation(len(data))[:3]]

Unnamed: 0,k1,k2
1,two,1
0,one,1
2,one,2


## Dummy variables

In [57]:
# One indicator column per distinct `k1` value, named `k1_<value>` and
# stored as floats (1.0 / 0.0).
pd.get_dummies(data['k1'], dtype=float, prefix='k1')

Unnamed: 0,k1_one,k1_two
0,1.0,0.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0
5,0.0,1.0
6,0.0,1.0
7,1.0,0.0


A cool string-manipulation trick: the genres are packed into a single column, but it would be better to separate them into different columns. `str.get_dummies` does exactly that.

In [62]:
# Load the movies table: the file has no header row and separates fields
# with "::", so column names are supplied explicitly.
movies = pd.read_table(
    "data/movies.dat",
    sep="::",
    header=None,
    names=["movie_id", "title", "genres"],
    engine="python",  # multi-character separators need the Python parser
)
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [67]:
# Split each pipe-delimited genre string and emit one 0/1 column per genre.
movies["genres"].str.get_dummies(sep="|")

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


### Dummies and bins

Using the age categories we created earlier, we can create dummy variables.

In [69]:
# Reminder of the binned ages built with `pd.cut` earlier in the notebook.
age_cat

[(40, 60], (18, 25], (25, 40], (25, 40], (25, 40], ..., (25, 40], (40, 60], (25, 40], (25, 40], (25, 40]]
Length: 100
Categories (5, interval[int64, right]): [(10, 18] < (18, 25] < (25, 40] < (40, 60] < (60, 100]]

In [68]:
# One boolean indicator column per age interval.
pd.get_dummies(age_cat)

Unnamed: 0,"(10, 18]","(18, 25]","(25, 40]","(40, 60]","(60, 100]"
0,False,False,False,True,False
1,False,True,False,False,False
2,False,False,True,False,False
3,False,False,True,False,False
4,False,False,True,False,False
...,...,...,...,...,...
95,False,False,True,False,False
96,False,False,False,True,False
97,False,False,True,False,False
98,False,False,True,False,False
