In [1]:
import numpy as np
import pandas as pd

# Cleaning tools

Some data cleaning tools for analysis.

## Missing data

`isna`, `dropna`, and `fillna`.

In [2]:
# Two real strings followed by two kinds of "missing" marker (None and np.nan),
# so the cells below can show that pandas treats both as missing.
ser = pd.Series(
    data=[
        'Piggy',
        'Is',
        None,
        np.nan,
    ]
)
ser

0    Piggy
1       Is
2     None
3      NaN
dtype: object

In [3]:
# Boolean mask of missing entries: both the None and the np.nan element are reported as True.
ser.isna()

0    False
1    False
2     True
3     True
dtype: bool

In [4]:
# Returns a new Series with every missing entry replaced by 'Napoleon';
# ser itself is not modified (the next cell still sees the missing values).
ser.fillna('Napoleon')

0       Piggy
1          Is
2    Napoleon
3    Napoleon
dtype: object

In [5]:
# Returns a new Series without the missing entries; surviving rows keep their original index labels.
ser.dropna()

0    Piggy
1       Is
dtype: object

## Duplicate data

In [6]:
# Small demo frame with deliberately repeated (k1, k2) pairs; reused by later sections.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two", "one"],
        "k2": [1, 1, 2, 4, 3, 3, 4, 1],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,4
4,one,3
5,two,3
6,two,4
7,one,1


In [7]:
# True for a row whose (k1, k2) pair already appeared in an earlier row; first occurrences stay False.
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
dtype: bool

In [8]:
# Only k2 is compared here, so the first row seen for each distinct k2 value is the one kept.
data.drop_duplicates(subset='k2')

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,4
4,one,3


## `map`

`map` takes either a dictionary or a function and applies it to each element of a Series.

In [9]:
# English -> Spanish lookup table; map() translates every k1 value through it.
eng_to_esp = dict(one='uno', two='dos')
data['k1'].map(eng_to_esp)

0    uno
1    dos
2    uno
3    dos
4    uno
5    dos
6    dos
7    uno
Name: k1, dtype: object

In [10]:
# Same translation done with a function instead of a dict.
# Note: the direct eng_to_esp[x] lookup raises KeyError for any value that is not a key of the dict.
data['k1'].map(lambda x: eng_to_esp[x])

0    uno
1    dos
2    uno
3    dos
4    uno
5    dos
6    dos
7    uno
Name: k1, dtype: object

## `replace`

Sometimes data uses a sentinel value other than NA to mark missing values, such as -inf. `isna` cannot detect these sentinels; `replace` comes to the rescue.

`to_replace` argument also takes regex, list, Series, dict, etc.

In [11]:
# Draw from a seeded generator so the values shown below are reproducible on a fresh
# kernel (Restart & Run All) instead of changing on every run.
ser2 = pd.Series(np.random.default_rng(42).standard_normal(10))
# Explicit positional indexing: rows 2, 3 and 4 receive the -inf sentinel.
ser2.iloc[2:5] = -np.inf
ser2

0    0.047515
1    0.190463
2        -inf
3        -inf
4        -inf
5   -1.435184
6    2.118561
7   -0.773494
8   -1.257205
9    1.290710
dtype: float64

In [12]:
# -inf is an ordinary float value rather than NA, so every entry comes back False.
ser2.isna()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool

In [13]:
# Substitute 0 for every -inf sentinel value.
ser2.replace(-np.inf, 0)

0    0.047515
1    0.190463
2    0.000000
3    0.000000
4    0.000000
5   -1.435184
6    2.118561
7   -0.773494
8   -1.257205
9    1.290710
dtype: float64

## Rename

Renaming the index and columns is easy.

In [14]:
# index: the dict relabels only rows 0 and 1; all other labels are left unchanged.
# columns: the function is applied to every column name (k1 -> K1, k2 -> K2).
data.rename(index={0: 'first', 1: 'second'},
            columns=str.capitalize)

Unnamed: 0,K1,K2
first,one,1
second,two,1
2,one,2
3,two,4
4,one,3
5,two,3
6,two,4
7,one,1


## Bin

Binning into categories from continuous variables.

More on https://wesmckinney.com/book/data-cleaning#prep_discretization

In [15]:
# 100 synthetic ages drawn from N(40, 10) and truncated to integers.
# A seeded generator keeps the sample, and every bin/quantile derived from it
# in later cells, reproducible on a fresh kernel.
ages = np.random.default_rng(0).normal(40, scale=10, size=100).astype(int)
ages[:5]

array([38, 42, 40, 35, 32])

In [16]:
# Sanity check: the sample mean and standard deviation should sit near the requested 40 and 10.
ages.mean(), ages.std()

(39.63, 10.631702591777106)

In [17]:
# Bin edges; consecutive edges become right-closed intervals such as (10, 18] and (18, 25].
bins = [10, 18, 25, 40, 60, 100]
# age_cat is reused in the "Dummies and bins" section further down.
age_cat = pd.cut(ages, bins)
age_cat

[(25, 40], (40, 60], (25, 40], (25, 40], (25, 40], ..., (10, 18], (40, 60], (40, 60], (25, 40], (25, 40]]
Length: 100
Categories (5, interval[int64, right]): [(10, 18] < (18, 25] < (25, 40] < (40, 60] < (60, 100]]

Into 4 quartiles (quantile-based bins).

In [18]:
# Split the ages into 4 quantile-based bins; the edges come from the data rather than from a fixed list.
pd.qcut(ages, 4)

[(31.0, 40.0], (40.0, 46.0], (31.0, 40.0], (31.0, 40.0], (31.0, 40.0], ..., (16.999, 31.0], (40.0, 46.0], (46.0, 61.0], (16.999, 31.0], (31.0, 40.0]]
Length: 100
Categories (4, interval[float64, right]): [(16.999, 31.0] < (31.0, 40.0] < (40.0, 46.0] < (46.0, 61.0]]

## Outliers

How to cap outliers? NumPy has `clip`, and PyTorch has `clamp`.

In [19]:
# Cap the values to the range [20, 40]: anything below 20 becomes 20, anything above 40 becomes 40.
ages.clip(20, 40)

array([38, 40, 40, 35, 32, 20, 40, 40, 27, 40, 28, 21, 29, 25, 36, 40, 34,
       39, 40, 40, 40, 34, 40, 26, 40, 38, 23, 26, 40, 40, 31, 29, 40, 40,
       31, 38, 29, 40, 24, 40, 40, 40, 40, 23, 29, 39, 28, 40, 40, 40, 40,
       40, 40, 40, 40, 40, 40, 40, 40, 40, 27, 40, 40, 40, 40, 39, 39, 40,
       28, 26, 40, 40, 40, 34, 40, 40, 30, 40, 35, 35, 40, 40, 23, 36, 40,
       40, 40, 40, 40, 29, 32, 21, 34, 40, 37, 20, 40, 40, 26, 38])

## Random sampling

Maybe useful for data augmentation purposes?

In [20]:
# Draw 3 rows at random (without replacement); random_state pins the draw so the
# displayed sample is the same after Restart & Run All.
data.sample(n=3, random_state=42)

Unnamed: 0,k1,k2
6,two,4
0,one,1
5,two,3


In [21]:
# Manual equivalent of sample(): shuffle ALL row positions, then keep the first 3.
# (Permuting only range(3) would merely reorder rows 0-2 and could never pick rows 3-7.)
# The seeded generator makes the selection reproducible.
data.iloc[np.random.default_rng(42).permutation(len(data))[:3]]

Unnamed: 0,k1,k2
2,one,2
1,two,1
0,one,1


## Dummy variables

In [22]:
# One indicator column per distinct k1 value, named with the 'k1' prefix;
# dtype=float yields 1.0/0.0 indicators.
pd.get_dummies(data['k1'], dtype=float, prefix='k1')

Unnamed: 0,k1_one,k1_two
0,1.0,0.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0
5,0.0,1.0
6,0.0,1.0
7,1.0,0.0


A handy string-manipulation trick: here several genres are packed into a single column, separated by `|`, but it is more useful to split them into separate indicator columns. `str.get_dummies` does exactly that.

In [23]:
# header=None means no row of the file is used as a header, so names= supplies the column labels.
# NOTE(review): engine='python' is presumably required by the multi-character '::' separator — confirm.
movies = pd.read_table('data/movies.dat', sep='::', header=None,
                       names=['movie_id', 'title', 'genres'], engine='python')
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [24]:
# Split each genres string on '|' and build one 0/1 indicator column per distinct genre.
movies['genres'].str.get_dummies('|')

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


### Dummies and bins

Using the age categories we created earlier, we can build dummy variables.

In [25]:
# Redisplay the binned ages that were built with pd.cut in the "Bin" section.
age_cat

[(25, 40], (40, 60], (25, 40], (25, 40], (25, 40], ..., (10, 18], (40, 60], (40, 60], (25, 40], (25, 40]]
Length: 100
Categories (5, interval[int64, right]): [(10, 18] < (18, 25] < (25, 40] < (40, 60] < (60, 100]]

In [26]:
# One indicator column per age interval; with no dtype given, the indicators shown here are booleans.
pd.get_dummies(age_cat)

Unnamed: 0,"(10, 18]","(18, 25]","(25, 40]","(40, 60]","(60, 100]"
0,False,False,True,False,False
1,False,False,False,True,False
2,False,False,True,False,False
3,False,False,True,False,False
4,False,False,True,False,False
...,...,...,...,...,...
95,True,False,False,False,False
96,False,False,False,True,False
97,False,False,False,True,False
98,False,False,True,False,False


### pandas extension data types

pandas has its own data types to deal with real-world data. 

 - `pd.NA`
 - `pd.CategoricalDtype`

In [27]:
# pandas' own missing-value marker; it displays as <NA>.
pd.NA

<NA>

In [28]:
# Displays the class itself, shown by its fully qualified name.
pd.CategoricalDtype

pandas.core.dtypes.dtypes.CategoricalDtype

In [29]:
# Built-in str.find returns the index at which the substring starts (2 here).
'abcd'.find('c')

2

### String functions

pandas string functions skip NaN values: missing entries are passed through as NaN instead of raising an error.

In [33]:
# Phone numbers keyed by name; 'Shadow' is deliberately missing so the string
# cells below can show how NaN is handled.
ser3 = pd.Series(
    {
        'Piggy': '555-6465',
        'Kitty': '555-6369',
        'Puppy': '555-2275',
        'Shadow': np.nan,
    }
)
ser3

Piggy     555-6465
Kitty     555-6369
Puppy     555-2275
Shadow         NaN
dtype: object

In [34]:
# Only the 'Shadow' entry is missing.
ser3.isna()

Piggy     False
Kitty     False
Puppy     False
Shadow     True
dtype: bool

In [35]:
# The missing entry is passed through as NaN rather than True/False, so the result dtype is object, not bool.
ser3.str.contains('555')

Piggy     True
Kitty     True
Puppy     True
Shadow     NaN
dtype: object

Regex for the phone number matching.

In [36]:
# Two capture groups: the digits before the dash and the digits after it.
pattern = r'([0-9]+)-([0-9]+)'
# Per element, findall gives a list of (group 1, group 2) tuples; the missing entry stays NaN.
ser3.str.findall(pattern)

Piggy     [(555, 6465)]
Kitty     [(555, 6369)]
Puppy     [(555, 2275)]
Shadow              NaN
dtype: object

In [41]:
# .str[0] takes the first item of each list, leaving a single (prefix, number) tuple per row.
matches = ser3.str.findall(pattern).str[0]
matches

Piggy     (555, 6465)
Kitty     (555, 6369)
Puppy     (555, 2275)
Shadow            NaN
dtype: object

In [42]:
# .str.get(1) picks element 1 of each tuple, i.e. the digits after the dash.
matches.str.get(1)

Piggy     6465
Kitty     6369
Puppy     2275
Shadow     NaN
dtype: object

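The `.str` accessor applies an operation to every element of the Series. Besides the usual string methods, indexing and `get` also work on list- or tuple-valued elements (as with `matches` above), and missing values are passed through as NaN.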

In [46]:
# Slice every string from position 2 onward; the missing entry stays NaN.
ser3.str[2:]

Piggy     5-6465
Kitty     5-6369
Puppy     5-2275
Shadow       NaN
dtype: object

#### `extract`: Cool trick to turn it into a dataframe.

In [47]:
# One column per capture group in pattern and one row per element; the missing entry yields an all-NaN row.
ser3.str.extract(pattern)

Unnamed: 0,0,1
Piggy,555.0,6465.0
Kitty,555.0,6369.0
Puppy,555.0,2275.0
Shadow,,
