### data cleaning and preparation

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Handling missing data
# A small float Series whose third entry is missing (NaN).
float_data = pd.Series([1.2, -3.5, np.nan, 0.0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [4]:
# Boolean mask: True at the positions where the value is missing (NaN).
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# A Series of strings with two different "missing" markers mixed in.
string_data = pd.Series(
    [
        "aardvark",
        np.nan,  # float NaN sentinel
        None,    # Python null
        "avocado",
    ]
)
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [6]:
# isna() flags both the np.nan entry and the None entry as missing.
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [7]:
# With an explicit float64 dtype, the None entry is reported as missing too.
float_data = pd.Series([1, 2, None], dtype="float64")
float_data.isna()

0    False
1    False
2     True
dtype: bool

In [11]:
# Filtering out missing data: dropna() returns only the non-null entries,
# keeping their original index labels.
data = pd.Series([1.0, np.nan, 3.5, np.nan, 7.0])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
# Boolean indexing with notna() selects the same entries as data.dropna().
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
# 4x3 frame with a different missing-value pattern in each row:
#   row 0 - complete
#   row 1 - only the first column present
#   row 2 - entirely NaN
#   row 3 - only the first column missing
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)

data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [14]:
# With no arguments, dropna() removes every row that contains any NaN.
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [15]:
# how="all" drops only the rows in which every value is NaN.
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [16]:
# Seeded generator: without a fixed seed every run draws different numbers,
# so Restart & Run All could never reproduce the same frame.
rng = np.random.default_rng(seed=12345)
df = pd.DataFrame(rng.standard_normal((7, 3)))

# Blank out values to create a staggered missing-data pattern:
# column 1 is NaN in rows 0-3, column 2 is NaN in rows 0-1.
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.536231,,
1,-0.619244,,
2,-0.171837,,0.894559
3,0.911832,,-1.857937
4,1.128786,0.280522,0.374725
5,1.608914,-0.214716,0.881054
6,0.413324,-1.493189,-1.760792


In [17]:
# Default dropna(): only the rows with no NaN at all remain.
df.dropna()

Unnamed: 0,0,1,2
4,1.128786,0.280522,0.374725
5,1.608914,-0.214716,0.881054
6,0.413324,-1.493189,-1.760792


In [18]:
# thresh=2 keeps the rows that have at least two non-NaN values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.171837,,0.894559
3,0.911832,,-1.857937
4,1.128786,0.280522,0.374725
5,1.608914,-0.214716,0.881054
6,0.413324,-1.493189,-1.760792


In [19]:
# Filling in missing data: replace every NaN in the frame with one constant.
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.536231,0.0,0.0
1,-0.619244,0.0,0.0
2,-0.171837,0.0,0.894559
3,0.911832,0.0,-1.857937
4,1.128786,0.280522,0.374725
5,1.608914,-0.214716,0.881054
6,0.413324,-1.493189,-1.760792


In [20]:
# A dict maps column label -> fill value, so each column gets its own constant
# (column 0 is not listed and is left untouched).
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.536231,0.5,0.0
1,-0.619244,0.5,0.0
2,-0.171837,0.5,0.894559
3,0.911832,0.5,-1.857937
4,1.128786,0.280522,0.374725
5,1.608914,-0.214716,0.881054
6,0.413324,-1.493189,-1.760792


In [21]:
# Seeded generator: without a fixed seed every run draws different numbers,
# so Restart & Run All could never reproduce the same frame.
rng = np.random.default_rng(seed=67890)
df = pd.DataFrame(rng.standard_normal((6, 3)))

# Blank out the tails of two columns so forward-fill has gaps to propagate into:
# column 1 is NaN from row 2 onward, column 2 is NaN from row 4 onward.
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-1.599068,-0.320055,0.065829
1,0.074563,0.726999,1.840838
2,-0.777226,,0.762889
3,1.744165,,-1.636673
4,0.236752,,
5,1.023005,,


In [24]:
# Forward-fill: each NaN takes the last non-null value above it in its column.
df.ffill()

Unnamed: 0,0,1,2
0,-1.599068,-0.320055,0.065829
1,0.074563,0.726999,1.840838
2,-0.777226,0.726999,0.762889
3,1.744165,0.726999,-1.636673
4,0.236752,0.726999,-1.636673
5,1.023005,0.726999,-1.636673


In [25]:
# Forward-fill, but carry each value across at most two consecutive NaNs;
# any further gaps in the same run stay missing.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-1.599068,-0.320055,0.065829
1,0.074563,0.726999,1.840838
2,-0.777226,0.726999,0.762889
3,1.744165,0.726999,-1.636673
4,0.236752,,-1.636673
5,1.023005,,-1.636673


In [26]:
# Impute the missing entries with the mean of the observed values.
data = pd.Series([1.0, np.nan, 3.5, np.nan, 7.0])
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [30]:
# Data transformation
# Example frame whose last row repeats the row before it (k1="two", k2=4).
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)

data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [31]:
# True for each row that exactly repeats an earlier row.
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [32]:
# Keep only the first occurrence of each distinct row.
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [33]:
# Add a column of consecutive integers so that rows with equal (k1, k2)
# can still be told apart. The length comes from the frame itself rather
# than a hard-coded 7, so this cell keeps working if the frame defined in
# the earlier cell changes size.
data["v1"] = range(len(data))
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [35]:
# De-duplicate on the (k1, k2) pair only, keeping the last occurrence of
# each pair instead of the first.
data.drop_duplicates(subset=["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [None]:
# transforming data using a function or mapping