# Data Cleaning and Preparation

In [2]:
# Notebook-wide setup: all imports first, then display and RNG configuration.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Remember the pandas row limit that was active before this notebook lowers it.
PREVIOUS_MAX_ROWS = pd.get_option("display.max_rows")
pd.set_option("display.max_rows", 20)

# Seed NumPy's legacy global random generator.
np.random.seed(12345)

# Default figure size and compact array printing (4 decimals, no scientific notation).
plt.rcParams["figure.figsize"] = (10, 6)
np.set_printoptions(precision=4, suppress=True)

In [5]:
# Float Series with a single missing entry (NaN) at position 2.
float_data = pd.Series(
    [1.2, -3.5, np.nan, 0]
)
# Element-wise mask: True where the value is missing.
pd.isna(float_data)

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
# Complement of isna(): True where a value is present.
pd.notna(float_data)

0     True
1     True
2    False
3     True
dtype: bool

In [13]:
# Series of strings containing two different missing markers: NaN and None.
string_data = pd.Series(
    ["aardvark", np.nan, None, "avocado"]
)
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [14]:
# Both NaN and None are reported as missing.
pd.isna(string_data)

0    False
1     True
2     True
3    False
dtype: bool

In [16]:
# Replace every missing entry with the placeholder "kk" rather than discarding
# it (the alternative would be string_data.dropna()).
string_data = string_data.fillna(value="kk")
string_data

0    aardvark
1          kk
2          kk
3     avocado
dtype: object

## Filtering out Missing Data

In [20]:
# Float Series with gaps at positions 1 and 3.
data = pd.Series(
    [1, np.nan, 3.5, np.nan, 7]
)
# Show only the non-missing values; `data` is not reassigned here.
data.dropna(axis=0)

0    1.0
2    3.5
4    7.0
dtype: float64

In [23]:
# Same filter as dropna(), written as a boolean mask.
# Careful: this is not in-place -- `data` itself is not modified.
data.loc[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [25]:
# 4x3 frame covering the missing-data patterns used below: a complete row,
# a partly missing row, an entirely missing row, and a row missing only its
# first value.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [26]:
# Rebuild the example frame, then drop every row that contains at least one
# missing value (dropna's default, how="any"); only row 0 is complete.
# The pasted IPython continuation prompt ("....:") that made this cell a
# SyntaxError has been removed.
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [29]:
# Drop only the rows in which every value is missing.
data.dropna(axis="index", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [30]:
# Drop each column that contains at least one missing value.
data.dropna(axis="columns", how="any")

0
1
2
3


In [32]:
# Drop only the columns in which every value is missing.
data.dropna(how="all", axis="columns")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [34]:
# Add a column labelled 4 whose value is missing in every row.
data.loc[:, 4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [37]:
# Drop the columns that are missing in every row (column 4, added above as all-NaN).
data.dropna(how="all", axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [64]:
# 7x3 standard-normal sample with NaNs written into the top of columns 1 and 2.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
df.loc[:3, 1] = np.nan  # rows 0-3 of column 1 (label slices include the end)
df.loc[:1, 2] = np.nan  # rows 0-1 of column 2
df

Unnamed: 0,0,1,2
0,-0.434788,,
1,2.128746,,
2,0.700428,,-0.136972
3,-0.930489,,1.303013
4,-1.409402,-0.144126,-0.716414
5,0.103614,-1.495719,-1.174894
6,2.613999,-0.689307,-0.751653


In [72]:
# Keep only rows with at least 3 non-missing values; with 3 columns that
# means fully populated rows.
df.dropna(axis="index", thresh=3)

Unnamed: 0,0,1,2
4,-1.409402,-0.144126,-0.716414
5,0.103614,-1.495719,-1.174894
6,2.613999,-0.689307,-0.751653


## Filling in Missing Data

## Conclusion