In [31]:
import numpy as np
import pandas as pd

In [2]:
"""
None: Pythonic missing data
"""

# None objects as missing values
arr1 = np.array([1, np.nan, 3, 4])
print(arr1.dtype)

arr2 = np.array([1, 2, 3, 4])
print(arr2.dtype)

float64
int32


In [3]:
# Python objects are incompatible with numpy and pandas operations
arr1.sum()

arr2.sum()

10

In [4]:
"""
NaN: Missing Numerical Data
"""

arr3 = np.array([1, np.nan, 3, 4])
print(arr3.dtype)

float64


In [5]:
# Arithmetic with NaN will be another NaN

print(1 + np.nan)
print(0 * np.nan)
print(arr3.sum())

nan
nan
nan


In [6]:
# Special NumPy aggregation funcs that ignore these missing values
print(np.nansum(arr3))
print(np.nanmax(arr3))
print(np.nanmin(arr3))

8.0
4.0
1.0


In [7]:
# Pandas automatically converts the None to a NaN value.

pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [8]:
"""
Detecting null values
"""

# isnull()

ser = pd.Series([1, np.nan, 'hello', None])
ser.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [9]:
# notnull()

ser.notnull()


0     True
1    False
2     True
3    False
dtype: bool

In [10]:
"""
Dropping null values
"""

# dropna()
ser.dropna()

0        1
2    hello
dtype: object

In [11]:
# For a DataFrame, there are more options

df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [12]:
# df.dropna(): list-wise deletion
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [13]:
# df.dropna(axis='columns'): variable deletion
df.dropna(axis='columns')


Unnamed: 0,2
0,2
1,5
2,6


In [14]:
# how/thresh parameters
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [15]:
# how='any' (default)
# how='all' which will only drop rows/columns that are all null values
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [16]:
# thresh = minimum number of non-null values to be kept
df.dropna(axis='rows', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [17]:
"""
Filling Null Values
"""

ser = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
ser

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [18]:
# Fill null values with a certain value
ser.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [19]:
# Forward-fill = LOCF
ser.fillna(method='ffill') # equals to "ser.ffill()"

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [20]:
# backward-fill = NOCB
ser.fillna(method='bfill') # equals to "ser.bfill()"

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [51]:
# bfill with rows
df.fillna(method='bfill', axis='rows')


from pandas import datetime
from matplotlib import pyplot as plt

  from pandas import datetime


In [40]:
"""
Load AirQualityUCI Data
"""

def parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

input_file = './data/AirQualityUCI_refined.csv'

df = pd.read_csv(input_file,
                 index_col=[0],
                 parse_dates=[0],
                 date_parser=parser)

FileNotFoundError: [Errno 2] No such file or directory: './data/AirQualityUCI_refined.csv'

In [41]:
# Print the summary of the dataset

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       2 non-null      float64
 1   1       2 non-null      float64
 2   2       3 non-null      int64  
 3   3       0 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 224.0 bytes


In [42]:
# Visualization setup
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set()  # set plot styles
%config InlineBackend.figure_format = 'svg'

In [43]:
# Visualize the series of CO(GT)

df['CO(GT)'].plot()

KeyError: 'CO(GT)'

In [44]:
# imputation

imp_locf = df['CO(GT)'].copy().ffill()
imp_nocb = df['CO(GT)'].copy().bfill()
imp_linear = df['CO(GT)'].copy().interpolate()
imp_mean = df['CO(GT)'].copy().fillna(df['CO(GT)'].mean())

KeyError: 'CO(GT)'

In [45]:
# k-nn imputation

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)    # default: 2
imp_knn = df.copy().values
imp_knn = imputer.fit_transform(imp_knn)

In [46]:
# add indices to the imputed result of k-nn

imp_df = pd.DataFrame(imp_knn, index=imp_locf.index, columns=df.columns)


NameError: name 'imp_locf' is not defined

In [47]:
# Visualizing the imputed results

plt.plot(df['CO(GT)'], label='actual', zorder=10)
plt.plot(imp_locf, label='locf', zorder=1)
plt.plot(imp_nocb, label='nocb', zorder=2)
plt.plot(imp_linear, label='linear interpolation', zorder=3)
plt.plot(imp_mean, label='mean substitution', zorder=4)
plt.plot(imp_df['CO(GT)'], label='k-nearest neighbor', zorder=5)
plt.legend(loc='best')
plt.show()

KeyError: 'CO(GT)'

In [48]:
# Select the certain period to visualize

start = '2004-07-18'
end = '2004-10-20'

In [50]:
# Visualize 2004-07 ~ 2004-10

plt.plot(df['CO(GT)'].loc[start:end], label='actual', zorder=10)
plt.plot(imp_locf.loc[start:end], label='locf', zorder=1)
plt.plot(imp_nocb.loc[start:end], label='nocb', zorder=2)
plt.plot(imp_linear.loc[start:end], label='linear interpolation', zorder=3)
plt.plot(imp_mean.loc[start:end], label='mean substitution', zorder=4)
plt.plot(imp_df['CO(GT)'].loc[start:end], label='k-nearest neighbor', zorder=5)
plt.legend(loc='best')
plt.show()

KeyError: 'CO(GT)'