In [1]:
"""
None: Pythonic missing data
"""
import pandas as pd
import numpy as np

#None objects as missing values
arr1 = np.array([1, None,3,4])
arr1.dtype
arr2 = np.array([1,2,3,4])
arr2.dtype

dtype('int32')

In [2]:
 #Python objects are incompatible with numpy and pandas operations
arr2.sum()

10

In [3]:
"""
NaN: Missing Numerical Data
"""
arr3 = np.array([1,np.nan, 3, 4])
arr3.dtype
type(np.nan)

float

In [11]:
#Arithmetic with NaN will be another NaN
arr3.sum()

nan

In [12]:
#Special Numpy aggregation funcs that ignore these missing values
np.nansum(arr3)

8.0

In [13]:
#Pandas automatically converts the None to a NaN value
ser = pd.Series([1,np.nan, 2, None])
ser 

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [14]:
"""
Detecting null values
"""
ser = pd.Series([1,np.nan,'hello', None])
print(ser,'\n')
ser.notnull()

0        1
1      NaN
2    hello
3     None
dtype: object 



0     True
1    False
2     True
3    False
dtype: bool

In [15]:
"""
Dropping null values
"""
#dropna()
ser.dropna()

0        1
2    hello
dtype: object

In [16]:
#For a DataFrame, there are more options

df = pd.DataFrame([[1, np.nan,2],
                  [2, 3, 5],
                  [np.nan, 4, 6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [17]:
#df.dropna(): List-wise deletion
df.dropna()
# df.dropna(axis=0)

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [18]:
#df.dropna(axis='columns'): variable deletion
df.dropna(axis='columns')
# df.dropna(axis=1)

Unnamed: 0,2
0,2
1,5
2,6


In [19]:
#how/thresh parameters
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [20]:
#how='any' (default)
#how='all' which will only drop rows/columns that are all null values

# df.dropna(how='any')
# print(df.dropna(how='any', axis='columns'))

print(df.dropna(how='all', axis='columns'))

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6


In [21]:
#thresh = minmun number of non-null values to be kept
df.dropna(axis='rows', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [22]:
"""
Filling Null Values
"""
ser.fillna(0)

0        1
1        0
2    hello
3        0
dtype: object

In [23]:
#Forward-fill = LOCF

# ser.fillna(method='ffill')
ser.fillna(method='bfill')

0        1
1    hello
2    hello
3     None
dtype: object

In [24]:
ser.bfill()

0        1
1    hello
2    hello
3     None
dtype: object

In [25]:
from pandas import datetime
from matplotlib import pyplot as plt

"""
Load AirQualityUCI data
"""

def parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

input_file = './data/AirQualityUCI_refined.csv'

df = pd.read_csv(input_file,
                 index_col=[0],
                 parse_dates=[0],
                 date_parser=parser)

  from pandas import datetime


FileNotFoundError: [Errno 2] No such file or directory: './data/AirQualityUCI_refined.csv'

In [None]:
# Print the summary of the datest
df.info()

In [26]:
# Visualization setup
@matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set() #set plot styles
%config InlineBackend.figure_format = 'svg'

SyntaxError: invalid syntax (<ipython-input-26-d36967c2fd26>, line 2)

In [27]:
#Visualize the series of CO(GT)
df['CO(GT)'].plot()

KeyError: 'CO(GT)'

In [28]:
# imputation
imp_locf = df['CO(GT)'].copy().ffill() # LOCF
imp_nocb = df['CO(GT)'].copy().bfill() # NOCB
imp_linear = df['CO(GT)'].copy().interpolate() #linear interpolation
imp_mean = df['CO(GT)'].copy().fillna(df['CO(GT)'].mean())

KeyError: 'CO(GT)'

In [29]:
# k-nn imputation
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5) #default: 2
imp_knn = df.copy().values
imp_knn = imputer.fit_transform(imp_knn)


In [30]:
# add indices to the imputed result of k-nn
imp_df = pd.DAtaFrame(imp_knn, index=imp_locf.index, columns=df.columns)

AttributeError: module 'pandas' has no attribute 'DAtaFrame'

In [None]:
# Visualizing the imputed results
plt.plot(df['CO(GT)'], label='actual', zorder=10)
plt.plot(imp_locf, label='locf', zorder=1)
plt.plot(imp_nocb, label='nocb', zorder=2)
plt.plot(imp_linear, label='linear interpolation', zorder=3)
plt.plot(imp_mean, label = 'mean substitution', zorder=4)
plt.plot(imp_df['CO(GT)'], label='k-nearest neighbor', zorder=5)
plt.legend(loc='best')
plt.show()

In [None]:
# Selected the certain period to visualize

start = '2004-07-18'
end = '2004-10-20'


In [None]:
# Visualize 2004-07 ~ 2004-10
plt.plot(df['CO(GT)'], label='actual', zorder=10)
plt.plot(imp_locf, label='locf', zorder=1)
plt.plot(imp_nocb, label='nocb', zorder=2)
plt.plot(imp_linear, label='linear interpolation', zorder=3)
plt.plot(imp_mean, label = 'mean substitution', zorder=4)
plt.plot(imp_df['CO(GT)'], label='k-nearest neighbor', zorder=5)
plt.legend(loc='best')
plt.show()