# 5: Handle Missing Data: fillna, dropna, interpolate

> 1. fillna to fill missing values in different ways
> 2. interpolate to estimate missing values using interpolation
> 3. dropna to drop rows with missing values

In [9]:
import pandas as pd
# Read the weather CSV; parse_dates turns the EST column into Timestamps
# instead of leaving it as plain strings.
df = pd.read_csv(
    'nyc_weather.csv',
    parse_dates=['EST'],
)
# Preview the first three rows.
df.head(n=3)

Unnamed: 0,EST,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
0,2016-01-01,38,23,52,30.03,10,8.0,0,5,,281
1,2016-01-02,36,18,46,30.02,10,7.0,0,3,,275
2,2016-01-03,40,21,47,29.86,10,8.0,0,1,,277


In [10]:
# Inspect the Python type of the EST value in the row labelled 0
# (single .loc lookup instead of chained df[...][...] indexing).
type(df.loc[0, 'EST'])

pandas._libs.tslibs.timestamps.Timestamp

The values are of type pandas._libs.tslibs.timestamps.Timestamp because the column was read using parse_dates=['EST']

#### Setting EST as index

In [11]:
# Promote the EST column to the index.
# Reassigning (instead of inplace=True) and guarding on the column's presence
# keeps this cell safe to re-run: once EST has become the index it is no
# longer a column, and an unguarded second set_index('EST') raises KeyError.
if 'EST' in df.columns:
    df = df.set_index('EST')
# Show the first row to confirm EST is now the index.
df.head(1)

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-01,38,23,52,30.03,10,8.0,0,5,,281


In [12]:
# Display the frame (a bare last expression is rendered as a rich table).
df

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-01,38,23,52,30.03,10,8.0,0,5,,281
2016-01-02,36,18,46,30.02,10,7.0,0,3,,275
2016-01-03,40,21,47,29.86,10,8.0,0,1,,277
2016-01-04,25,9,44,30.05,10,9.0,0,3,,345
2016-01-05,20,-3,41,30.57,10,5.0,0,0,,333
2016-01-06,33,4,35,30.5,10,4.0,0,0,,259
2016-01-07,39,11,33,30.28,10,2.0,0,3,,293
2016-01-08,39,29,64,30.2,10,4.0,0,8,,79
2016-01-09,44,38,77,30.16,9,8.0,T,8,Rain,76
2016-01-10,50,46,71,29.59,4,,1.8,7,Rain,109


In [14]:
# (number of rows, number of columns) of the frame.
df.shape

(31, 10)

***

## Use fillna() method in dataframe

In [15]:
# Replace every missing value, in every column, with the same scalar 0.
# Note this also puts 0 into text columns such as Events.
new_df_fillna = df.fillna(value=0)
new_df_fillna

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-01,38,23,52,30.03,10,8.0,0,5,0,281
2016-01-02,36,18,46,30.02,10,7.0,0,3,0,275
2016-01-03,40,21,47,29.86,10,8.0,0,1,0,277
2016-01-04,25,9,44,30.05,10,9.0,0,3,0,345
2016-01-05,20,-3,41,30.57,10,5.0,0,0,0,333
2016-01-06,33,4,35,30.5,10,4.0,0,0,0,259
2016-01-07,39,11,33,30.28,10,2.0,0,3,0,293
2016-01-08,39,29,64,30.2,10,4.0,0,8,0,79
2016-01-09,44,38,77,30.16,9,8.0,T,8,Rain,76
2016-01-10,50,46,71,29.59,4,0.0,1.8,7,Rain,109


> To use a different fill value for each column, pass a dictionary mapping column names to fill values into the fillna() function

In [16]:
# Per-column fill values: keys are column names, values are what to put in
# place of the missing entries of that column. Columns not listed are left
# untouched.
# WindSpeedMPH is a numeric column, so its fill value must be a number (0.1),
# not the string '0.1' -- a string would turn the whole column into object
# dtype and break later numeric operations on it.
new_df_fillna = df.fillna({
    'Events': 'No events',
    'WindSpeedMPH': 0.1,
})
new_df_fillna

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-01,38,23,52,30.03,10,8.0,0,5,No events,281
2016-01-02,36,18,46,30.02,10,7.0,0,3,No events,275
2016-01-03,40,21,47,29.86,10,8.0,0,1,No events,277
2016-01-04,25,9,44,30.05,10,9.0,0,3,No events,345
2016-01-05,20,-3,41,30.57,10,5.0,0,0,No events,333
2016-01-06,33,4,35,30.5,10,4.0,0,0,No events,259
2016-01-07,39,11,33,30.28,10,2.0,0,3,No events,293
2016-01-08,39,29,64,30.2,10,4.0,0,8,No events,79
2016-01-09,44,38,77,30.16,9,8.0,T,8,Rain,76
2016-01-10,50,46,71,29.59,4,0.1,1.8,7,Rain,109


## Use fillna(method="ffill") method in dataframe

> ffill means forward-fill: each missing value is filled with the last valid value before it in the same column (the previous value is carried forward)

In [17]:
# Forward-fill: carry the last valid value of each column down into the
# missing entries that follow it. Leading gaps (nothing valid before them)
# stay missing.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` argument
# is deprecated in pandas 2.1+.
new_df_ffill = df.ffill()
new_df_ffill

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-01,38,23,52,30.03,10,8.0,0,5,,281
2016-01-02,36,18,46,30.02,10,7.0,0,3,,275
2016-01-03,40,21,47,29.86,10,8.0,0,1,,277
2016-01-04,25,9,44,30.05,10,9.0,0,3,,345
2016-01-05,20,-3,41,30.57,10,5.0,0,0,,333
2016-01-06,33,4,35,30.5,10,4.0,0,0,,259
2016-01-07,39,11,33,30.28,10,2.0,0,3,,293
2016-01-08,39,29,64,30.2,10,4.0,0,8,,79
2016-01-09,44,38,77,30.16,9,8.0,T,8,Rain,76
2016-01-10,50,46,71,29.59,4,8.0,1.8,7,Rain,109


In [18]:
# Backward-fill: fill each missing entry with the next valid value found
# further down the same column.
# DataFrame.bfill() replaces fillna(method="bfill"), whose `method` argument
# is deprecated in pandas 2.1+.
new_df_bfill = df.bfill()
new_df_bfill

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-01,38,23,52,30.03,10,8.0,0,5,Rain,281
2016-01-02,36,18,46,30.02,10,7.0,0,3,Rain,275
2016-01-03,40,21,47,29.86,10,8.0,0,1,Rain,277
2016-01-04,25,9,44,30.05,10,9.0,0,3,Rain,345
2016-01-05,20,-3,41,30.57,10,5.0,0,0,Rain,333
2016-01-06,33,4,35,30.5,10,4.0,0,0,Rain,259
2016-01-07,39,11,33,30.28,10,2.0,0,3,Rain,293
2016-01-08,39,29,64,30.2,10,4.0,0,8,Rain,79
2016-01-09,44,38,77,30.16,9,8.0,T,8,Rain,76
2016-01-10,50,46,71,29.59,4,6.0,1.8,7,Rain,109


## "axis" parameter in fillna() method in dataframe

In [19]:
# Backward-fill along the columns axis: a missing cell takes the next valid
# value to its right in the same row (so Events is filled from
# WindDirDegrees here), rather than from the row below.
# DataFrame.bfill(axis=...) replaces fillna(method="bfill", axis=...), whose
# `method` argument is deprecated in pandas 2.1+.
new_df_bfill_axis = df.bfill(axis='columns')
new_df_bfill_axis

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-01,38,23,52,30.03,10,8.0,0,5,281,281
2016-01-02,36,18,46,30.02,10,7.0,0,3,275,275
2016-01-03,40,21,47,29.86,10,8.0,0,1,277,277
2016-01-04,25,9,44,30.05,10,9.0,0,3,345,345
2016-01-05,20,-3,41,30.57,10,5.0,0,0,333,333
2016-01-06,33,4,35,30.5,10,4.0,0,0,259,259
2016-01-07,39,11,33,30.28,10,2.0,0,3,293,293
2016-01-08,39,29,64,30.2,10,4.0,0,8,79,79
2016-01-09,44,38,77,30.16,9,8.0,T,8,Rain,76
2016-01-10,50,46,71,29.59,4,1.8,1.8,7,Rain,109


## "limit" parameter in fillna() method in dataframe

In [21]:
# Forward-fill, but fill at most `limit` consecutive missing values per gap;
# anything beyond the first two entries of a longer gap stays missing.
# DataFrame.ffill(limit=...) replaces fillna(method="ffill", limit=...),
# whose `method` argument is deprecated in pandas 2.1+.
new_df_ffill_limit = df.ffill(limit=2)
new_df_ffill_limit

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-01,38,23,52,30.03,10,8.0,0,5,,281
2016-01-02,36,18,46,30.02,10,7.0,0,3,,275
2016-01-03,40,21,47,29.86,10,8.0,0,1,,277
2016-01-04,25,9,44,30.05,10,9.0,0,3,,345
2016-01-05,20,-3,41,30.57,10,5.0,0,0,,333
2016-01-06,33,4,35,30.5,10,4.0,0,0,,259
2016-01-07,39,11,33,30.28,10,2.0,0,3,,293
2016-01-08,39,29,64,30.2,10,4.0,0,8,,79
2016-01-09,44,38,77,30.16,9,8.0,T,8,Rain,76
2016-01-10,50,46,71,29.59,4,8.0,1.8,7,Rain,109


## interpolate() to do interpolation in dataframe

In [22]:
# Linearly interpolate missing values, numeric columns only.
# The frame also holds object-dtype columns (e.g. Events, and PrecipitationIn
# with its 'T' entries); calling interpolate() on those is deprecated in
# pandas 2.1+, so they are copied through unchanged instead.
numeric_cols = df.select_dtypes(include='number').columns
df_interpolate_new = df.copy()
df_interpolate_new[numeric_cols] = df[numeric_cols].interpolate()
df_interpolate_new

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-01,38,23,52,30.03,10,8.0,0,5,,281
2016-01-02,36,18,46,30.02,10,7.0,0,3,,275
2016-01-03,40,21,47,29.86,10,8.0,0,1,,277
2016-01-04,25,9,44,30.05,10,9.0,0,3,,345
2016-01-05,20,-3,41,30.57,10,5.0,0,0,,333
2016-01-06,33,4,35,30.5,10,4.0,0,0,,259
2016-01-07,39,11,33,30.28,10,2.0,0,3,,293
2016-01-08,39,29,64,30.2,10,4.0,0,8,,79
2016-01-09,44,38,77,30.16,9,8.0,T,8,Rain,76
2016-01-10,50,46,71,29.59,4,7.333333,1.8,7,Rain,109


## dropna() method: drop all the rows which have "na" in dataframe

In [23]:
# Keep only the rows that have no missing value in any column
# (axis and how are the defaults, spelled out for clarity).
df_dropna = df.dropna(axis='index', how='any')
df_dropna

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-09,44,38,77,30.16,9,8.0,T,8,Rain,76
2016-01-16,47,37,70,29.52,8,7.0,0.24,7,Rain,340
2016-01-17,36,23,66,29.78,8,6.0,0.05,6,Fog-Snow,345
2016-01-18,25,6,53,29.83,9,12.0,T,2,Snow,293
2016-01-23,26,21,78,29.77,1,16.0,2.31,8,Fog-Snow,42
2016-01-24,28,11,53,29.92,8,6.0,T,3,Snow,327
2016-01-27,41,22,45,30.03,10,7.0,T,3,Rain,311


##  "thresh" parameter in dropna() method 

In [24]:
# thresh is the minimum number of non-missing values a row needs to be kept;
# with thresh=1 a row is dropped only when every one of its values is missing.
df_dropna_thresh = df.dropna(axis='index', thresh=1)
df_dropna_thresh

Unnamed: 0_level_0,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
EST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-01,38,23,52,30.03,10,8.0,0,5,,281
2016-01-02,36,18,46,30.02,10,7.0,0,3,,275
2016-01-03,40,21,47,29.86,10,8.0,0,1,,277
2016-01-04,25,9,44,30.05,10,9.0,0,3,,345
2016-01-05,20,-3,41,30.57,10,5.0,0,0,,333
2016-01-06,33,4,35,30.5,10,4.0,0,0,,259
2016-01-07,39,11,33,30.28,10,2.0,0,3,,293
2016-01-08,39,29,64,30.2,10,4.0,0,8,,79
2016-01-09,44,38,77,30.16,9,8.0,T,8,Rain,76
2016-01-10,50,46,71,29.59,4,,1.8,7,Rain,109


***