Resource : http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html

In [1]:
import pandas as pd
# Record the pandas version that produced the outputs in this notebook.
print(pd.__version__)

0.22.0


In [2]:
# Load the weather data, using the first column (the date) as the index so
# the frame can later be treated as a time-series dataset.
df = pd.read_csv(
    'weather_data.csv',
    header=0,
    index_col=0,
)
# A few values are missing in the temperature, windspeed and event columns.
df.head()

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32.0,6.0,Rain
1/4/2017,,9.0,Sunny
1/5/2017,28.0,,Snow
1/6/2017,,7.0,
1/7/2017,32.0,,Rain


In [7]:
# Convert the index from US-style month/day/year strings (e.g. '1/4/2017')
# to datetimes, which pandas displays as yyyy-mm-dd. Passing the format
# explicitly removes any month/day ambiguity while parsing.
df.index = pd.to_datetime(df.index, format='%m/%d/%Y')

In [6]:
# Confirm that the index is now a DatetimeIndex rather than plain strings.
type(df.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [11]:
# Display the first rows again; the index now shows as yyyy-mm-dd dates.
df.head()

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain


In [8]:
# Convert the 'day' column from strings to datetimes and use it as the
# index, all in a single read_csv call.
df = pd.read_csv(
    'weather_data.csv',
    parse_dates=['day'],
    index_col=0,
)
# Alternatively, read the file without index_col and then call
# df.set_index('day', inplace=True) to make 'day' the index.
df.head()

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain


In [10]:
# Replace every missing value, in every column, with 0 using fillna.
new_df = df.fillna(value=0)
# Note that the missing entries of the event column are now 0 as well.
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [11]:
# A single fill value is not always useful (0 makes no sense in the event
# column), so give each column its own replacement value.
new_df_dict = df.fillna(
    value={'temperature': 0, 'windspeed': 0, 'event': 'No-event'}
)
new_df_dict

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,No-event
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,No-event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [20]:
# Replacing missing values with 0 is not always wise. An alternative is to
# carry values over from neighbouring rows: ffill() copies the previous
# valid value forward, bfill() copies the next valid value backward.
# ffill() is used rather than fillna(method='ffill') because the `method`
# keyword of fillna is deprecated in recent pandas releases.
new_df = df.ffill()
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [22]:
# Forward-fill along each row instead of down each column: a missing value
# is taken from the column to its left in the same row.
# ffill() replaces the deprecated fillna(method='ffill') spelling.
new_df = df.ffill(axis='columns')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,28.0,Snow
2017-01-06,,7.0,7
2017-01-07,32.0,32.0,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [23]:
# limit=1 fills at most one consecutive missing value after each valid one;
# the rest of a longer gap stays missing.
# ffill() replaces the deprecated fillna(method='ffill') spelling.
new_df = df.ffill(limit=1)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [24]:
# Fill the missing values by linear interpolation between the neighbouring
# valid values.
# Note: only numeric missing values can be replaced this way; the event
# column is left as it is.
new_df_interpolate = df.interpolate(method='linear')
new_df_interpolate

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [27]:
# Combine both strategies: interpolate the missing numeric values, then
# forward-fill the categorical event column.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
new_df_interpolate = df.interpolate()
new_df_interpolate['event'] = df['event'].ffill()
new_df_interpolate

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,Snow
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [60]:
# This is time-series data, so interpolate with method='time', which takes
# the length of the interval between index values into account.
# Compare the result with the earlier interpolation that used the linear
# method.
df_interpolation_time = df.interpolate(
    method='time',
)
df_interpolation_time

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,Snow
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


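Notice that with `method='time'` the temperature on 2017-01-04 is 29 instead of the 30 produced by plain linear interpolation, because the three-day gap between 2017-01-01 and 2017-01-04 is taken into account.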

There are many other interpolation methods, such as quadratic, piecewise_polynomial and cubic. Search for "dataframe interpolate" or see the pandas `DataFrame.interpolate` documentation for the complete list.

In [30]:
# Rows with missing values can simply be dropped when they are of no use.
# Print the full frame first, then show it with every incomplete row removed.
print(df)
df_drop = df.dropna(how='any')
df_drop

            temperature  windspeed   event
day                                       
2017-01-01         32.0        6.0    Rain
2017-01-04          NaN        9.0   Sunny
2017-01-05         28.0        NaN    Snow
2017-01-06          NaN        7.0    Snow
2017-01-07         32.0        NaN    Rain
2017-01-08          NaN        NaN   Sunny
2017-01-09          NaN        NaN   Sunny
2017-01-10         34.0        8.0  Cloudy
2017-01-11         40.0       12.0   Sunny


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [31]:
# The 'how' parameter controls when a row is dropped: with how='all' a row
# is removed only if every value in it is missing.
df_drop_how = df.dropna(
    how='all',
)
df_drop_how

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,Snow
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [38]:
# Drop rows based on a threshold for the number of non-missing values.
# The test file is loaded under its own name so that `df` (the
# weather_data.csv frame used by the other cells) is not overwritten.
df_test = pd.read_csv('weather_data_test.csv', index_col='day')
print(df_test)
# thresh=3 keeps only the rows that have at least three non-missing values;
# rows with fewer are dropped.
drop_threshold = df_test.dropna(thresh=3)
drop_threshold

         temperature  windspeed   event   test
day                                           
1/1/17          32.0        6.0    Rain    NaN
1/4/17           NaN        9.0   Sunny  value
1/5/17          28.0        NaN    Snow  value
1/6/17           NaN        7.0     NaN  value
1/7/17          32.0        NaN    Rain    NaN
1/8/17           NaN        NaN   Sunny  value
1/9/17           NaN        NaN     NaN  value
1/10/17         34.0        8.0  Cloudy    NaN
1/11/17         40.0       12.0   Sunny  value


Unnamed: 0_level_0,temperature,windspeed,event,test
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1/1/17,32.0,6.0,Rain,
1/4/17,,9.0,Sunny,value
1/5/17,28.0,,Snow,value
1/10/17,34.0,8.0,Cloudy,
1/11/17,40.0,12.0,Sunny,value


In [58]:
# Insert rows for the dates that are missing from the dataframe.
df_new = pd.read_csv('weather_data.csv', index_col='day')
# Parse this frame's own index. The previous code parsed df.index, which
# belongs to a different frame and only worked through leftover kernel state.
df_new.index = pd.to_datetime(df_new.index, format='%m/%d/%Y')
print(df_new.head())

# Build an index with one entry per calendar day in the covered range;
# reindexing onto it adds all-NaN rows for the dates absent from the file.
dt = pd.date_range(start='2017-01-01', end='2017-01-11', freq='D')
idx = pd.DatetimeIndex(dt)
df_new = df_new.reindex(idx)

            temperature  windspeed  event
day                                      
2017-01-01         32.0        6.0   Rain
2017-01-04          NaN        9.0  Sunny
2017-01-05         28.0        NaN   Snow
2017-01-06          NaN        7.0    NaN
2017-01-07         32.0        NaN   Rain


In [59]:
# Fill the gaps created by reindexing: time-based interpolation for the
# numeric columns first, then forward-fill for the event column.
df_new_interpol = df_new.interpolate(method='time')
print(df_new_interpol)

# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
df_new_interpol['event'] = df_new_interpol['event'].ffill()
df_new_interpol

            temperature  windspeed   event
2017-01-01    32.000000       6.00    Rain
2017-01-02    31.000000       7.00     NaN
2017-01-03    30.000000       8.00     NaN
2017-01-04    29.000000       9.00   Sunny
2017-01-05    28.000000       8.00    Snow
2017-01-06    30.000000       7.00     NaN
2017-01-07    32.000000       7.25    Rain
2017-01-08    32.666667       7.50   Sunny
2017-01-09    33.333333       7.75     NaN
2017-01-10    34.000000       8.00  Cloudy
2017-01-11    40.000000      12.00   Sunny


Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,31.0,7.0,Rain
2017-01-03,30.0,8.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,Snow
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,Sunny
2017-01-10,34.0,8.0,Cloudy
