## Sometimes a DataFrame has missing data. You can handle it with: fillna => fill missing values with other values
## interpolate => estimate missing values using interpolation
## dropna => drop rows with missing values


In [31]:
import pandas as pd
# Load the raw weather observations; no date parsing is requested here.
df = pd.read_csv(filepath_or_buffer="weather_data.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,01-01-17,32.0,6 mph,rain
1,01-04-17,,7,sunny
2,01-05-17,28.0,,snow
3,01-06-17,,7 mph,
4,01-07-17,32.0,,rain
5,01-08-17,31.0,2,sunny
6,01-09-17,34.0,5,
7,,,,
8,01-11-17,40.0,12,sunny


### Here the `day` column holds strings because the data comes from a CSV file, not an Excel file. So first, convert the column from string to date.

In [7]:
# Inspect the Python type of the first value in the `day` column.
type(df["day"][0])

str

### To convert it, pass parse_dates=['column_name'] when reading the CSV file

In [10]:
# Re-read the file, asking pandas to parse the `day` column as dates.
df = pd.read_csv(
    filepath_or_buffer="weather_data.csv",
    parse_dates=['day'],
)
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6 mph,rain
1,2017-01-04,,7,sunny
2,2017-01-05,28.0,,snow
3,2017-01-06,,7 mph,
4,2017-01-07,32.0,,rain
5,2017-01-08,31.0,2,sunny
6,2017-01-09,34.0,5,
7,2017-01-10,,8,cloudy
8,2017-01-11,40.0,12,sunny


In [12]:
# After parse_dates, the first `day` value is a pandas Timestamp.
type(df["day"][0])

pandas._libs.tslibs.timestamps.Timestamp

## To make the day column the index, use set_index('colname', inplace=True)
## Without inplace=True, the original df is not modified; a new df is returned instead

In [33]:
# Promote `day` to the index, mutating `df` itself (inplace=True returns None).
df.set_index(keys="day", inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01-01-17,32.0,6 mph,rain
01-04-17,,7,sunny
01-05-17,28.0,,snow
01-06-17,,7 mph,
01-07-17,32.0,,rain
01-08-17,31.0,2,sunny
01-09-17,34.0,5,
,,,
01-11-17,40.0,12,sunny


## Now replacing NaN values with some other values
## fillna

In [14]:
# Replace every NaN, in every column, with the same constant 0.
df1 = df.fillna(value=0)
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6 mph,rain
2017-01-04,0.0,7,sunny
2017-01-05,28.0,0,snow
2017-01-06,0.0,7 mph,0
2017-01-07,32.0,0,rain
2017-01-08,31.0,2,sunny
2017-01-09,34.0,5,0
2017-01-10,0.0,8,cloudy
2017-01-11,40.0,12,sunny


In [16]:
# A dict maps each column name to its own replacement value for NaN.
df1 = df.fillna(value={'temperature': 0, 'windspeed': 0, 'event': 'no event'})
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6 mph,rain
2017-01-04,0.0,7,sunny
2017-01-05,28.0,0,snow
2017-01-06,0.0,7 mph,no event
2017-01-07,32.0,0,rain
2017-01-08,31.0,2,sunny
2017-01-09,34.0,5,no event
2017-01-10,0.0,8,cloudy
2017-01-11,40.0,12,sunny


## When calculating a mean, filled-in 0 values can skew the result significantly. To avoid that, fill each blank with the previous row's value for that column using forward fill (ffill).

In [17]:
# ffill => forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") is deprecated
# in pandas >= 2.1 and emits a FutureWarning.
df1 = df.ffill()
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6 mph,rain
2017-01-04,32.0,7,sunny
2017-01-05,28.0,7,snow
2017-01-06,28.0,7 mph,snow
2017-01-07,32.0,7 mph,rain
2017-01-08,31.0,2,sunny
2017-01-09,34.0,5,sunny
2017-01-10,34.0,8,cloudy
2017-01-11,40.0,12,sunny


In [20]:
df  # display the original frame: its NaNs are still present because the fill result was assigned to df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6 mph,rain
2017-01-04,,7,sunny
2017-01-05,28.0,,snow
2017-01-06,,7 mph,
2017-01-07,32.0,,rain
2017-01-08,31.0,2,sunny
2017-01-09,34.0,5,
2017-01-10,,8,cloudy
2017-01-11,40.0,12,sunny


**Note: the `axis` argument was not working at the time of writing.
Accepted values: 0 or index,
1 or columns**

In [23]:
# bfill => backward fill: each NaN takes the next valid value below it in the same column.
# axis ==> axis along which to copy values, rows (0) or columns (1).
# DataFrame.bfill() is the supported spelling; fillna(method="bfill") is deprecated
# in pandas >= 2.1 and emits a FutureWarning.
df1 = df.bfill(axis=0)
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6 mph,rain
2017-01-04,28.0,7,sunny
2017-01-05,28.0,7 mph,snow
2017-01-06,32.0,7 mph,rain
2017-01-07,32.0,2,rain
2017-01-08,31.0,2,sunny
2017-01-09,34.0,5,cloudy
2017-01-10,40.0,8,cloudy
2017-01-11,40.0,12,sunny


In [24]:
df  # display the original frame again: the backward fill was stored in df1, so df still has its NaNs

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6 mph,rain
2017-01-04,,7,sunny
2017-01-05,28.0,,snow
2017-01-06,,7 mph,
2017-01-07,32.0,,rain
2017-01-08,31.0,2,sunny
2017-01-09,34.0,5,
2017-01-10,,8,cloudy
2017-01-11,40.0,12,sunny


## Here the temperature on the day between the 32 and 28 readings was probably somewhere between 32 and 28. interpolate() estimates such values.

In [25]:
# Linear interpolation (the default) ignores the index and treats rows as equally spaced.
# Only the numeric `temperature` column is interpolated: calling interpolate() on the whole
# frame also touches the object-dtype columns (`windspeed`, `event`), which is deprecated
# in pandas >= 2.1. Those columns are left unchanged, as they were before.
df1 = df.assign(temperature=df["temperature"].interpolate())
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6 mph,rain
2017-01-04,30.0,7,sunny
2017-01-05,28.0,,snow
2017-01-06,30.0,7 mph,
2017-01-07,32.0,,rain
2017-01-08,31.0,2,sunny
2017-01-09,34.0,5,
2017-01-10,37.0,8,cloudy
2017-01-11,40.0,12,sunny


## Notice that Jan 2 and Jan 3 are missing, so the temperature on Jan 4 should be closer to Jan 5's value than to the midpoint of Jan 1 and Jan 5. Use method="time" to interpolate according to the time index.

In [26]:
# method="time" weights the estimate by the gaps between the dates in the index,
# so it needs `day` to be a parsed datetime index (not plain strings).
# Only the numeric `temperature` column is interpolated: calling interpolate() on the whole
# frame also touches the object-dtype columns (`windspeed`, `event`), which is deprecated
# in pandas >= 2.1. Those columns are left unchanged, as they were before.
df1 = df.assign(temperature=df["temperature"].interpolate(method="time"))
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6 mph,rain
2017-01-04,29.0,7,sunny
2017-01-05,28.0,,snow
2017-01-06,30.0,7 mph,
2017-01-07,32.0,,rain
2017-01-08,31.0,2,sunny
2017-01-09,34.0,5,
2017-01-10,37.0,8,cloudy
2017-01-11,40.0,12,sunny


In [27]:
# Keep only the rows that have no NaN in any column (how="any" is the default).
df1 = df.dropna(how="any")
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6 mph,rain
2017-01-08,31.0,2,sunny
2017-01-11,40.0,12,sunny


In [34]:
# Drop a row only when every one of its columns is NaN.
df1 = df.dropna(axis="index", how="all")
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01-01-17,32.0,6 mph,rain
01-04-17,,7,sunny
01-05-17,28.0,,snow
01-06-17,,7 mph,
01-07-17,32.0,,rain
01-08-17,31.0,2,sunny
01-09-17,34.0,5,
01-11-17,40.0,12,sunny
