In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")
# Multiple Imputation by Chained Equations
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
%matplotlib inline

In [2]:
# Read the raw weather table from disk; the file is decoded as CP949.
weather_df = pd.read_csv(filepath_or_buffer="real_weather.csv", encoding="cp949")
# Bare last expression so the notebook renders a preview of the frame.
weather_df

Unnamed: 0,location,Date,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindGustDir,Sunshine,Evaporation,TodayRain,...,Cloud9am,Humidity9am,WindSpeed9am,Temp9am,Pressure3pm,WindDir3pm,Cloud3pm,Humidity3pm,WindSpeed3pm,Temp3pm
0,속초,2012-05-05,12.6,23.2,0.5,6.4,NW,,,0,...,0.0,66.0,1.6,18.3,998.7,SE,6.0,51.0,4.2,19.3
1,속초,2012-05-06,10.6,24.1,,6.5,WSW,,,0,...,3.0,30.0,5.1,18.6,1000.8,E,2.0,35.0,2.7,20.5
2,속초,2012-05-07,14.2,25.1,,7.0,NW,,,0,...,0.0,26.0,3.1,22.2,1002.8,E,0.0,31.0,2.1,22.9
3,속초,2012-05-08,12.1,21.1,,6.1,NW,,,0,...,9.0,66.0,0.8,17.0,1005.5,ESE,3.0,70.0,2.8,18.3
4,속초,2012-05-09,11.9,15.4,,6.5,NW,,,0,...,8.0,90.0,3.8,14.7,1013.2,NNW,9.0,89.0,4.6,13.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343701,남해,2022-05-01,9.8,20.3,,4.8,WNW,,,0,...,4.0,48.0,1.2,14.4,1006.8,S,1.0,35.0,2.4,19.5
343702,남해,2022-05-02,7.9,21.5,,5.4,WNW,,,0,...,0.0,50.0,1.7,15.9,1006.5,WSW,7.0,42.0,2.2,20.6
343703,남해,2022-05-03,8.3,19.9,,3.6,SSW,,,0,...,2.0,66.0,1.2,13.0,1014.3,S,0.0,37.0,2.7,18.9
343704,남해,2022-05-04,8.2,22.7,,4.1,SSW,,,0,...,0.0,59.0,1.0,15.5,1014.0,SSW,0.0,39.0,2.6,21.3


In [3]:
# Per-column summary of missing data: absolute count and share of all rows.
null_mask = weather_df.isnull()  # computed once, reused for both statistics

missing_values = null_mask.sum()  # number of missing entries per column

percent_missing = missing_values / weather_df.shape[0] * 100  # same, as a % of rows

# Column labels are written without stray whitespace so that
# frame['missing_values'] can be looked up by its visible name.
value = {
    'missing_values': missing_values,
    'percent_missing %': percent_missing,
}
frame = pd.DataFrame(value)
frame

Unnamed: 0,missing_values,percent_missing %
location,0,0.0
Date,0,0.0
MinTemp,34,0.009892
MaxTemp,27,0.007856
Rainfall,217587,63.30614
WindGustSpeed,181,0.052661
WindGustDir,168,0.048879
Sunshine,194241,56.513706
Evaporation,222948,64.865903
TodayRain,0,0.0


In [4]:
# Rank the columns by how much data they are missing, worst first.
null_flags = weather_df.isnull()

# Absolute number of missing entries per column.
total = null_flags.sum().sort_values(ascending=False)

# Missing entries divided by the row count; note this is a fraction,
# not a value scaled to 0-100, despite the 'Percent' label.
percent = (null_flags.sum() / null_flags.count()).sort_values(ascending=False)

missing = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
missing.head(4)

Unnamed: 0,Total,Percent
Evaporation,222948,0.648659
Rainfall,217587,0.633061
Sunshine,194241,0.565137
Cloud3pm,145707,0.423929


In [5]:
 # Replace missing Rainfall readings with 0.
 # The result is assigned back to the column instead of calling
 # fillna(..., inplace=True) on weather_df['Rainfall']: that chained form
 # operates on a temporary Series and, under pandas Copy-on-Write, leaves
 # weather_df unchanged.
 weather_df['Rainfall'] = weather_df['Rainfall'].fillna(0)

In [6]:
# Names of the columns that pandas stores with the object dtype.
weather_df.select_dtypes(include='object').columns

Index(['location', 'Date', 'WindGustDir', 'WindDir9am', 'WindDir3pm'], dtype='object')

In [7]:
# Label-encode every object-dtype column, keeping one fitted encoder per
# column in `lencoders` so the integer codes can be mapped back to labels.
#
# Missing entries are deliberately left as NaN. Fitting LabelEncoder on a
# column that still contains NaN either fails (mixed str/float values) or,
# on newer scikit-learn releases, turns NaN into an ordinary class code --
# which would make those cells look observed and hide them from any later
# imputation step.
lencoders = {}
for col in weather_df.select_dtypes(include=['object']).columns:
    # Fit on the observed labels only.
    encoder = LabelEncoder().fit(weather_df[col].dropna())
    # A label's code is its position in the sorted `classes_` array, which is
    # the same value encoder.transform() would return for that label.
    code_of = {label: code for code, label in enumerate(encoder.classes_)}
    # Values absent from the mapping (i.e. NaN) stay NaN after .map().
    weather_df[col] = weather_df[col].map(code_of)
    lencoders[col] = encoder

In [8]:
# Multiple imputation by chained equations, using IterativeImputer's
# default settings.
mice_imputer = IterativeImputer()
imputed_values = mice_imputer.fit_transform(weather_df)

# Write the imputed matrix into a deep copy so that the name `weather_df`
# keeps referring to the pre-imputation frame.
MiceImputed = weather_df.copy(deep=True)
MiceImputed.iloc[:, :] = imputed_values

In [None]:
# Persist the imputed table as CSV; note the target name has no extension.
MiceImputed.to_csv(path_or_buf="MiceImputed_weather")