<a href="https://colab.research.google.com/github/dajebbar/FreeCodeCamp-python-data-analysis/blob/main/simpleImputer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install opendatasets --upgrade --quiet

In [2]:
import opendatasets as od
import os

od.version()
url = 'https://www.kaggle.com/jsphyg/weather-dataset-rattle-package'
od.download(url)


data_dir = 'weather-dataset-rattle-package'

os.listdir(data_dir)
train_csv = data_dir + '/weatherAUS.csv'

Skipping, found downloaded files in "./weather-dataset-rattle-package" (use force=True to force download)


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings("ignore")

plt.style.use('fivethirtyeight')
%matplotlib inline

In [4]:
raw_df = pd.read_csv(train_csv)

categorical_column = ['Location','WindGustDir', 'WindDir9am', 
                      'WindDir3pm',  'RainToday']
numerical_column = ['MinTemp', 'MaxTemp', 'Sunshine', 'WindSpeed9am', 
                    'Pressure3pm', 'Temp9am', 'Temp3pm', 'Cloud3pm' ]

target_column = 'RainTomorrow'
year = pd.to_datetime(raw_df.Date).dt.year
date = 'Date'

df = raw_df[categorical_column + numerical_column + [target_column]]
df.head()

Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday,MinTemp,MaxTemp,Sunshine,WindSpeed9am,Pressure3pm,Temp9am,Temp3pm,Cloud3pm,RainTomorrow
0,Albury,W,W,WNW,No,13.4,22.9,,20.0,1007.1,16.9,21.8,,No
1,Albury,WNW,NNW,WSW,No,7.4,25.1,,4.0,1007.8,17.2,24.3,,No
2,Albury,WSW,W,WSW,No,12.9,25.7,,19.0,1008.7,21.0,23.2,2.0,No
3,Albury,NE,SE,E,No,9.2,28.0,,11.0,1012.8,18.1,26.5,,No
4,Albury,W,ENE,NW,No,17.5,32.3,,7.0,1006.0,17.8,29.7,8.0,No


## Dealing with Nan's

In [5]:
df[numerical_column].isna().sum()

MinTemp          1485
MaxTemp          1261
Sunshine        69835
WindSpeed9am     1767
Pressure3pm     15028
Temp9am          1767
Temp3pm          3609
Cloud3pm        59358
dtype: int64

In [6]:
df[categorical_column].isna().sum()

Location           0
WindGustDir    10326
WindDir9am     10566
WindDir3pm      4228
RainToday       3261
dtype: int64

In [7]:
df[target_column].isna().any()

True

In [8]:
df2 = df.copy()

In [9]:
df2.dropna(subset=[target_column, 'RainToday'], inplace=True)

In [10]:
df2[target_column].isna().sum()

0

### Imputing Missing Numeric Data

In [11]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
# impute the original dataframe
imputer.fit(df2[numerical_column])

SimpleImputer()

In [12]:
df2[numerical_column].isna().sum()

MinTemp           468
MaxTemp           307
Sunshine        66805
WindSpeed9am     1055
Pressure3pm     13769
Temp9am           656
Temp3pm          2624
Cloud3pm        56094
dtype: int64

In [13]:
df2[numerical_column].sample(15)

Unnamed: 0,MinTemp,MaxTemp,Sunshine,WindSpeed9am,Pressure3pm,Temp9am,Temp3pm,Cloud3pm
96701,7.0,16.0,4.6,13.0,1017.5,10.9,15.8,
122564,16.3,39.1,12.6,15.0,1008.3,29.0,37.9,2.0
10127,21.3,26.5,6.4,9.0,1014.2,24.6,24.8,7.0
125089,-1.5,20.0,,7.0,,10.2,19.4,
14953,19.7,35.8,,24.0,1015.9,23.6,33.7,
62551,8.9,33.0,6.7,2.0,1008.3,21.8,30.6,8.0
72465,13.2,34.8,11.1,11.0,1007.8,20.7,31.8,5.0
80213,10.0,18.6,7.6,0.0,1017.7,11.4,18.1,3.0
55774,12.1,20.1,,39.0,1017.8,13.2,19.3,1.0
89161,13.8,26.3,9.1,20.0,1015.2,20.7,24.1,5.0


In [14]:
for col in numerical_column:
  df2[col].fillna(df2[col].mean(), inplace=True)

df2[numerical_column].sample(15)

Unnamed: 0,MinTemp,MaxTemp,Sunshine,WindSpeed9am,Pressure3pm,Temp9am,Temp3pm,Cloud3pm
72496,22.6,42.3,13.1,19.0,1009.1,30.4,41.8,1.0
103836,6.6,13.5,7.6,9.0,1024.1,8.6,12.9,5.0
116937,8.4,17.7,8.2,19.0,1030.2,12.3,17.3,5.0
138412,-1.0,17.4,10.6,13.0,1028.0,7.4,16.3,0.0
106822,3.9,15.5,7.63054,17.0,1019.2,7.7,13.5,2.0
84606,20.8,29.3,5.8,9.0,1017.1,25.1,27.3,7.0
117993,15.2,27.6,12.1,28.0,1013.5,21.0,25.3,0.0
63900,5.6,22.7,7.63054,11.0,1018.4,12.8,20.9,1.0
123573,8.9,26.2,11.8,9.0,1021.5,17.0,25.5,0.0
88966,19.9,32.0,12.1,9.0,1008.9,28.6,30.8,1.0


In [15]:
df2[numerical_column].isna().sum()

MinTemp         0
MaxTemp         0
Sunshine        0
WindSpeed9am    0
Pressure3pm     0
Temp9am         0
Temp3pm         0
Cloud3pm        0
dtype: int64

## Encoding Categorical data

In [16]:
df2[categorical_column].isna().sum()

Location          0
WindGustDir    9163
WindDir9am     9660
WindDir3pm     3670
RainToday         0
dtype: int64

In [17]:
df2[categorical_column].fillna('Unknown', inplace=True)
df2[categorical_column].isna().sum()

Location          0
WindGustDir    9163
WindDir9am     9660
WindDir3pm     3670
RainToday         0
dtype: int64

In [18]:
df2[categorical_column].sample(15)

Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday
13038,Moree,W,W,W,Yes
8124,Cobar,SE,N,NNW,No
58361,Bendigo,,SE,NE,No
27289,Richmond,ESE,NE,,No
23702,NorfolkIsland,SE,ESE,ESE,Yes
87295,Cairns,SSE,SSE,SSE,Yes
52781,MountGinini,N,SSW,NNW,No
140826,Darwin,SE,ESE,ENE,No
94834,Townsville,ENE,S,ENE,No
57293,Ballarat,WSW,NNE,N,No


In [19]:
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
)
from sklearn.compose import ColumnTransformer

num_transformer = MinMaxScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

transformer = ColumnTransformer([
                                 ('one-hot', 
                                  cat_transformer, 
                                  categorical_column),
                                 ('min-max-scaler', 
                                  num_transformer, 
                                  numerical_column)
])

In [20]:
df2.head(2)

Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday,MinTemp,MaxTemp,Sunshine,WindSpeed9am,Pressure3pm,Temp9am,Temp3pm,Cloud3pm,RainTomorrow
0,Albury,W,W,WNW,No,13.4,22.9,7.63054,20.0,1007.1,16.9,21.8,4.49925,No
1,Albury,WNW,NNW,WSW,No,7.4,25.1,7.63054,4.0,1007.8,17.2,24.3,4.49925,No


In [21]:
X_train = df2[year < 2015]
X_test = df2[year >= 2015]
y_train = df2[year < 2015][target_column]
y_test = df2[year >= 2015][target_column]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((97988, 14), (42799, 14), (97988,), (42799,))

In [22]:
X_train[numerical_column].isna().sum()

MinTemp         0
MaxTemp         0
Sunshine        0
WindSpeed9am    0
Pressure3pm     0
Temp9am         0
Temp3pm         0
Cloud3pm        0
dtype: int64

In [23]:
X_test[numerical_column].isna().sum()

MinTemp         0
MaxTemp         0
Sunshine        0
WindSpeed9am    0
Pressure3pm     0
Temp9am         0
Temp3pm         0
Cloud3pm        0
dtype: int64