# Australian Weather Forecasting - Imputing Data

## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Loading Data

In [2]:
# Read the raw CSV and keep only the first 3000 rows so the notebook runs quickly;
# drop the .head(3000) call to work on the full data.
weather = pd.read_csv("weatherAUS.csv").head(3000)
weather.shape[0]

3000

### Separate numeric and categorical columns

In [3]:
# Columns handled as categorical; every other column of `weather` is treated as numeric.
categorical = ["Date","Location","WindGustDir","WindDir9am","WindDir3pm","RainToday","RainTomorrow"]
col_list = [name for name in weather.columns if name not in categorical]

# Split the frame into its numeric and categorical halves.
numeric_data = weather[col_list]
cat_data = weather[categorical]

### Analyze Missing Data

In [4]:
def missing_data(df):
    """Summarise the share of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column labels of ``df``, with two columns:
        ``column_name`` (the label again) and ``percent_missing``
        (null count * 100 / number of rows).
    """
    null_counts = df.isnull().sum()
    n_rows = len(df)
    summary = pd.DataFrame(
        {
            "column_name": df.columns,
            "percent_missing": null_counts * 100 / n_rows,
        }
    )
    return summary
# Per-column percentage of missing values in the loaded data.
missing_data(df=weather)

Unnamed: 0,column_name,percent_missing
Date,Date,0.0
Location,Location,0.0
MinTemp,MinTemp,0.2
MaxTemp,MaxTemp,0.033333
Rainfall,Rainfall,0.666667
Evaporation,Evaporation,100.0
Sunshine,Sunshine,100.0
WindGustDir,WindGustDir,0.333333
WindGustSpeed,WindGustSpeed,0.333333
WindDir9am,WindDir9am,17.133333


## Impute Numeric data
Filling missing data with the mean of our observations

In [5]:
# Drop the columns that are not imputed (Evaporation and Sunshine are 100%
# missing in this sample, so they have no observed mean to fill with).
numeric_data = numeric_data.drop(columns=['Cloud9am','Cloud3pm','Evaporation','Sunshine'])

# Fill each remaining gap with the mean of its own column.
# The previous apply(..., axis=1) ran once per ROW, so a gap was filled with
# the average of that row's unrelated features (temperature, pressure, ...).
numeric_data = numeric_data.fillna(numeric_data.mean(numeric_only=True))

## Impute Categorical Data
Fill missing data based on the distribution of the observed (non-missing) data

In [6]:
def impute_missing_categorical(column, size=None):
    """Draw replacement value(s) for a categorical column of ``cat_data``.

    Values are sampled from the observed (non-missing) categories of
    ``cat_data[column]``, weighted by their relative frequency. Draws use
    NumPy's global random state; seed it beforehand for reproducible output.

    Parameters
    ----------
    column : str
        Name of the column in the module-level ``cat_data`` frame.
    size : int or None, optional
        Number of independent draws. ``None`` (the default) returns a single
        value.

    Returns
    -------
    A single category when ``size`` is None, otherwise a NumPy array holding
    ``size`` categories.
    """
    counts = cat_data[column].value_counts()  # NaN is excluded by default
    priors = counts / counts.sum()
    return np.random.choice(priors.index, size=size, p=priors.values)


def _fill_from_observed(series):
    """Replace every missing entry of ``series`` with its own random draw."""
    missing = series.isnull()
    n_missing = int(missing.sum())
    # Nothing to fill, or no observed values to sample from: leave untouched.
    if n_missing == 0 or n_missing == len(series):
        return series
    filled = series.copy()
    # One independent draw per gap. Passing a single draw to fillna() would
    # write the same category into every gap and distort the distribution.
    filled[missing] = impute_missing_categorical(series.name, size=n_missing)
    return filled


cat_data = cat_data.apply(_fill_from_observed, axis=0)
missing_data(cat_data)

Unnamed: 0,column_name,percent_missing
Date,Date,0.0
Location,Location,0.0
WindGustDir,WindGustDir,0.0
WindDir9am,WindDir9am,0.0
WindDir3pm,WindDir3pm,0.0
RainToday,RainToday,0.0
RainTomorrow,RainTomorrow,0.0


In [7]:
# Confirm that no gaps remain once the numeric and categorical halves are joined.
missing_data(pd.concat([numeric_data, cat_data], axis="columns"))

Unnamed: 0,column_name,percent_missing
MinTemp,MinTemp,0.0
MaxTemp,MaxTemp,0.0
Rainfall,Rainfall,0.0
WindGustSpeed,WindGustSpeed,0.0
WindSpeed9am,WindSpeed9am,0.0
WindSpeed3pm,WindSpeed3pm,0.0
Humidity9am,Humidity9am,0.0
Humidity3pm,Humidity3pm,0.0
Pressure9am,Pressure9am,0.0
Pressure3pm,Pressure3pm,0.0


### Combined dataset: `aussy_Rain`

In [12]:
# Join the imputed numeric and categorical columns side by side.
aussy_Rain = pd.concat(
    [numeric_data, cat_data],
    axis="columns",
)

In [13]:
# Preview the first five rows of the combined frame.
aussy_Rain.head(n=5)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RISK_MM,Date,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday,RainTomorrow
0,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,0.0,2008-12-01,Albury,W,W,WNW,No,No
1,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,0.0,2008-12-02,Albury,WNW,NNW,WSW,No,No
2,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,0.0,2008-12-03,Albury,WSW,W,WSW,No,No
3,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,1.0,2008-12-04,Albury,NE,SE,E,No,No
4,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,0.2,2008-12-05,Albury,W,ENE,NW,No,No


In [11]:
import pymc3 as pm
import arviz as az
import theano
import theano.tensor as tt
import itertools


### Convert our Date column into datetime object

In [15]:
# Parse the Date strings into pandas datetime values, replacing the column.
aussy_Rain["Date"] = pd.to_datetime(arg=aussy_Rain["Date"])

### Ordinal encoding for the other categorical columns

In [16]:
# Frequency of each wind-gust direction, most common first.
gust_direction = "WindGustDir"
aussy_Rain[gust_direction].value_counts()

W      501
WNW    384
SE     277
WSW    231
SSE    206
NW     166
N      165
NE     157
NNW    151
NNE    150
ENE    145
E      121
SW     120
ESE    112
S       59
SSW     55
Name: WindGustDir, dtype: int64

In [None]:
# The 16 compass points that occur in WindGustDir, listed clockwise from south.
_COMPASS_POINTS = [
    "S", "SSW", "SW", "WSW",
    "W", "WNW", "NW", "NNW",
    "N", "NNE", "NE", "ENE",
    "E", "ESE", "SE", "SSE",
]

# Column name -> {category: integer code}. Codes follow the clockwise order
# above (S=0, SSW=1, ..., SSE=15).
# NOTE: direction is circular, so code 15 (SSE) is adjacent to code 0 (S) even
# though the integers are far apart.
encoding = {
    "WindGustDir": {point: code for code, point in enumerate(_COMPASS_POINTS)},
}