# Australian Weather Forecasting - Imputing Data

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import pymc3 as pm
import arviz as az
import theano
import theano.tensor as tt
import itertools
import matplotlib.pyplot as plt
import scipy.stats as scistat
import arviz as az
import warnings
warnings.filterwarnings('ignore')

## Loading Data

In [2]:
# Read the full data set and keep a random subset of 3000 rows.
# No random_state is passed, so the subset differs from run to run.
weather = pd.read_csv("../data/weatherAUS.csv").sample(n=3000)

### Separate numeric and categorical data

In [3]:
# Columns treated as categorical; every other column is treated as numeric.
categorical = [
    "Date",
    "Location",
    "WindGustDir",
    "WindDir9am",
    "WindDir3pm",
    "RainToday",
    "RainTomorrow",
]
# Keep the original column order while leaving out the categorical names.
col_list = [column for column in weather.columns if column not in categorical]
cat_data = weather[categorical]
numeric_data = weather[col_list]

In [4]:
# NOTE: this cell repeats the split performed in cell In [3]; re-running it
# rebuilds the same two frames from `weather`.
categorical = ["Date", "Location", "WindGustDir", "WindDir9am", "WindDir3pm",
               "RainToday", "RainTomorrow"]
# Boolean mask over the column index: True for every non-categorical column.
is_numeric = ~weather.columns.isin(categorical)
col_list = list(weather.columns[is_numeric])
numeric_data = weather[col_list]
cat_data = weather[categorical]

### Analyze Missing Data

In [5]:
def missing_data(df):
    """Summarize the share of missing values in each column of ``df``.

    Returns a DataFrame indexed by column label with two columns:
    ``column_name`` (the label again) and ``percent_missing`` (the
    percentage of rows in which that column is null).
    """
    null_counts = df.isna().sum()
    row_count = len(df)
    share = null_counts * 100 / row_count
    summary = pd.DataFrame(
        {
            'column_name': df.columns,
            'percent_missing': share,
        }
    )
    return summary
# Show the percentage of missing values for every column of the sampled data.
missing_data(weather)

Unnamed: 0,column_name,percent_missing
Date,Date,0.0
Location,Location,0.0
MinTemp,MinTemp,0.433333
MaxTemp,MaxTemp,0.066667
Rainfall,Rainfall,0.833333
Evaporation,Evaporation,41.4
Sunshine,Sunshine,46.533333
WindGustDir,WindGustDir,6.4
WindGustSpeed,WindGustSpeed,6.4
WindDir9am,WindDir9am,6.366667


## Impute Numeric data and standardization
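Drop the `Cloud9am`, `Cloud3pm`, `Evaporation`, and `Sunshine` columns, fill the remaining missing values with the mean of the observed values, and then standardize the result.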

In [6]:
# Remove four columns from the numeric features before imputing.
numeric_data = numeric_data.drop(['Cloud9am','Cloud3pm','Evaporation','Sunshine'],axis=1)
# Fill every remaining gap with the mean of its own column: DataFrame.mean()
# yields one mean per column and fillna() aligns that Series on the column
# labels. Note that apply(..., axis=1) would hand each *row* to the function,
# filling a gap with the average of that row's unrelated features instead.
numeric_data = numeric_data.fillna(numeric_data.mean())


In [7]:
# Standardize column by column: subtract the column mean, divide by the
# column standard deviation.
column_means = numeric_data.mean()
column_stds = numeric_data.std()
numeric_data = numeric_data.sub(column_means, axis='columns').div(column_stds, axis='columns')


## Impute Categorical Data
Fill missing values by sampling from the empirical distribution of the observed (non-missing) values in each column.

In [8]:
def impute_missing_categorical(column, size=None):
    """Draw replacement value(s) for one categorical column of ``cat_data``.

    Categories are sampled with probability equal to their relative
    frequency among the non-missing entries of ``cat_data[column]``.

    Parameters
    ----------
    column : label of a column in the module-level ``cat_data`` frame.
    size : int or None, optional
        Number of independent draws. ``None`` (the default) returns a single
        category; an integer returns an array with that many draws.

    Returns
    -------
    A single category, or a NumPy array of ``size`` categories.
    """
    # value_counts() skips missing entries, so the priors describe only the
    # observed data.
    counts = cat_data[column].value_counts()
    priors = counts / counts.sum()
    return np.random.choice(priors.index, size=size, p=priors.to_numpy())


def _fill_missing_categorical(series):
    """Return ``series`` with each missing entry replaced by its own draw."""
    missing = series.isnull()
    n_missing = int(missing.sum())
    if n_missing == 0:
        # Nothing to impute; skip the sampling entirely.
        return series
    filled = series.copy()
    # One independent draw per missing cell. Passing a single draw to fillna()
    # would put the same category into every gap of the column and distort the
    # distribution the imputation is meant to preserve.
    filled[missing] = impute_missing_categorical(series.name, size=n_missing)
    return filled


cat_data = cat_data.apply(_fill_missing_categorical, axis=0)
# Show the remaining percentage of missing values per categorical column.
missing_data(cat_data)

Unnamed: 0,column_name,percent_missing
Date,Date,0.0
Location,Location,0.0
WindGustDir,WindGustDir,0.0
WindDir9am,WindDir9am,0.0
WindDir3pm,WindDir3pm,0.0
RainToday,RainToday,0.0
RainTomorrow,RainTomorrow,0.0


In [10]:
# Put the imputed numeric and categorical columns side by side in one frame.
weather_new = pd.concat(
    [numeric_data, cat_data],
    axis="columns",
)
# Persist the combined frame.
weather_new.to_csv(path_or_buf='../data/weather_imputed.csv')
# Show the percentage of missing values per column of the combined frame.
missing_data(weather_new)

Unnamed: 0,column_name,percent_missing
MinTemp,MinTemp,0.0
MaxTemp,MaxTemp,0.0
Rainfall,Rainfall,0.0
WindGustSpeed,WindGustSpeed,0.0
WindSpeed9am,WindSpeed9am,0.0
WindSpeed3pm,WindSpeed3pm,0.0
Humidity9am,Humidity9am,0.0
Humidity3pm,Humidity3pm,0.0
Pressure9am,Pressure9am,0.0
Pressure3pm,Pressure3pm,0.0
