Project #3: Machine Learning Based Solar Power Prediction

Authors: Garyu Liu, Chandramohan Bettadpurra

The purpose of this notebook is to:
  1. Impute the missing values in each feature column of the uploaded CSV file, using the column mean
  2. Download the imputed dataset as a new CSV file

In [0]:
# import necessary libraries
import pandas as pd
import numpy as np
import os
import io
from __future__ import absolute_import, division, print_function
from sklearn.impute import SimpleImputer
from google.colab import files 

In [0]:
# upload the raw dataset and read it into a DataFrame
RAW_CSV_NAME = 'Christchurch_Aero_4843_Raw.csv'

# files.upload() returns a dict keyed by file name; keep it under its own name
# so that `data` only ever refers to the DataFrame.
uploaded = files.upload()
if RAW_CSV_NAME in uploaded:
    raw_bytes = uploaded[RAW_CSV_NAME]
elif len(uploaded) == 1:
    # Colab may store a re-uploaded file under a suffixed name
    # (e.g. "... (1).csv"), so fall back to the single file that was uploaded.
    raw_bytes = next(iter(uploaded.values()))
else:
    raise KeyError(
        "Expected an upload named %r, got: %s" % (RAW_CSV_NAME, sorted(uploaded))
    )

# '-' marks a missing reading in the raw file, so parse it as NaN
import_data = pd.read_csv(io.BytesIO(raw_bytes), na_values='-')

# parse 'Day_Local_Date' into datetime values instead of leaving it as text
import_data['Day_Local_Date'] = pd.to_datetime(import_data['Day_Local_Date'])

# `data` is the name used by the rest of the notebook; it refers to the same
# DataFrame object as `import_data` (no copy is made).
data = import_data


Saving Christchurch_Aero_4843_Raw.csv to Christchurch_Aero_4843_Raw.csv


In [0]:
# count the NaN entries column by column, before any imputation is applied
data.isna().sum(axis=0)

Day_Local_Date            0
WDir_Deg                  0
WSpd_m/s                  0
GustDir_Deg              13
GustSpd_m/s              10
WindRun_Km                2
Rain_mm                   4
Tdry_C                    0
TWet_C                    3
RH_%                      3
Tmax_C                    2
Tmin_C                    0
Tgmin_C                  20
ET10_C                   12
Pmsl_hPa                  0
Sun_Hrs                  20
Rad_MJ/m2                 7
Evaporation_Amount_mm     1
dtype: int64

In [0]:
# Define 1 common strategy to impute missing values in all columns
def imputingStrategy(strategy='mean'):
  """Create a new, unfitted imputer that replaces NaN values.

  Args:
    strategy: name of the SimpleImputer strategy used to compute the fill
      value. Defaults to 'mean', which is what every column in this
      notebook uses.

  Returns:
    A fresh SimpleImputer instance; each call returns an independent object,
    so every column can be fitted separately.
  """
  return SimpleImputer(missing_values=np.nan, strategy=strategy)

In [0]:
# create one independent imputer per feature column
# (17 targets on the left, so exactly 17 imputers are generated on the right)
(
    imputer_GustDir_Deg,
    imputer_GustSpd,
    imputer_Rain_mm,
    imputer_TWet_C,
    imputer_RH,
    imputer_Tmax_C,
    imputer_Tmin_C,
    imputer_Tgmin_C,
    imputer_ET10_C,
    imputer_Rad,
    imputer_Evaporation_Amount_mm,
    imputer_WDir_Deg,
    imputer_WSpd,
    imputer_WindRun_Km,
    imputer_Tdry_C,
    imputer_Pmsl_hPa,
    imputer_Sun_Hrs,
) = (imputingStrategy() for _ in range(17))

In [0]:
# fit each imputer on its own column so it learns that column's fill value
# NOTE(review): GustDir_Deg and WDir_Deg look like compass angles in degrees;
# an arithmetic mean may not be meaningful for circular data - confirm.
fit_plan = [
    (imputer_GustDir_Deg, 'GustDir_Deg'),
    (imputer_GustSpd, 'GustSpd_m/s'),
    (imputer_Rain_mm, 'Rain_mm'),
    (imputer_TWet_C, 'TWet_C'),
    (imputer_RH, 'RH_%'),
    (imputer_Tmax_C, 'Tmax_C'),
    (imputer_Tmin_C, 'Tmin_C'),
    (imputer_Tgmin_C, 'Tgmin_C'),
    (imputer_ET10_C, 'ET10_C'),
    (imputer_Rad, 'Rad_MJ/m2'),
    (imputer_Evaporation_Amount_mm, 'Evaporation_Amount_mm'),
    (imputer_WDir_Deg, 'WDir_Deg'),
    (imputer_WSpd, 'WSpd_m/s'),
    (imputer_WindRun_Km, 'WindRun_Km'),
    (imputer_Tdry_C, 'Tdry_C'),
    (imputer_Pmsl_hPa, 'Pmsl_hPa'),
    (imputer_Sun_Hrs, 'Sun_Hrs'),
]
for column_imputer, column_name in fit_plan:
    # fit() returns the estimator itself, so the imputer_* names keep pointing
    # at the same (now fitted) objects without reassigning them.
    # The double brackets select a one-column DataFrame (2-D input).
    column_imputer.fit(data[[column_name]])

In [0]:
# overwrite every feature column with its imputed version
transform_plan = {
    'GustDir_Deg': imputer_GustDir_Deg,
    'GustSpd_m/s': imputer_GustSpd,
    'Rain_mm': imputer_Rain_mm,
    'TWet_C': imputer_TWet_C,
    'RH_%': imputer_RH,
    'Tmax_C': imputer_Tmax_C,
    'Tmin_C': imputer_Tmin_C,
    'Tgmin_C': imputer_Tgmin_C,
    'ET10_C': imputer_ET10_C,
    'Rad_MJ/m2': imputer_Rad,
    'Evaporation_Amount_mm': imputer_Evaporation_Amount_mm,
    'WDir_Deg': imputer_WDir_Deg,
    'WSpd_m/s': imputer_WSpd,
    'WindRun_Km': imputer_WindRun_Km,
    'Tdry_C': imputer_Tdry_C,
    'Pmsl_hPa': imputer_Pmsl_hPa,
    'Sun_Hrs': imputer_Sun_Hrs,
}
for column_name, fitted_imputer in transform_plan.items():
    # transform() is given a one-column frame and returns a 2-D array;
    # ravel() flattens it so it can be assigned back as a single column.
    imputed_values = fitted_imputer.transform(data[[column_name]])
    data[column_name] = imputed_values.ravel()

In [0]:
# re-count the NaN entries per column; every count should now be zero
data.isna().sum(axis=0)

Day_Local_Date           0
WDir_Deg                 0
WSpd_m/s                 0
GustDir_Deg              0
GustSpd_m/s              0
WindRun_Km               0
Rain_mm                  0
Tdry_C                   0
TWet_C                   0
RH_%                     0
Tmax_C                   0
Tmin_C                   0
Tgmin_C                  0
ET10_C                   0
Pmsl_hPa                 0
Sun_Hrs                  0
Rad_MJ/m2                0
Evaporation_Amount_mm    0
dtype: int64

In [0]:
# write the imputed dataset to a csv file and download it
IMPUTED_CSV_NAME = 'Christchurch_Aero_4843_Imputed.csv'

# `index` is a boolean flag. The original passed the string 'Datetime', which
# is merely truthy and does not name the index column; index=True writes the
# same file (the row index as an unnamed first column).
# to_csv() returns None when it is given a path, so its result is not kept.
data.to_csv(IMPUTED_CSV_NAME, index=True, header=True)
files.download(IMPUTED_CSV_NAME)