### Header

In [None]:
# import libraries

# maths
import numpy as np
import pandas as pd

# others
import os
import re
import time
import datetime as datetime

In [None]:
# file paths (all relative, so they resolve against the current working directory)

input_path = '../data/2_input/'    # weather.csv is read from here
clean_path = '../data/3_clean/'    # weather_clean.csv is written here
output_path = '../data/4_output/'  # not referenced by the cells visible in this notebook

image_path = '../images/'          # not referenced by the cells visible in this notebook

### Functions

In [None]:
# split dates

def create_yr(x):
    """Return the text before the first '-' in x.

    Presumably x is a 'YYYY-MM-DD' style date string, which makes this the
    year. The result is returned as a string, not converted to a number.
    """
    pieces = x.split('-')
    return pieces[0]

def create_mth(x):
    """Return the second '-'-separated field of x.

    Presumably x is a 'YYYY-MM-DD' style date string, which makes this the
    month. Raises IndexError when x contains no '-'.
    """
    pieces = x.split('-')
    return pieces[1]

def create_day(x):
    """Return the third '-'-separated field of x.

    Presumably x is a 'YYYY-MM-DD' style date string, which makes this the
    day. Raises IndexError when x has fewer than three fields.
    """
    pieces = x.split('-')
    return pieces[2]

def rename_columns(columns):
    """Return a new list holding the lower-cased form of every name in columns."""
    lowered = []
    for name in columns:
        lowered.append(name.lower())
    return lowered

def clean_date(df):
    """Add 'year', 'month' and 'day' columns derived from df.date.

    Each new column is produced by applying the matching create_* helper to
    every value of the date column. df is modified in place and also
    returned.
    """
    splitters = (
        ('year', create_yr),
        ('month', create_mth),
        ('day', create_day),
    )
    for new_col, splitter in splitters:
        df[new_col] = df.date.apply(splitter)

    return df

In [None]:
def count_t(x):
    """Return 1 when x is exactly '  T' (two leading spaces), otherwise 0."""
    return 1 if x == '  T' else 0
    
def count_m(x):
    """Return 1 when x is exactly 'M', otherwise 0."""
    return 1 if x == 'M' else 0

def count_dash(x):
    """Return 1 when x is exactly '-', otherwise 0."""
    return 1 if x == '-' else 0

In [None]:
# count total number of M - T in df

def print_summary(df):
    """Build a table of placeholder counts for the object-dtype columns of df.

    The returned DataFrame has the columns 'column', 'M', '-' and 'T': the
    column name followed by how many of its values equal 'M', '-' and '  T'
    (as judged by count_m, count_dash and count_t). Rows are labelled with
    the position of the column in df, so non-object columns leave gaps in
    the index. Despite the name, nothing is printed here.
    """
    cols = ['column', 'M', '-', 'T']
    df_summary = pd.DataFrame(columns=cols)
    counters = (count_m, count_dash, count_t)

    for idx, col in enumerate(df.columns):
        # only object columns can hold the text placeholders
        if df[col].dtype != 'object':
            continue

        df_summary.at[idx, cols[0]] = col
        for label, counter in zip(cols[1:], counters):
            df_summary.at[idx, label] = df[col].apply(counter).sum()

    return df_summary

### Import Data

In [None]:
# import weather data: read weather.csv from the input folder into df

df = pd.read_csv(input_path + 'weather.csv')

### Inspect Data

In [None]:
# print the (rows, columns) shape, then preview the first 5 records

print(df.shape)
df.head()

In [None]:
# list all column names as loaded (before they are lower-cased further down)

print(df.columns)

In [None]:
# df summary: descriptive statistics (count, mean, std, min, quartiles, max)

df.describe()

In [None]:
# show df information: column dtypes and non-null counts

df.info()

In [None]:
# Check for nulls in columns: count the nulls per column and show only the
# columns that have at least one, largest count first

null_cols = df.isna().sum(axis=0)
mask_null = null_cols.gt(0)
null_cols.loc[mask_null].sort_values(ascending=False)

In [None]:
# Check for nulls in rows: count the nulls per row and show only the rows
# that have at least one, largest count first

null_rows = df.isna().sum(axis=1)
mask_null = null_rows.gt(0)
null_rows.loc[mask_null].sort_values(ascending=False)

In [None]:
# lower-case every column name; later cells rely on the lower-case names (df.date, df.tavg, ...)
df.columns = rename_columns(df.columns)

In [None]:
# add year / month / day columns split out of the date column
df = clean_date(df)

In [None]:
# preview the frame after the column rename and the date split
df.head()

In [None]:
# check each column's dtype; object columns are the ones print_summary
# scans for the 'M', '-' and '  T' placeholders

df.dtypes

### Clean Data

In [None]:
# count how many 'M', '-' and '  T' placeholders each object column holds
# before any cleaning is done (baseline for the "after cleaning" summary)

print('before cleaning:')
df_summary = print_summary(df)
df_summary

In [None]:
# list the distinct values recorded in snowfall
df.snowfall.unique()

In [None]:
# list the distinct values recorded in depth
df.depth.unique()

In [None]:
# list the distinct values recorded in depart
df.depart.unique()

In [None]:
# Suggest dropping water1 since it is all missing values
# Suggest dropping snowfall since it is mostly 0 or missing
# Suggest dropping depth since it is either 0 or missing values
# Suggest dropping codesum
# date is dropped as well: clean_date has already split it into year / month / day
# NOTE: the drop is in place, so re-running this cell raises KeyError (columns already gone)

df.drop(columns = ['codesum','water1','snowfall','depth','date'], inplace = True)

In [None]:
# For sunset/sunrise and depart with missing (M) values:
# station 2 does not collect these values, so every second row is filled
# with the values of the row directly above it (its station 1 partner).
# NOTE(review): this assumes the rows strictly alternate station 1,
# station 2, and that positions 4, 9 and 10 are depart, sunrise and sunset
# once the columns above have been dropped - confirm against df.columns.

station1_only_cols = (4, 9, 10)

# Walk the rows in pairs and stop one row short of the end, so an unpaired
# trailing row (odd row count) can never push i + 1 past the last row.
for i in range(0, df.shape[0] - 1, 2):
    for col_pos in station1_only_cols:
        df.iloc[i + 1, col_pos] = df.iloc[i, col_pos]

In [None]:
def impute_missing_tavg(row):
    """Replace a missing ('M') tavg with the midpoint of tmin and tmax.

    row is a mapping-like record with 'tavg', 'tmax' and 'tmin' entries
    (one DataFrame row when used through df.apply(..., axis=1)). Rows whose
    tavg is not 'M' are returned untouched. The row is modified in place
    and returned.
    """
    if row['tavg'] != 'M':
        return row

    high, low = row['tmax'], row['tmin']
    row['tavg'] = (high - low) * 0.5 + low
    return row

# run the tavg imputation on every row, then store tavg as integers
# NOTE(review): imputed midpoints can be fractional (x.5) - confirm the
# int64 cast treats them the way you intend (rounding vs truncation)
df = df.apply(impute_missing_tavg, axis = 1)
df.tavg = df.tavg.astype('int64')

In [None]:
def impute_missing_wetbulb(row):
    """Replace a missing ('M') wetbulb with an estimate from tavg and dewpoint.

    The estimate is tavg lowered by one third of the gap between tavg and
    dewpoint. Rows whose wetbulb is not 'M' are returned untouched. The row
    is modified in place and returned.
    """
    if row['wetbulb'] != 'M':
        return row

    avg_temp = row['tavg']
    dew_gap = avg_temp - row['dewpoint']
    row['wetbulb'] = avg_temp - (dew_gap / 3)
    return row

# run the wetbulb imputation on every row (needs tavg to be numeric, which the cell above ensures)
df = df.apply(impute_missing_wetbulb, axis = 1)

In [None]:
def _column_median(column):
    """Return the median of the numeric readings in the global df[column]."""
    # Placeholders such as 'M' and '  T' (and any other non-numeric text)
    # are coerced to NaN, which median() skips. Converting first also avoids
    # taking a median over raw strings, which newer pandas rejects.
    return pd.to_numeric(df[column], errors='coerce').median()


def impute_missing_rest(row):
    """Fill the remaining placeholder values in one weather row.

    - heat / cool: when heat is 'M', both are recomputed from tavg around a
      base of 65 (cool = tavg - 65 when tavg >= 65, heat = 65 - tavg
      otherwise, the other one set to 0).
    - preciptotal: '  T' becomes 0; 'M' becomes the column median.
    - stnpressure, sealevel, avgspeed: 'M' becomes the column median.

    Medians are read from the module-level ``df`` at call time and are
    taken over that column's numeric readings only. tavg must already be
    numeric when heat is 'M'. The row is modified in place and returned,
    for use with ``df.apply(..., axis=1)``.
    """
    if row['heat'] == 'M':
        if row['tavg'] >= 65:
            row['heat'] = 0
            row['cool'] = row['tavg'] - 65
        else:
            row['heat'] = 65 - row['tavg']
            row['cool'] = 0

    if row['preciptotal'] == '  T':
        row['preciptotal'] = 0
    elif row['preciptotal'] == 'M':
        row['preciptotal'] = _column_median('preciptotal')

    for column in ('stnpressure', 'sealevel', 'avgspeed'):
        if row[column] == 'M':
            row[column] = _column_median(column)

    return row

# run the remaining imputations on every row; the medians inside
# impute_missing_rest are read from df as it stands before this assignment
df = df.apply(impute_missing_rest, axis = 1)

In [None]:
# recount the 'M', '-' and '  T' placeholders to compare with the "before cleaning" summary
print('after cleaning:')    
df_summary = print_summary(df)
df_summary

### Output Data

In [None]:
# preview the cleaned frame before it is written out
df.head()

In [None]:
# output clean data: write the cleaned frame to the clean-data folder,
# leaving the row index out of the file

df.to_csv(clean_path + 'weather_clean.csv',index=False)