# Cleansing & Transforming  
Summary of steps:  

- Merge weather attributes by city  
- Convert datetime columns from 'object' type to 'datetime' type  
- Add a new column to identify the time of day (i.e. AM1, AM2, PM1, PM2)  
- Add a new column to identify the compass direction of the wind (e.g. N, W, S, E)  


In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Load the Data

In [2]:
# Every raw input table lives in the same folder, so the shared prefix is
# spelled out once and each read only names its file.
_RAW_DIR = '../../Data/Raw/'


def _read_raw(filename):
    """Read one CSV from the raw-data folder into a DataFrame."""
    return pd.read_csv(_RAW_DIR + filename)


df_c = _read_raw('city_attributes.csv')      # city list (has a 'City' column)
df_h = _read_raw('humidity.csv')             # humidity
df_p = _read_raw('pressure.csv')             # pressure
df_t = _read_raw('temperature.csv')          # temperature
df_d = _read_raw('weather_description.csv')  # weather description
df_wd = _read_raw('wind_direction.csv')      # wind direction
df_ws = _read_raw('wind_speed.csv')          # wind speed

# Procedures (not used)

In [3]:
def timeday(df):
    """Label each hour-of-day value with a six-hour time-of-day bucket.

    Parameters
    ----------
    df : iterable of numbers
        Hour-of-day values, expected to lie in the range [0, 24).

    Returns
    -------
    list of str
        One label per input value, in input order:
        'AM1' (0:00-5:59), 'AM2' (6:00-11:59), 'PM1' (12:00-17:59),
        'PM2' (18:00-23:59), or '' for anything outside [0, 24),
        including NaN.
    """
    result = []
    for row in df:
        # Half-open intervals keep fractional hours (e.g. 5.5) in the bucket
        # the labels describe, and the explicit lower bound stops negative
        # values from being reported as early morning.
        if 0 <= row < 6:
            #early morning 0:00 to 5:59
            result.append('AM1')
        elif 6 <= row < 12:
            #morning 6:00 to 11:59
            result.append('AM2')
        elif 12 <= row < 18:
            #afternoon 12:00 to 17:59
            result.append('PM1')
        elif 18 <= row < 24:
            #evening 18:00 to 23:59
            result.append('PM2')
        else:
            # out of range or NaN (every comparison with NaN is False)
            result.append('')
    return result

In [4]:
# The 16 compass points in clockwise order, 22.5 degrees apart. 'N' appears
# again at the end so that bearings just below 360 degrees, which round up to
# index 16, still map to north.
arr = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW', 'N']
def wind_dir(df):
    """Convert wind bearings in degrees to 16-point compass labels.

    Parameters
    ----------
    df : iterable of numbers
        Wind directions in degrees. Values of 360 or more wrap around.

    Returns
    -------
    list of str
        One compass label from ``arr`` per input value, in input order.
        Negative values and NaN produce ''.
    """
    result = []
    for row in df:
        if row >= 0:
            k = (row % 360) / 22.5
            # Round half up explicitly. The built-in round() sends ties to the
            # nearest even integer, so exact sector boundaries would flip
            # between the lower and the upper sector. int() also guarantees a
            # plain integer list index whatever numeric type the input has.
            result.append(arr[int(k + 0.5)])
        else:
            # negative bearing or NaN (NaN >= 0 is False)
            result.append('')
    return result


# Transform the weather data by city

In [5]:
# For each city, take that city's column out of every attribute table, give it
# a descriptive name, and join all of them on the shared 'datetime' column.
for city in df_c['City']:
    # (source table, output column name) in the order the columns are merged
    attribute_tables = [
        (df_h, 'humidity'),
        (df_p, 'pressure'),
        (df_t, 'temperature'),
        (df_d, 'description'),
        (df_ws, 'wind_speed'),
        (df_wd, 'wind_direction'),
    ]

    df = None
    for table, column_name in attribute_tables:
        piece = table[['datetime', city]].rename(columns={city: column_name})
        if df is None:
            # the first table seeds the result
            df = piece
        else:
            # outer join keeps timestamps that appear in only some tables
            df = pd.merge(df, piece, on='datetime', how='outer')

    # one processed file per city
    df.to_csv('../../Data/Processed/' + city + '.csv', index=False)

# Cleanse the city weather data

In [6]:
def num_missing(x):
    """Return how many entries of *x* are missing (null/NaN).

    Parameters
    ----------
    x : pandas.Series
        The values to inspect, e.g. one column handed over by
        ``DataFrame.apply(num_missing, axis=0)``.

    Returns
    -------
    int
        Count of null entries.
    """
    # Count with the vectorised Series.sum() instead of iterating the Series
    # element by element through the built-in sum().
    return int(x.isnull().sum())

In [7]:
# Clean each per-city file in place: drop empty rows, fill short gaps,
# convert the temperature column and write the result back to the same path.
for row in df_c['City']:
    print(row)
    city_path = '../../Data/Processed/' + row + '.csv'
    df_city = pd.read_csv(city_path)

    #remove rows where all attributes are null
    df_city = df_city.dropna(
        subset=['humidity', 'pressure', 'temperature', 'description', 'wind_speed', 'wind_direction'],
        how='all',
    )

    #sort by start date
    # This must happen BEFORE the fills below: bfill/ffill copy values between
    # neighbouring rows, so they are only meaningful when the rows are already
    # in chronological order. 'datetime' is still text at this point (read_csv
    # is called without date parsing), so the ordering is lexicographic.
    df_city = df_city.sort_values(by=['datetime'])

    #Backfill missing data (at most 8 consecutive missing rows per gap)
    df_city = df_city.bfill(limit=8)

    #Forwardfill whatever the backfill could not reach (same limit)
    df_city = df_city.ffill(limit=8)
    # To inspect what is still missing after the fills:
    #     print(df_city.apply(num_missing, axis=0))

    #convert temperature to celsius (subtracting 273.15, i.e. from kelvin)
    df_city['temperature'] = df_city['temperature'] - 273.15
    df_city = df_city.rename(columns={'temperature': 'temp_celsius'})

    df_city.to_csv(city_path, index=False)
    

Vancouver
Portland
San Francisco
Seattle
Los Angeles
San Diego
Las Vegas
Phoenix
Albuquerque
Denver
San Antonio
Dallas
Houston
Kansas City
Minneapolis
Saint Louis
Chicago
Nashville
Indianapolis
Atlanta
Detroit
Jacksonville
Charlotte
Miami
Pittsburgh
Toronto
Philadelphia
New York
Montreal
Boston
Beersheba
Tel Aviv District
Eilat
Haifa
Nahariyya
Jerusalem
