# Merge 'weather_daily_darksky_clean.csv' with 'daily_dataset_clean.csv'

### Import libraries

In [1]:
import pandas as pd
# Show every column when a dataframe is displayed (None disables the column limit).
pd.options.display.max_columns = None

### Read .csv files into dataframes

In [2]:
# Read the daily weather CSV into a dataframe (path is relative to the notebook's working directory).
df_weather = pd.read_csv("../Data/weather_daily_darksky_clean.csv")

In [3]:
# Read the daily energy CSV into a dataframe (path is relative to the notebook's working directory).
df_daily = pd.read_csv("../Data/daily_dataset_clean.csv")

### Check data types of the columns to be merged on

In [4]:
# dtype of the weather-side merge key before any conversion.
df_weather['time'].dtype

dtype('O')

In [5]:
# dtype of the energy-side merge key before any conversion.
df_daily['day'].dtype

dtype('O')

### Transform data types to date

In [6]:
# Parse the timestamp strings and truncate each one to midnight, so the column
# holds one datetime64 value per calendar day. `.dt.normalize()` does this in a
# single vectorised step, avoiding a round trip through Python `date` objects
# (datetime -> date -> datetime). Bracket access is used for the assignment so
# it cannot be confused with setting an attribute on the dataframe.
df_weather['time'] = pd.to_datetime(df_weather['time']).dt.normalize()

# Order the weather rows chronologically.
df_weather = df_weather.sort_values(by='time')

# Display the converted column as a sanity check.
df_weather['time']

13    2011-11-01
60    2011-11-02
34    2011-11-03
31    2011-11-04
46    2011-11-05
         ...    
874   2014-03-27
876   2014-03-28
867   2014-03-29
844   2014-03-30
830   2014-03-30
Name: time, Length: 882, dtype: datetime64[ns]

In [7]:
# Parse the 'day' strings into datetime64 so they can be matched against the
# weather dates.
# NOTE(review): the format string expects slash-separated dates (YYYY/MM/DD);
# confirm this matches how 'day' is stored in the CSV.
df_daily['day'] = pd.to_datetime(df_daily['day'], format='%Y/%m/%d')

# Order the energy rows chronologically.
df_daily = df_daily.sort_values('day')

# Display the converted column as a sanity check.
df_daily['day']

2284874   2011-11-23
1727195   2011-11-23
2123541   2011-11-23
614605    2011-11-23
867855    2011-11-23
             ...    
1193090   2014-02-28
3112126   2014-02-28
1192269   2014-02-28
1197434   2014-02-28
3510402   2014-02-28
Name: day, Length: 3510403, dtype: datetime64[ns]

### Merge the two dataframes

In [8]:
# Attach the weather of each calendar day to every household row of that day.
#
# The weather frame can contain more than one row for the same date (the
# displayed 'time' column ends with 2014-03-30 twice). A duplicated date inside
# the energy date range would make the join emit every household row of that day
# once per duplicate, so the weather side is reduced to one row per date first
# and the one-to-many shape is declared via `validate`.
#
# This is still an inner join: energy rows whose day has no weather record are
# dropped from the result.
df_merged = (
    df_weather
    .drop_duplicates(subset='time', keep='first')
    .merge(df_daily, left_on='time', right_on='day', validate='one_to_many')
)

### Check columns of merged dataframe

In [9]:
# List every column of the merged frame: weather columns first, then the energy columns.
df_merged.columns

Index(['temperatureMax', 'temperatureMaxTime', 'windBearing', 'icon',
       'dewPoint', 'temperatureMinTime', 'cloudCover', 'windSpeed', 'pressure',
       'apparentTemperatureMinTime', 'apparentTemperatureHigh', 'precipType',
       'visibility', 'humidity', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureMax', 'uvIndex', 'time',
       'sunsetTime', 'temperatureLow', 'temperatureMin', 'temperatureHigh',
       'sunriseTime', 'temperatureHighTime', 'uvIndexTime', 'summary',
       'temperatureLowTime', 'apparentTemperatureMin',
       'apparentTemperatureMaxTime', 'apparentTemperatureLowTime', 'moonPhase',
       'LCLid', 'day', 'energy_median', 'energy_mean', 'energy_max',
       'energy_count', 'energy_std', 'energy_sum', 'energy_min'],
      dtype='object')

### Inspect data type of 'sunsetTime' & 'sunriseTime' columns

In [10]:
# dtype of the sunset column before conversion.
df_merged['sunsetTime'].dtype

dtype('O')

In [11]:
# dtype of the sunrise column before conversion.
df_merged['sunriseTime'].dtype

dtype('O')

### Transform data types to datetime

In [12]:
# Parse both sun-event columns from strings into datetime64 so they can be
# subtracted from each other.
df_merged['sunriseTime'] = df_merged['sunriseTime'].pipe(pd.to_datetime)
df_merged['sunsetTime'] = df_merged['sunsetTime'].pipe(pd.to_datetime)

### Calculate the number of daylight hours for each day

In [13]:
# Daylight length in hours: the sunrise-to-sunset timedelta divided by a
# one-hour timedelta, which yields a plain float column.
df_merged['dayTime'] = (
    (df_merged['sunsetTime'] - df_merged['sunriseTime']) / pd.Timedelta(hours=1)
)

### Drop unnecessary columns

In [14]:
# Remove the columns that are not carried forward. The labels are grouped by
# theme; the order of the labels has no effect on the result.
df_merged = df_merged.drop(
    labels=[
        # weather-side join key (equal to 'day' after the merge)
        'time',
        # timestamps of the daily extremes
        'temperatureMaxTime',
        'temperatureMinTime',
        'temperatureHighTime',
        'temperatureLowTime',
        'apparentTemperatureMaxTime',
        'apparentTemperatureMinTime',
        'apparentTemperatureHighTime',
        'apparentTemperatureLowTime',
        'uvIndexTime',
        # sun events, already condensed into 'dayTime'
        'sunsetTime',
        'sunriseTime',
        # apparent-temperature readings
        'apparentTemperatureMax',
        'apparentTemperatureMin',
        'apparentTemperatureHigh',
        'apparentTemperatureLow',
        # high/low temperature readings
        'temperatureHigh',
        'temperatureLow',
        # remaining numeric measurements
        'windBearing',
        'windSpeed',
        'dewPoint',
        'pressure',
        'humidity',
        'uvIndex',
        # text / categorical descriptions
        'precipType',
        'summary',
    ],
    axis='columns',
)

### Double check data types

In [15]:
# Summarise the remaining columns, their dtypes and the frame's memory usage.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3506288 entries, 0 to 3506287
Data columns (total 16 columns):
 #   Column          Dtype         
---  ------          -----         
 0   temperatureMax  float64       
 1   icon            object        
 2   cloudCover      float64       
 3   visibility      float64       
 4   temperatureMin  float64       
 5   moonPhase       float64       
 6   LCLid           object        
 7   day             datetime64[ns]
 8   energy_median   float64       
 9   energy_mean     float64       
 10  energy_max      float64       
 11  energy_count    int64         
 12  energy_std      float64       
 13  energy_sum      float64       
 14  energy_min      float64       
 15  dayTime         float64       
dtypes: datetime64[ns](1), float64(12), int64(1), object(2)
memory usage: 454.8+ MB


### Rearrange column order (note: `energy_median` is not carried over)

In [16]:
# Put the identifiers first, then the energy statistics, then the weather features.
# NOTE(review): 'energy_median' is present in the frame but not in this list, so
# it is not carried over - confirm that this is intentional.
df_merged = df_merged.loc[
    :,
    [
        'day', 'LCLid',
        'energy_sum', 'energy_mean', 'energy_max', 'energy_min',
        'energy_std', 'energy_count',
        'dayTime', 'visibility', 'temperatureMin', 'temperatureMax',
        'icon', 'cloudCover', 'moonPhase',
    ],
]

### Export merged dataframe to .csv file

In [17]:
# Write the merged frame to disk. The row index is written as an extra, unnamed
# first column because `index=False` is not passed.
# NOTE(review): pass index=False if readers of this file do not expect that column.
df_merged.to_csv("../Data/daily_weather_merged.csv")