# Data Cleaning, Preprocessing, and Feature Engineering

In this notebook, we read in our data, match each instance to its sky image, and add the image file path to the DataFrame.
After joining our data, we join again on the lagged timestamps so that each irradiance reading is paired with the weather, irradiance, and sky image from the interval before it.

All data was downloaded from __[here](https://zenodo.org/record/2826939#.YEPKXi1h1pS)__. Thanks so much to the University of California San Diego team (Carreira Pedro, Hugo; Larson, David; Coimbra, Carlos), who worked so hard on collecting this data and on supporting the work of others in this space.

#### Below we will:
1. [Create dataframes](#Create-DataFrames-for-Models)
    - create filepaths for images
    - get time intervals and cross join for earlier irradiance data
    - merge weather and irradiance data
    
  
2. [Explore our data](#Data-Exploration:)
3. [preprocess/scale our data](#Pre-Processing-and-Scaling)


Import needed libraries:

In [380]:
import pandas as pd
import bz2
from datetime import datetime,timedelta
import tarfile
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from math import pi, sin

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# import sklearn

Functions needed for all processes in notebook:

In [398]:
def breakdown_dates(df,column):
    """Add calendar-component columns derived from a datetime-like column.

    The columns ``year``, ``month``, ``day``, ``hour`` and ``minute`` are
    written onto ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    column : str
        Name of the column holding values ``pd.DatetimeIndex`` can parse.

    Returns
    -------
    None
    """
    # Parse the column once and reuse it for every component.
    stamps = pd.DatetimeIndex(df[column])
    df['year'] = stamps.year
    df['month'] = stamps.month
    df['day'] = stamps.day
    df['hour'] = stamps.hour
    df['minute'] = stamps.minute
    return

# Season code -> the month numbers that belong to it.
seasons = {
    2: [3, 4, 5],
    3: [6, 7, 8],
    4: [9, 10, 11],
    1: [12, 1, 2],
}


def replace_m_w_season(row):
    """Return the season code (a key of ``seasons``) for a month number.

    ``row`` is a single month value. ``None`` is returned when the value
    is not listed under any season.
    """
    matches = (code for code, months in seasons.items() if row in months)
    return next(matches, None)

def datetime_blank_min_before(df,dt):
    """Add a ``"<dt>_min_before"`` column holding ``timestamp`` minus ``dt`` minutes.

    ``df`` is modified in place and must have a ``timestamp`` column that
    ``pd.DatetimeIndex`` can parse. Nothing is returned.
    """
    stamps = pd.DatetimeIndex(df['timestamp'])
    lag = timedelta(minutes=dt)
    target_column = f'{dt}_min_before'
    df[target_column] = stamps - lag
    return

def numberOfDays(y, m):
    """Return the number of days in month ``m`` of year ``y``.

    February gets 29 days in Gregorian leap years (divisible by 4, except
    centuries not divisible by 400). Any ``m`` that is neither 2 nor one
    of the 31-day months yields 30; ``m`` is not range-checked.
    """
    if m == 2:
        is_leap = (y % 400 == 0) or (y % 4 == 0 and y % 100 != 0)
        return 29 if is_leap else 28
    # Named so that it no longer shadows the builtin ``list``.
    long_months = (1, 3, 5, 7, 8, 10, 12)
    return 31 if m in long_months else 30

def make_image_path(row):
    """Return both time bounds for ``row`` in one dict.

    The dict has the keys ``'higher_file_path'`` and ``'lower_file_path'``,
    filled from ``make_higher_image_path`` and ``make_lower_image_path``.
    """
    return {
        'higher_file_path': make_higher_image_path(row),
        'lower_file_path': make_lower_image_path(row),
    }

def get_all_file_names(li_file_names,file_name_dict):
    """Group archive member names into ``file_name_dict`` by name length.

    Names are classified purely by length:

    * 5-7 characters: stored as a top-level key mapping to a new dict
      (presumably a ``YYYY/MM`` directory -- confirm against the archive).
    * 8-10 characters: stored under the most recent top-level key, mapping
      to a new list (presumably a ``YYYY/MM/DD`` directory).
    * more than 10 characters: appended to the most recent list.

    Names containing ``.DS_Store`` and names of four characters or fewer
    are ignored. The grouping relies on directory entries appearing before
    the files they contain.

    Parameters
    ----------
    li_file_names : iterable of str
        Member names in archive order.
    file_name_dict : dict
        Filled in place.

    Returns
    -------
    None
    """
    for name in li_file_names:
        if '.DS_Store' in name:
            continue
        length = len(name)
        if 4 < length <= 7:
            yrmn = name
            file_name_dict[name] = {}
        elif 7 < length <= 10:
            yrmnd = name
            file_name_dict[yrmn][name] = []
        elif length > 10:
            try:
                file_name_dict[yrmn][yrmnd].append(name)
            except (KeyError, NameError):
                # A file appeared before its directory entries were seen.
                # Report it and stop, as before, but no longer swallow
                # unrelated errors such as KeyboardInterrupt.
                print(name)
                break


# 2014/12/29/20141229_170300.jpg
def make_higher_image_path(row):
    """Return the upper bound of the time window used to match a sky image.

    The bound is second 46 of the row's own minute.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``year``, ``month``, ``day``, ``hour`` and a minute
        value under either ``min`` or ``minute``. ``breakdown_dates``
        writes ``minute``; ``min`` is still accepted for older frames.

    Returns
    -------
    datetime.datetime
    """
    minute_key = "min" if "min" in row else "minute"
    yr, mn, day = int(row["year"]), int(row["month"]), int(row["day"])
    hour, mi = int(row["hour"]), int(row[minute_key])

    return datetime(yr, mn, day, hour, mi, 46)

def make_lower_image_path(row):
    """Return the lower bound of the time window used to match a sky image.

    The bound is second 45 of the minute before the row's minute, i.e. the
    row's timestamp minus 15 seconds. Hour, day, month and year rollovers
    are handled by ``timedelta`` arithmetic.

    Special case: for 00:00 on 1 January, when the previous year is not one
    of 2014, 2015 or 2016, the row's own timestamp (second 0) is returned.
    The original code raised ``NameError`` on this path because of a
    mistyped variable name.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``year``, ``month``, ``day``, ``hour`` and a minute
        value under either ``min`` or ``minute``. ``breakdown_dates``
        writes ``minute``; ``min`` is still accepted for older frames.

    Returns
    -------
    datetime.datetime
    """
    minute_key = "min" if "min" in row else "minute"
    stamp = datetime(int(row["year"]), int(row["month"]), int(row["day"]),
                     int(row["hour"]), int(row[minute_key]))

    starts_year = (stamp.month, stamp.day, stamp.hour, stamp.minute) == (1, 1, 0, 0)
    if starts_year and (stamp.year - 1) not in (2014, 2015, 2016):
        # Do not step back into a year outside 2014-2016.
        return stamp

    return stamp - timedelta(seconds=15)


def get_correct_file(row,file_dict):
    """Find the image file whose timestamp lies inside the row's time window.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``year``, ``month``, ``day``, ``lower_file`` and
        ``higher_file`` (the inclusive window bounds).
    file_dict : dict
        Nested mapping ``{"YYYY": {"YYYY/MM": {"YYYY/MM/DD": [names]}}}``.
        Each name is parsed by position: year at ``[0:4]``, month at
        ``[5:7]``, day at ``[8:10]``, and hour, minute, second at
        ``[20:26]``.

    Returns
    -------
    str, None or int
        The first matching file name; ``None`` when the date is present
        but no file falls inside the window; ``0`` when the year, month or
        date is missing from ``file_dict``. Missing years and months used
        to raise ``KeyError``.
    """
    year = str(int(row["year"]))
    yrmn = f"{year}/{int(row['month']):02}"
    date = f"{yrmn}/{int(row['day']):02}"
    higher, lower = row["higher_file"], row["lower_file"]

    candidates = file_dict.get(year, {}).get(yrmn, {}).get(date)
    if candidates is None:
        return 0

    for file in candidates:
        dt_f = datetime(int(file[:4]), int(file[5:7]), int(file[8:10]),
                        int(file[20:22]), int(file[22:24]), int(file[24:26]))
        if (dt_f >= lower) and (dt_f <= higher):
            return file
    return None
    

def save_pickle(file_name,obj):
    """Pickle ``obj`` to the file at ``file_name``, overwriting any existing file."""
    with open(file_name, 'wb') as sink:
        pickle.dump(obj, sink)

def open_pickle(file_name):
    """Load and return the object pickled in the file at ``file_name``.

    NOTE: unpickling executes code stored in the file, so only use this on
    files you created yourself.
    """
    with open(file_name, 'rb') as source:
        return pickle.load(source)

def update_df_for_model(df,column):
    """Reduce a merged frame to the model columns and give them final names.

    Only the GHI columns of the merge are kept: ``ghi_x`` becomes the
    target ``Y`` and ``ghi_y`` becomes ``"<column>_i"``. ``timestamp_x`` is
    renamed to ``timestamp``. A new frame is returned; ``df`` is unchanged.
    """
    lagged_name = column + '_i'
    kept = [
        'ghi_x', 'timestamp_x', column,
        'air_temp', 'relhum', 'press', 'windsp',
        'winddir', 'max_windsp', 'precipitation',
        'file', 'ghi_y',
    ]
    new_names = {'timestamp_x': 'timestamp', 'ghi_x': 'Y', 'ghi_y': lagged_name}
    return df[kept].rename(columns=new_names)

def preview_df(df):
    """Return a per-column overview of ``df``.

    The result has one row per column of ``df`` with the columns ``name``,
    ``dtypes`` and ``first value``. The first value is taken from the row
    labelled ``0``, so ``df`` must have that index label.
    """
    first_row = df.loc[0].values
    overview = (
        pd.DataFrame(df.dtypes, columns=['dtypes'])
        .reset_index()
        .rename(columns={'index': 'name'})[['name', 'dtypes']]
        .assign(**{'first value': first_row})
    )
    column_names = pd.DataFrame(df.columns).rename(columns={0: "name"})
    return overview.merge(column_names, on='name', how='left')

# Length of one full cycle for each time component.
time_to_period = {
    'month': 12,
    'day': 31,
    'hour': 24,
    'minute': 60,
}


def process_time_to_sin(df,cols,time_to_period):
    """Replace each column in ``cols`` with ``sin(2*pi*value / period)``.

    The period of a column is looked up in ``time_to_period``. The
    columns are overwritten on ``df`` itself, and the transformed columns
    are returned as a frame.
    """
    for name in cols:
        period = time_to_period[name]

        def to_sine(value, period=period):
            return sin((2*pi*value)/period)

        df[name] = df[name].apply(to_sine)

    return df[cols]
# time = [-5:-1] (['month', 'day', 'hour', 'minute'])
# cont = cols[:-5]
# cat = cols[-1]

def process_timeahead_attributes(df_name,train,test,time_to_period):
    """Scale and encode the train and test feature frames for one horizon.

    Columns are selected by position in ``train``: the last column is the
    categorical feature, all but the last five columns are the continuous
    features, and ``month``, ``day``, ``hour`` and ``minute`` are the time
    features.

    * Continuous features are min-max scaled; the scaler is fitted on
      ``train`` only and then applied to ``test``.
    * The categorical feature is one-hot encoded with its first level
      dropped. The test encoding is aligned to the training columns, so
      both matrices have the same width even when a category is missing
      from one split. Categories seen only in ``test`` encode as all
      zeros.
    * Time features go through ``process_time_to_sin``, which overwrites
      those columns on ``train`` and ``test`` in place.

    Parameters
    ----------
    df_name : object
        Unused; kept for interface compatibility.
    train, test : pandas.DataFrame
        Feature frames with identical column layout.
    time_to_period : dict
        Period of each time column, passed to ``process_time_to_sin``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(trainX, testX)``, stacked as continuous, time, categorical.
    """
    cols = train.columns.to_list()
    cat_col = cols[-1]
    con_cols = cols[:-5]
    time_cols = ['month', 'day', 'hour', 'minute']

    # Min-max scale the continuous features using training statistics only.
    scaler = MinMaxScaler()
    trainContinuous = scaler.fit_transform(train[con_cols])
    testContinuous = scaler.transform(test[con_cols])

    # One-hot encode the categorical feature. The test split is encoded in
    # full and then reindexed to the training columns; encoding it on its
    # own with drop_first could drop a different level than in training.
    trainCategorical = pd.get_dummies(train[cat_col], drop_first=True)
    testCategorical = pd.get_dummies(test[cat_col]).reindex(
        columns=trainCategorical.columns, fill_value=0)

    # Sine-transform the time columns; outputs lie in [-1, 1].
    trainTimeCols = process_time_to_sin(train, time_cols, time_to_period)
    testTimeCols = process_time_to_sin(test, time_cols, time_to_period)

    trainX = np.hstack([trainContinuous, trainTimeCols, trainCategorical])
    testX = np.hstack([testContinuous, testTimeCols, testCategorical])
    return (trainX, testX)

#need to look into how these may have been saved incorrectly, so if they're off then 
# they can be matched to the closest file by second, maybe there should be an upper 
# and lower time within the min?
    

In [355]:
# Scratch list of feature column names, used by the next cell to try out
# negative slicing.
test =['air_temp', 'relhum', 'press', 'windsp', 'winddir', 'max_windsp',
'precipitation','5_min_before_i', 'season', 'year', 'month',
'day', 'hour']

In [365]:
# Take the fifth-, fourth- and third-from-last names of the scratch list.
li = test[-5:-2]
# li.append(test[-6])

In [366]:
# Display the selected names.
li

['season', 'year', 'month']

### Create DataFrames for Models


Open the image files to get image names:


In [None]:
# Scan each yearly image archive and group its member names into a nested
# dict with get_all_file_names.
new_img_file_names = {}
for yr in [2014,2015,2016]:
    f_name = f'data/Folsom_sky_images_{yr}.tar.bz2'
    print(f_name)
    # The context manager closes the archive once the names are read.
    with tarfile.open(f_name, "r") as tar:
        tar_members_names = tar.getnames()
    # Keyed by the year as a string, because get_correct_file looks the
    # year up with str(row['year']).
    new_img_file_names[str(yr)] = {}
    get_all_file_names(tar_members_names,new_img_file_names[str(yr)])


In [None]:
# Restore previously saved intermediate results. The save calls that
# produced them are left commented out.
# NOTE(review): the commented save writes 'image_file_names.pkl', while the
# load below reads 'data/image_file_names.pkl' -- confirm the intended path.
# save_pickle('image_file_names.pkl',new_img_file_names)
# save_pickle('data/df_solar_and_img_data.pkl',df_merge_1)
img_file_names = open_pickle('data/image_file_names.pkl')
df_merge_1 = open_pickle('data/df_solar_and_img_data.pkl')

Read in weather and irradiance data:

In [None]:
# Load the Folsom CSV exports. Where index_col=0 is given, the first CSV
# column becomes the frame's index.
fol_irr = pd.read_csv('data/Folsom_irradiance.csv',index_col=0)
fol_sat = pd.read_csv('data/Folsom_satellite.csv')
fol_sky_img = pd.read_csv('data/Folsom_sky_image_features.csv',index_col=0)
fol_weather = pd.read_csv('data/Folsom_weather.csv',index_col=0)

Get datetime for each interval:

In [None]:
# fol_sat.columns
# Add one "<N>_min_before" column to fol_irr for each horizon, 5 to 30 minutes.
for _horizon in (5, 10, 15, 20, 25, 30):
    datetime_blank_min_before(fol_irr, _horizon)

In [None]:
# Parse the weather timestamps so they can be merged against the datetime
# "<N>_min_before" columns of fol_irr.
fol_weather['timestamp'] = pd.to_datetime(fol_weather['timestamp'])

Merge the irradiance DataFrame with the Folsom weather DataFrame on the timestamp of the interval before:

In [None]:
def _join_weather(before_col):
    """Left-join fol_irr to the weather row whose timestamp equals ``before_col``."""
    return pd.merge(fol_irr, fol_weather, how="left",
                    left_on=before_col, right_on="timestamp")


df_5_min_ahead = _join_weather("5_min_before")
df_10_min_ahead = _join_weather("10_min_before")
df_15_min_ahead = _join_weather("15_min_before")
df_20_min_ahead = _join_weather("20_min_before")
df_25_min_ahead = _join_weather("25_min_before")
df_30_min_ahead = _join_weather("30_min_before")

In [None]:
# Irradiance and weather columns kept for every horizon.
_AHEAD_VALUE_COLUMNS = ['ghi', 'dni', 'dhi',
                        'air_temp', 'relhum', 'press', 'windsp', 'winddir',
                        'max_windsp', 'precipitation']


def _trim_ahead(df, before_col):
    """Drop rows with any missing value, keep the model columns, rename the timestamp."""
    kept = ['timestamp_x', before_col] + _AHEAD_VALUE_COLUMNS
    return df.dropna()[kept].rename(columns={'timestamp_x': 'timestamp'})


df_5_min_ahead = _trim_ahead(df_5_min_ahead, '5_min_before')

df_10_min_ahead = _trim_ahead(df_10_min_ahead, '10_min_before')

df_15_min_ahead = _trim_ahead(df_15_min_ahead, '15_min_before')

df_20_min_ahead = _trim_ahead(df_20_min_ahead, '20_min_before')

df_25_min_ahead = _trim_ahead(df_25_min_ahead, '25_min_before')

df_30_min_ahead = _trim_ahead(df_30_min_ahead, '30_min_before')

In [None]:
# Split each table's "<N>_min_before" column (its second column) into
# year/month/day/hour/minute columns.
for table in (df_5_min_ahead, df_10_min_ahead, df_15_min_ahead,
              df_20_min_ahead, df_25_min_ahead, df_30_min_ahead):
    before_col = table.columns[1]
    breakdown_dates(table, before_col)

Using functions get the times below and above ours to match the image files to:

In [None]:
# Attach the upper and lower time bounds used to match an image to each row.
for table in (df_5_min_ahead, df_10_min_ahead, df_15_min_ahead,
              df_20_min_ahead, df_25_min_ahead, df_30_min_ahead):
    print(table.columns[1])
    table['higher_file'] = table.apply(make_higher_image_path, axis=1)
    table['lower_file'] = table.apply(make_lower_image_path, axis=1)

Get the correct image file that falls between the two columns above:

In [None]:
# Look up, for every row, the image file that falls between its two bounds.
for table in (df_5_min_ahead, df_10_min_ahead, df_15_min_ahead,
              df_20_min_ahead, df_25_min_ahead, df_30_min_ahead):
    print(table.columns[1])
    table['file'] = table.apply(get_correct_file, axis=1, args=(img_file_names,))

Keep only the instances for which we have an image file:

In [None]:
def _rows_with_image(df):
    """Keep rows whose ``file`` value is neither missing nor the ``0`` sentinel."""
    return df[df.file.notnull() & (df.file != 0)]


df_5_min_ahead_w_img = _rows_with_image(df_5_min_ahead)
df_10_min_ahead_w_img = _rows_with_image(df_10_min_ahead)
df_15_min_ahead_w_img = _rows_with_image(df_15_min_ahead)
df_20_min_ahead_w_img = _rows_with_image(df_20_min_ahead)
df_25_min_ahead_w_img = _rows_with_image(df_25_min_ahead)
df_30_min_ahead_w_img = _rows_with_image(df_30_min_ahead)

In [3]:
# Checkpoint: the image-matched tables are reloaded from disk here. The
# save calls that write the same file names are left commented out.
# save_pickle("df_5_min_ahead_data.pkl",df_5_min_ahead_w_img)
# save_pickle("df_10_min_ahead_data.pkl",df_10_min_ahead_w_img)
# save_pickle("df_15_min_ahead_data.pkl",df_15_min_ahead_w_img)
# save_pickle("df_20_min_ahead_data.pkl",df_20_min_ahead_w_img)
# save_pickle("df_25_min_ahead_data.pkl",df_25_min_ahead_w_img)
# save_pickle("df_30_min_ahead_data.pkl",df_30_min_ahead_w_img)
df_5_min_ahead_w_img = open_pickle("df_5_min_ahead_data.pkl")
df_10_min_ahead_w_img = open_pickle("df_10_min_ahead_data.pkl")
df_15_min_ahead_w_img = open_pickle("df_15_min_ahead_data.pkl")
df_20_min_ahead_w_img = open_pickle("df_20_min_ahead_data.pkl")
df_25_min_ahead_w_img = open_pickle("df_25_min_ahead_data.pkl")
df_30_min_ahead_w_img = open_pickle("df_30_min_ahead_data.pkl")

In [4]:
# Weather columns plus the matched image file, shared by every horizon.
_IMG_TABLE_TAIL = ['ghi', 'air_temp', 'relhum', 'press', 'windsp', 'winddir',
                   'max_windsp', 'precipitation', 'file']


def _select_img_columns(df, before_col):
    """Keep the timestamp, the lagged timestamp, GHI, weather and file columns."""
    return df[['timestamp', before_col] + _IMG_TABLE_TAIL]


df_5_min_ahead_w_img = _select_img_columns(df_5_min_ahead_w_img, '5_min_before')
df_10_min_ahead_w_img = _select_img_columns(df_10_min_ahead_w_img, '10_min_before')
df_15_min_ahead_w_img = _select_img_columns(df_15_min_ahead_w_img, '15_min_before')
df_20_min_ahead_w_img = _select_img_columns(df_20_min_ahead_w_img, '20_min_before')
df_25_min_ahead_w_img = _select_img_columns(df_25_min_ahead_w_img, '25_min_before')
df_30_min_ahead_w_img = _select_img_columns(df_30_min_ahead_w_img, '30_min_before')

Update datetime format on all DFs before we join to get irradiance from earlier timestamps:

In [6]:
# Parse the 'timestamp' column of every image-matched table to datetimes.
_img_tables = (df_5_min_ahead_w_img, df_10_min_ahead_w_img, df_15_min_ahead_w_img,
               df_20_min_ahead_w_img, df_25_min_ahead_w_img, df_30_min_ahead_w_img)
for df in _img_tables:
    parsed = pd.to_datetime(df['timestamp'])
    df['timestamp'] = parsed

Now join tables, to match on their time ahead intervals. This will give us the irradiance at the time interval before.

In [12]:
def _join_earlier_irradiance(df, before_col):
    """Left-join ``df`` to the fol_irr row whose timestamp equals ``before_col``."""
    return df.merge(fol_irr, how="left", left_on=before_col, right_on="timestamp")


df_5_min = _join_earlier_irradiance(df_5_min_ahead_w_img, "5_min_before")
df_10_min = _join_earlier_irradiance(df_10_min_ahead_w_img, "10_min_before")
df_15_min = _join_earlier_irradiance(df_15_min_ahead_w_img, "15_min_before")
df_20_min = _join_earlier_irradiance(df_20_min_ahead_w_img, "20_min_before")
df_25_min = _join_earlier_irradiance(df_25_min_ahead_w_img, "25_min_before")
df_30_min = _join_earlier_irradiance(df_30_min_ahead_w_img, "30_min_before")

In [17]:
# df_20_min.head(21)

Use the function to keep only the columns the model needs, under their final names:

In [18]:
# Reduce every joined table to the model columns (target Y, weather, image
# file and the earlier irradiance "<N>_min_before_i").
df_5_min, df_10_min, df_15_min, df_20_min, df_25_min, df_30_min = [
    update_df_for_model(frame, f"{minutes}_min_before")
    for minutes, frame in zip(
        (5, 10, 15, 20, 25, 30),
        (df_5_min, df_10_min, df_15_min, df_20_min, df_25_min, df_30_min),
    )
]

In [370]:
# Checkpoint: the per-horizon model tables are reloaded from ../data_rp.
# The save calls that write the same paths are left commented out.
# save_pickle("../data_rp/df_5_min_data.pkl",df_5_min)
# save_pickle("../data_rp/df_10_min_data.pkl",df_10_min)
# save_pickle("../data_rp/df_15_min_data.pkl",df_15_min)
# save_pickle("../data_rp/df_20_min_data.pkl",df_20_min)
# save_pickle("../data_rp/df_25_min_data.pkl",df_25_min)
# save_pickle("../data_rp/df_30_min_data.pkl",df_30_min)
df_5_min = open_pickle("../data_rp/df_5_min_data.pkl")
df_10_min = open_pickle("../data_rp/df_10_min_data.pkl")
df_15_min = open_pickle("../data_rp/df_15_min_data.pkl")
df_20_min = open_pickle("../data_rp/df_20_min_data.pkl")
df_25_min = open_pickle("../data_rp/df_25_min_data.pkl")
df_30_min = open_pickle("../data_rp/df_30_min_data.pkl")

In [371]:
# Show the first rows of the 5-minute table.
df_5_min.head()

Unnamed: 0,Y,timestamp,5_min_before,air_temp,relhum,press,windsp,winddir,max_windsp,precipitation,file,5_min_before_i
0,2.52,2014-01-02 15:33:00,2014-01-02 15:28:00,2.8,75.06,1010.0,2.0,199.6,2.6,0.0,2014/01/02/20140102_152808.jpg,4.87
1,3.17,2014-01-02 15:34:00,2014-01-02 15:29:00,2.7,75.5,1010.0,1.74,190.4,2.4,0.0,2014/01/02/20140102_152907.jpg,5.59
2,3.9,2014-01-02 15:35:00,2014-01-02 15:30:00,2.7,75.54,1010.0,1.78,193.6,2.3,0.0,2014/01/02/20140102_153008.jpg,1.23
3,4.64,2014-01-02 15:36:00,2014-01-02 15:31:00,2.7,74.98,1010.0,1.72,192.2,2.1,0.0,2014/01/02/20140102_153108.jpg,1.62
4,5.36,2014-01-02 15:37:00,2014-01-02 15:32:00,2.62,74.76,1010.0,1.66,188.2,2.4,0.0,2014/01/02/20140102_153208.jpg,2.04


In [374]:
# Add the date-component columns and a season code derived from the month.
for df in (df_5_min, df_10_min, df_15_min, df_20_min, df_25_min, df_30_min):
    breakdown_dates(df, 'timestamp')
    df['season'] = df['month'].apply(replace_m_w_season)

In [375]:
# Show the column labels of the 5-minute table.
df_5_min.columns

Index(['Y', 'timestamp', '5_min_before', 'air_temp', 'relhum', 'press',
       'windsp', 'winddir', 'max_windsp', 'precipitation', 'file',
       '5_min_before_i', 'year', 'month', 'day', 'hour', 'minute', 'season'],
      dtype='object')

In [56]:
# Show the column labels of the 5-minute table.
# NOTE(review): the output pasted below lists 'min' and 'sec' columns, which
# breakdown_dates does not create -- presumably from an earlier run; confirm.
df_5_min.columns

Index(['Y', 'timestamp', '5_min_before', 'air_temp', 'relhum', 'press',
       'windsp', 'winddir', 'max_windsp', 'precipitation', 'file',
       '5_min_before_i', 'year', 'month', 'day', 'hour', 'min', 'sec',
       'season'],
      dtype='object')

In [301]:
# t = 5
# for df in [df_5_min,df_10_min,df_15_min,df_20_min,df_25_min,df_30_min]:
#     df.drop(['day','min','sec'], axis=1,inplace=True)
#     df.drop(['month'], axis=1,inplace=True)

#     col = f'{t}_min_before'
#     col_1 = f'{t}_min_before_i'
# #     print(t,col,col_1)
#     df = df[['Y', 'timestamp', col, 'air_temp', 'relhum', 'press',
#        'windsp', 'winddir', 'max_windsp', 'precipitation', 'file',
#        col_1, 'year', 'month', 'hour','season']]
#     t += 5

In [377]:
# Draw a reproducible 10,000-row sample from every horizon table and drop
# the two timestamp columns.
sampled_dfs = {}
for m, df in zip((5, 10, 15, 20, 25, 30),
                 (df_5_min, df_10_min, df_15_min, df_20_min, df_25_min, df_30_min)):
    d = df.sample(n=10000, random_state=42).drop(columns=['timestamp', f"{m}_min_before"])
    sampled_dfs[f"df_{m}_min"] = d

In [378]:
# Show the column labels of the sampled 5-minute table.
sampled_dfs['df_5_min'].columns

Index(['Y', 'air_temp', 'relhum', 'press', 'windsp', 'winddir', 'max_windsp',
       'precipitation', 'file', '5_min_before_i', 'year', 'month', 'day',
       'hour', 'minute', 'season'],
      dtype='object')

In [379]:
# Display the sampled 5-minute table.
sampled_dfs['df_5_min']

Unnamed: 0,Y,air_temp,relhum,press,windsp,winddir,max_windsp,precipitation,file,5_min_before_i,year,month,day,hour,minute,season
212504,16.37,11.10,74.34,1004.0,1.46,55.0,2.1,0.0,2014/11/09/20141109_145759.jpg,12.83,2014,11,9,15,3,4
730167,147.30,17.10,51.86,1004.0,0.30,128.9,0.5,0.0,2016/10/31/20161031_224459.jpg,163.50,2016,10,31,22,50,4
650333,529.20,19.54,51.86,1000.0,1.46,144.2,3.2,0.0,2016/07/17/20160717_155708.jpg,514.10,2016,7,17,16,2,3
540543,557.40,22.60,29.70,1007.0,1.66,175.0,2.8,0.0,2016/03/01/20160301_223608.jpg,568.50,2016,3,1,22,41,2
272258,318.50,14.42,73.84,1008.0,1.50,300.2,2.0,0.0,2015/02/19/20150219_220511.jpg,262.70,2015,2,19,22,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454845,380.90,30.40,31.20,1006.0,0.62,324.4,1.1,0.0,2015/10/08/20151008_213209.jpg,586.30,2015,10,8,21,37,4
722096,273.50,11.10,71.52,1011.0,0.68,89.0,0.9,0.0,2016/10/19/20161019_154802.jpg,252.90,2016,10,19,15,53,4
281222,374.40,12.80,55.72,1016.0,0.60,290.6,1.3,0.0,2015/03/05/20150305_163459.jpg,358.00,2015,3,5,16,40,2
40098,1.96,8.52,64.00,1005.0,2.08,40.8,3.2,0.0,2014/03/19/20140319_141400.jpg,5.54,2014,3,19,14,19,2


### Data Exploration:

In [71]:
# Build a dtype/first-value preview and summary statistics for every table.
_model_tables = (df_5_min, df_10_min, df_15_min, df_20_min, df_25_min, df_30_min)
previews = [preview_df(frame) for frame in _model_tables]
described = [frame.describe() for frame in _model_tables]

In [73]:
# Summary statistics of the sixth table in the list (df_30_min).
described[5]

Unnamed: 0,Y,air_temp,relhum,press,windsp,winddir,max_windsp,precipitation,30_min_before_i,year,month,hour,season
count,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0
mean,412.049214,21.337232,44.304767,1003.443749,1.567333,216.414405,2.51834,0.002666,412.803596,2015.023526,6.50635,14.907275,2.57509
std,296.527848,8.384509,21.283695,4.969679,0.890709,76.316673,1.312563,0.035558,295.533668,0.810692,3.215236,7.712866,1.052862
min,0.0,-2.9,4.86,983.0,0.0,0.0,0.0,0.0,0.0,2014.0,1.0,0.0,1.0
25%,142.2,14.7,26.94,1000.0,0.94,153.4,1.6,0.0,142.3,2014.0,4.0,14.0,2.0
50%,381.5,20.58,40.48,1003.0,1.42,228.5,2.3,0.0,381.5,2015.0,7.0,17.0,3.0
75%,658.3,27.84,59.64,1007.0,2.0,280.7,3.2,0.0,658.3,2016.0,9.0,20.0,3.0
max,1466.0,42.78,94.0,1021.0,9.3,360.0,13.5,4.77,1466.0,2016.0,12.0,23.0,4.0


In [81]:
# Display the 5-minute table without its first column.
df_5_min[df_5_min.columns[1:]]

Unnamed: 0,timestamp,5_min_before,air_temp,relhum,press,windsp,winddir,max_windsp,precipitation,file,5_min_before_i,year,month,hour,season
0,2014-01-02 15:33:00,2014-01-02 15:28:00,2.80,75.06,1010.0,2.00,199.6,2.6,0.0,2014/01/02/20140102_152808.jpg,4.87,2014,1,15,1
1,2014-01-02 15:34:00,2014-01-02 15:29:00,2.70,75.50,1010.0,1.74,190.4,2.4,0.0,2014/01/02/20140102_152907.jpg,5.59,2014,1,15,1
2,2014-01-02 15:35:00,2014-01-02 15:30:00,2.70,75.54,1010.0,1.78,193.6,2.3,0.0,2014/01/02/20140102_153008.jpg,1.23,2014,1,15,1
3,2014-01-02 15:36:00,2014-01-02 15:31:00,2.70,74.98,1010.0,1.72,192.2,2.1,0.0,2014/01/02/20140102_153108.jpg,1.62,2014,1,15,1
4,2014-01-02 15:37:00,2014-01-02 15:32:00,2.62,74.76,1010.0,1.66,188.2,2.4,0.0,2014/01/02/20140102_153208.jpg,2.04,2014,1,15,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764841,2016-12-31 00:48:00,2016-12-31 00:43:00,11.20,71.40,1001.0,2.12,136.8,3.1,0.0,2016/12/31/20161231_004300.jpg,6.14,2016,12,0,1
764842,2016-12-31 00:49:00,2016-12-31 00:44:00,11.20,71.58,1001.0,2.46,138.6,3.0,0.0,2016/12/31/20161231_004400.jpg,5.13,2016,12,0,1
764843,2016-12-31 00:50:00,2016-12-31 00:45:00,11.20,71.60,1001.0,2.02,133.4,2.9,0.0,2016/12/31/20161231_004500.jpg,4.22,2016,12,0,1
764844,2016-12-31 00:51:00,2016-12-31 00:46:00,11.20,71.64,1001.0,2.04,142.6,3.4,0.0,2016/12/31/20161231_004559.jpg,3.37,2016,12,0,1


In [106]:
# Scratch: collect the fourth- and third-from-last column names, then the
# last one.
cols = df_5_min.columns[-4:-2].to_list()
cols.append(df_5_min.columns[-1])
print(cols)

['year', 'month', 'season']


In [108]:
# Show the column labels of the 5-minute table.
df_5_min.columns

Index(['Y', 'timestamp', '5_min_before', 'air_temp', 'relhum', 'press',
       'windsp', 'winddir', 'max_windsp', 'precipitation', 'file',
       '5_min_before_i', 'year', 'month', 'hour', 'season'],
      dtype='object')

### Pre Processing and Scaling

First, let's save this data for future tests with different transformations:

In [381]:
# Persist the untransformed samples, keyed by "df_<N>_min".
save_pickle('../data_rp/sampled_raw_data.pkl',sampled_dfs)
# print (math.sin(math.pi/2))

Train-test split: following the usual 80/20 proportions (Pareto principle).

In [394]:
# Split every sampled table 80/20 into train and test sets. The first
# column is the target; 'file' is kept aside, unsplit, and is not a feature.
train_test_data = {}
for df_name, df in sampled_dfs.items():
    x_cols = df.columns[1:].to_list()
    x_cols.remove('file')
    X_train, X_test, y_train, y_test = train_test_split(
        df[x_cols], df[df.columns[0]], test_size=0.20, random_state=42)
    train_test_data[df_name] = {
        'file': df['file'],
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }
    

Below we pass our train-test data through the processing function to one-hot encode the categorical features, min-max scale the continuous features, and transform all time data using the sine function. All data other than time data will be between [0,1], with time data between [-1,1]. This transformation/scaling will help our model converge more quickly.

In [399]:
# Scale and encode the features of every horizon, storing the processed
# matrices next to the raw splits.
for t in range(5, 35, 5):
    df_name = f'df_{t}_min'
    split = train_test_data[df_name]
    train = split['X_train']
    test = split['X_test']
    x_train, x_test = process_timeahead_attributes(t, train, test, time_to_period)
    split['X_train_p'] = x_train
    split['X_test_p'] = x_test
    print(df_name)

df_5_min
df_10_min
df_15_min
df_20_min
df_25_min
df_30_min


In [406]:
# Persist the raw and processed train/test splits for all horizons.
save_pickle('../data_rp/model_data_dict.pkl',train_test_data)

In [None]:
# df_5_min.describe()
# df_10_min.describe()
# df_15_min.describe()
# df_20_min.describe()
# df_25_min.describe()
# df_30_min.describe()

# sns.distplot(df_5_min['5_min_before_i']);
# sns.distplot(df_5_min['Y']);
# sns.distplot(df_5_min['air_temp']);
# g = sns.PairGrid(df_5_min, height=3.5)
# g.map(sns.scatterplot)

# corr = df_5_min.corr()
# plt.figure(figsize=(30,20))
# mask = np.zeros_like(corr)
# mask[np.triu_indices_from(mask)] = True
# with sns.axes_style("white"):
#     ax = sns.heatmap(corr,mask=mask,center=0,cmap="coolwarm",annot=True,linewidths=.5)

# corr = df_30_min.corr()
# plt.figure(figsize=(30,20))
# mask = np.zeros_like(corr)
# mask[np.triu_indices_from(mask)] = True
# with sns.axes_style("white"):
#     ax = sns.heatmap(corr,mask=mask,center=0,cmap="coolwarm",annot=True,linewidths=.5)