# Data Cleaning, Preprocessing, and Feature Engineering

In this notebook we read in our data, match each record to its sky image, and add the image file path to the DataFrame.
After joining the data, we join each irradiance reading to the weather, irradiance, and sky image recorded one forecast interval (5 to 30 minutes) earlier.

All data was downloaded from __[here](https://zenodo.org/record/2826939#.YEPKXi1h1pS)__. Thanks so much to the University of California San Diego team (Carreira Pedro, Hugo; Larson, David; Coimbra, Carlos), who worked so hard on collecting this data and on supporting the work of others in this space.

#### Below we will:
1. [Create dataframes](#Create-DataFrames-for-Models)
    - create filepaths for images
    - get time intervals and cross join for earlier irradiance data
    - merge weather and irradiance data
    
  
2. [Explore our data](#Data-Exploration:)
3. [Preprocess/scale our data](#Pre-Processing-and-Scaling:)


Import needed libraries:

In [77]:
import pandas as pd
import bz2
from datetime import datetime,timedelta
import tarfile
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# import sklearn

Functions needed for all processes in notebook:

In [126]:
def breakdown_dates(df,column):
    """Add 'year', 'month', 'day' and 'hour' columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is mutated, nothing is returned.
    column : str
        Name of the column holding datetimes (or values that
        ``pd.DatetimeIndex`` can parse).
    """
    # Parse the column once instead of once per derived field.
    stamps = pd.DatetimeIndex(df[column])
    for part in ('year', 'month', 'day', 'hour'):
        df[part] = getattr(stamps, part)

# Season code -> month numbers belonging to that season.
seasons = {
    2: [3, 4, 5],
    3: [6, 7, 8],
    4: [9, 10, 11],
    1: [12, 1, 2],
}


def replace_m_w_season(row):
    """Return the season code whose month list contains ``row``.

    ``row`` is a month number; ``None`` is returned when no season lists it.
    """
    matches = (code for code, months in seasons.items() if row in months)
    return next(matches, None)

def datetime_blank_min_before(df,dt):
    """Add a '<dt>_min_before' column holding ``df['timestamp']`` minus ``dt`` minutes.

    The frame is modified in place and nothing is returned.
    """
    shifted_name = f'{dt}_min_before'
    offset = timedelta(minutes=dt)
    stamps = pd.DatetimeIndex(df['timestamp'])
    df[shifted_name] = stamps - offset

def numberOfDays(y, m):
    """Return the number of days in month ``m`` of year ``y``.

    February follows the Gregorian leap-year rule. Any month number that is
    not February and not one of the 31-day months yields 30.
    """
    if m == 2:
        is_leap = (y % 4 == 0 and y % 100 != 0) or y % 400 == 0
        return 29 if is_leap else 28
    # A tuple constant avoids shadowing the built-in ``list``.
    if m in (1, 3, 5, 7, 8, 10, 12):
        return 31
    return 30

def make_image_path(row):
    """Return both image-time bounds for ``row`` as a dict.

    Keys are 'higher_file_path' and 'lower_file_path'; the values come from
    ``make_higher_image_path`` and ``make_lower_image_path`` respectively.
    """
    return {
        'higher_file_path': make_higher_image_path(row),
        'lower_file_path': make_lower_image_path(row),
    }

def get_all_file_names(li_file_names,file_name_dict):
    """Group archive member names into ``file_name_dict`` by name length.

    Names are classified purely by their length:

    * 5-7 characters   -> month folder, becomes a key mapping to a dict
    * 8-10 characters  -> day folder, becomes a key (under the current
      month) mapping to a list
    * over 10 characters -> file, appended to the current day's list

    Shorter names and anything containing '.DS_Store' are ignored.
    ``file_name_dict`` is filled in place; nothing is returned.

    If a file shows up before its month/day folders were seen, the name is
    printed and processing stops.
    """
    for name in li_file_names:
        if '.DS_Store' in name:
            continue
        name_len = len(name)
        if 4 < name_len <= 7:
            yrmn = name
            file_name_dict[name] = {}
        elif 7 < name_len <= 10:
            yrmnd = name
            file_name_dict[yrmn][name] = []
        elif name_len > 10:
            try:
                file_name_dict[yrmn][yrmnd].append(name)
            # KeyError: the day folder is not registered under the current month.
            # NameError: no month/day folder has been seen yet.
            # (Previously a bare ``except:`` that also hid unrelated errors.)
            except (KeyError, NameError):
                print(name)
                break


# 2014/12/29/20141229_170300.jpg
def make_higher_image_path(row):
    """Return the latest capture time accepted for the row's image.

    The result is the row's year/month/day/hour/min as a ``datetime`` with
    the seconds fixed at 46.
    """
    parts = ("month", "day", "hour", "min", "year")
    mn, day, hour, mi, yr = (int(row[part]) for part in parts)
    return datetime(yr, mn, day, hour, mi, 46)

def make_lower_image_path(row):
    """Return the earliest capture time accepted for the row's image.

    Normally this is second 45 of the minute *before* the row's
    year/month/day/hour/min, i.e. 15 seconds before the start of the row's
    minute. Hour, day, month and year rollovers (including leap years) are
    handled by ``datetime`` arithmetic.

    Special case: at Jan 1 00:00 of a year whose previous year is not one of
    2014, 2015 or 2016, the bound is the row's own minute at second 0. The
    original code intended this but crashed on a misspelled variable
    (``yr_1`` instead of ``yr_l``).
    """
    yr, mn, day, hour, mi = (
        int(row[part]) for part in ("year", "month", "day", "hour", "min")
    )
    minute_start = datetime(yr, mn, day, hour, mi)

    starts_year = (mn, day, hour, mi) == (1, 1, 0, 0)
    if starts_year and (yr - 1) not in (2014, 2015, 2016):
        return minute_start

    # 15 s before the minute starts == previous minute at second 45.
    return minute_start - timedelta(seconds=15)


def get_correct_file(row,file_dict):
    """Find the image file whose capture time falls inside the row's bounds.

    Parameters
    ----------
    row : mapping
        Must provide 'year', 'month', 'day', 'higher_file' and 'lower_file';
        the last two are the inclusive datetime bounds.
    file_dict : dict
        Nested mapping ``str(year) -> "YYYY/MM" -> "YYYY/MM/DD" -> [paths]``.

    Returns
    -------
    The first path in the day's list whose timestamp (parsed from fixed
    character positions in the path) lies between the bounds; ``None`` when
    the day exists but nothing matches; ``0`` when the year, month or day is
    absent from ``file_dict``.
    """
    year = str(row['year'])
    yrmn = f"{year}/{row['month']:02}"
    date = f"{yrmn}/{row['day']:02}"
    higher, lower = row["higher_file"], row["lower_file"]

    # A missing year or month is treated like a missing day instead of raising KeyError.
    day_files = file_dict.get(year, {}).get(yrmn, {}).get(date)
    if day_files is None:
        return 0

    for file in day_files:
        # Paths look like 2014/12/29/20141229_170300.jpg, so the date sits at
        # [0:10] and HHMMSS at [20:26].
        dt_f = datetime(int(file[:4]), int(file[5:7]), int(file[8:10]),
                        int(file[20:22]), int(file[22:24]), int(file[24:26]))
        if lower <= dt_f <= higher:
            return file
    return None
    

def save_pickle(file_name,obj):
    """Pickle ``obj`` to the binary file at ``file_name``, overwriting it."""
    with open(file_name, mode='wb') as sink:
        pickle.dump(obj, sink)

def open_pickle(file_name):
    """Return the object unpickled from the binary file at ``file_name``."""
    with open(file_name, mode='rb') as source:
        return pickle.load(source)

def update_df_for_model(df,column):
    """Select and rename the columns of a merged frame for modelling.

    Keeps 'ghi_x', 'timestamp_x', ``column``, the weather columns, 'file'
    and 'ghi_y', in that order, then renames 'ghi_x' -> 'Y',
    'timestamp_x' -> 'timestamp' and 'ghi_y' -> ``column + '_i'``.
    """
    keep = ['ghi_x', 'timestamp_x', column,
            'air_temp', 'relhum', 'press', 'windsp',
            'winddir', 'max_windsp', 'precipitation',
            'file', 'ghi_y']
    renames = {
        'timestamp_x': 'timestamp',
        'ghi_x': 'Y',
        'ghi_y': column + '_i',
    }
    return df[keep].rename(columns=renames)

def preview_df(df):
    """Return a per-column summary of ``df``.

    The result has one row per column with 'name', 'dtypes' and
    'first value' (the values of the row labelled 0, so ``df`` must have an
    index label 0).
    """
    summary = pd.DataFrame(df.dtypes, columns=['dtypes']).reset_index()
    summary['name'] = summary['index']
    summary = summary[['name', 'dtypes']]
    summary['first value'] = df.loc[0].values

    column_names = pd.DataFrame(df.columns).rename(columns={0: "name"})
    return summary.merge(column_names, on='name', how='left')

def process_timeahead_attributes(df_name,train,test):
    """Scale continuous features and one-hot encode the categorical one.

    Parameters
    ----------
    df_name : any
        Unused; kept so existing callers keep working.
    train, test : pandas.DataFrame
        Feature frames with identical column layout. The LAST column is
        treated as categorical; columns ``[1:-2]`` plus 'hour' are treated as
        continuous (the first column is skipped).

    Returns
    -------
    (trainX, testX) : tuple of numpy.ndarray
        Each is the one-hot block followed by the min-max scaled block.
    """
    cat_col = train.columns[-1]
    con_cols = train.columns[1:-2].to_list()
    con_cols.append('hour')

    # Min-max scale each continuous column to [0, 1]; fit on train only.
    cs = MinMaxScaler()
    trainContinuous = cs.fit_transform(train[con_cols])
    testContinuous = cs.transform(test[con_cols])

    print(cat_col)
    # One-hot encode using the TRAIN categories for both splits. Encoding the
    # two splits independently misaligns columns whenever the test split is
    # missing a category (or has an extra one).
    train_dummies = pd.get_dummies(train[cat_col])
    kept_levels = train_dummies.columns[1:]  # same effect as drop_first=True
    trainCategorical = train_dummies[kept_levels].to_numpy(dtype=float)
    testCategorical = (pd.get_dummies(test[cat_col])
                       .reindex(columns=kept_levels, fill_value=0)
                       .to_numpy(dtype=float))

    # Categorical block first, then the continuous block.
    trainX = np.hstack([trainCategorical, trainContinuous])
    testX = np.hstack([testCategorical, testContinuous])
    return (trainX, testX)

# TODO: look into whether some image timestamps were saved incorrectly. If they are
# off, rows could be matched to the closest file by second instead -- possibly with
# an upper and a lower time bound within the minute.
    

### Create DataFrames for Models


Open the image files to get image names:


In [None]:
# Index every yearly sky-image archive:
#   str(year) -> "YYYY/MM" -> "YYYY/MM/DD" -> [image paths]
new_img_file_names = {}
for yr in [2014,2015,2016]:
    f_name = f'data/Folsom_sky_images_{yr}.tar.bz2'
    print(f_name)
    # Only the member names are needed; close the archive as soon as we have them.
    with tarfile.open(f_name, "r") as tar:
        tar_members_names = tar.getnames()
    # get_correct_file looks years up as strings, so key by str(yr).
    year_key = str(yr)
    new_img_file_names[year_key] = {}
    get_all_file_names(tar_members_names,new_img_file_names[year_key])


In [None]:
# One-off saves of the expensive intermediate results (kept for reference):
# save_pickle('image_file_names.pkl',new_img_file_names)
# save_pickle('data/df_solar_and_img_data.pkl',df_merge_1)
# Reload the cached image-name index and merged frame instead of rebuilding them.
img_file_names = open_pickle('data/image_file_names.pkl')
df_merge_1 = open_pickle('data/df_solar_and_img_data.pkl')

Read in weather and irradiance data:

In [None]:
# Load the four Folsom CSVs; all but the satellite file use their first column as the index.
fol_irr = pd.read_csv('data/Folsom_irradiance.csv',index_col=0)
fol_sat = pd.read_csv('data/Folsom_satellite.csv')
fol_sky_img = pd.read_csv('data/Folsom_sky_image_features.csv',index_col=0)
fol_weather = pd.read_csv('data/Folsom_weather.csv',index_col=0)

Get datetime for each interval:

In [None]:
# fol_sat.columns
# Add one "<N>_min_before" timestamp column to fol_irr per horizon (5, 10, ..., 30).
for minutes_back in range(5, 35, 5):
    datetime_blank_min_before(fol_irr, minutes_back)

In [None]:
# Parse weather timestamps so they can be matched against the "<N>_min_before" datetimes.
fol_weather['timestamp'] = pd.to_datetime(fol_weather['timestamp'])

Merge the irradiance DataFrame with the Folsom weather DataFrame on the timestamp one interval earlier:

In [None]:
# Left-join irradiance rows to the weather observed N minutes earlier, one frame per horizon.
(df_5_min_ahead, df_10_min_ahead, df_15_min_ahead,
 df_20_min_ahead, df_25_min_ahead, df_30_min_ahead) = [
    pd.merge(fol_irr, fol_weather, how="left",
             left_on=f"{minutes_back}_min_before", right_on="timestamp")
    for minutes_back in (5, 10, 15, 20, 25, 30)
]

In [None]:
# Irradiance and weather columns kept for every horizon.
_MEASUREMENT_COLUMNS = ['ghi', 'dni', 'dhi',
                        'air_temp', 'relhum', 'press', 'windsp', 'winddir', 'max_windsp',
                        'precipitation']


def _trim_merged(merged, minutes_back):
    """Drop rows with NaNs, keep the modelling columns, restore the 'timestamp' name."""
    wanted = ['timestamp_x', f'{minutes_back}_min_before'] + _MEASUREMENT_COLUMNS
    return merged.dropna()[wanted].rename(columns={'timestamp_x':'timestamp'})


df_5_min_ahead = _trim_merged(df_5_min_ahead, 5)
df_10_min_ahead = _trim_merged(df_10_min_ahead, 10)
df_15_min_ahead = _trim_merged(df_15_min_ahead, 15)
df_20_min_ahead = _trim_merged(df_20_min_ahead, 20)
df_25_min_ahead = _trim_merged(df_25_min_ahead, 25)
df_30_min_ahead = _trim_merged(df_30_min_ahead, 30)

In [None]:
# Split each frame's "<N>_min_before" column (always the second column) into date parts.
for table in (df_5_min_ahead, df_10_min_ahead, df_15_min_ahead,
              df_20_min_ahead, df_25_min_ahead, df_30_min_ahead):
    interval_col = table.columns[1]
    breakdown_dates(table, interval_col)

Using the functions above, compute the earliest and latest capture times between which a matching image file must fall:

In [None]:
for table in (df_5_min_ahead, df_10_min_ahead, df_15_min_ahead,
              df_20_min_ahead, df_25_min_ahead, df_30_min_ahead):
    interval_col = table.columns[1]
    print(interval_col)
    # The image-path helpers read row["min"], but breakdown_dates only creates
    # year/month/day/hour, so derive the minute from the same interval column here.
    table['min'] = pd.DatetimeIndex(table[interval_col]).minute
    # Inclusive upper / lower capture-time bounds for the matching sky image.
    table['higher_file'] = table.apply(make_higher_image_path, axis=1)
    table['lower_file'] = table.apply(make_lower_image_path, axis=1)

Get the correct image file that falls between the two columns above:

In [None]:
# Look up, row by row, the image whose capture time lies between the two bounds.
for table in (df_5_min_ahead, df_10_min_ahead, df_15_min_ahead,
              df_20_min_ahead, df_25_min_ahead, df_30_min_ahead):
    print(table.columns[1])
    table['file'] = table.apply(get_correct_file, axis=1, args=(img_file_names,))

Keep only the instances for which we found an image file:

In [None]:
def _rows_with_image(table):
    """Keep rows whose 'file' is neither missing nor the 0 placeholder."""
    has_file = table.file.notnull() & (table.file != 0)
    return table[has_file]


df_5_min_ahead_w_img = _rows_with_image(df_5_min_ahead)
df_10_min_ahead_w_img = _rows_with_image(df_10_min_ahead)
df_15_min_ahead_w_img = _rows_with_image(df_15_min_ahead)
df_20_min_ahead_w_img = _rows_with_image(df_20_min_ahead)
df_25_min_ahead_w_img = _rows_with_image(df_25_min_ahead)
df_30_min_ahead_w_img = _rows_with_image(df_30_min_ahead)

In [3]:
# One-off saves of the image-matched frames (kept for reference):
# save_pickle("df_5_min_ahead_data.pkl",df_5_min_ahead_w_img)
# save_pickle("df_10_min_ahead_data.pkl",df_10_min_ahead_w_img)
# save_pickle("df_15_min_ahead_data.pkl",df_15_min_ahead_w_img)
# save_pickle("df_20_min_ahead_data.pkl",df_20_min_ahead_w_img)
# save_pickle("df_25_min_ahead_data.pkl",df_25_min_ahead_w_img)
# save_pickle("df_30_min_ahead_data.pkl",df_30_min_ahead_w_img)
# Reload the cached frames, one per horizon.
(df_5_min_ahead_w_img, df_10_min_ahead_w_img, df_15_min_ahead_w_img,
 df_20_min_ahead_w_img, df_25_min_ahead_w_img, df_30_min_ahead_w_img) = [
    open_pickle(f"df_{minutes_back}_min_ahead_data.pkl")
    for minutes_back in (5, 10, 15, 20, 25, 30)
]

In [4]:
def _model_view(frame, minutes_back):
    """Keep only the target, interval timestamp, weather and file columns."""
    return frame[['timestamp', f'{minutes_back}_min_before', 'ghi',
                  'air_temp', 'relhum', 'press', 'windsp', 'winddir',
                  'max_windsp', 'precipitation', 'file']]


df_5_min_ahead_w_img = _model_view(df_5_min_ahead_w_img, 5)
df_10_min_ahead_w_img = _model_view(df_10_min_ahead_w_img, 10)
df_15_min_ahead_w_img = _model_view(df_15_min_ahead_w_img, 15)
df_20_min_ahead_w_img = _model_view(df_20_min_ahead_w_img, 20)
df_25_min_ahead_w_img = _model_view(df_25_min_ahead_w_img, 25)
df_30_min_ahead_w_img = _model_view(df_30_min_ahead_w_img, 30)

Update datetime format on all DFs before we join to get irradiance from earlier timestamps:

In [6]:
# Convert 'timestamp' to datetimes in each frame.
# NOTE(review): these frames were produced by column selection, so this assignment
# may trigger pandas' SettingWithCopy warning (warnings are silenced at the top).
for df in [df_5_min_ahead_w_img,df_10_min_ahead_w_img,df_15_min_ahead_w_img,
           df_20_min_ahead_w_img,df_25_min_ahead_w_img,df_30_min_ahead_w_img]:
    
    df['timestamp'] = pd.to_datetime(df['timestamp'])

Now join tables, to match on their time ahead intervals. This will give us the irradiance at the time interval before.

In [12]:
# Left-join each horizon frame back onto fol_irr at the earlier timestamp; overlapping
# column names (e.g. ghi, timestamp) receive pandas' _x / _y suffixes.
(df_5_min, df_10_min, df_15_min, df_20_min, df_25_min, df_30_min) = [
    ahead.merge(fol_irr, how="left",
                left_on=f"{minutes_back}_min_before", right_on="timestamp")
    for minutes_back, ahead in (
        (5, df_5_min_ahead_w_img),
        (10, df_10_min_ahead_w_img),
        (15, df_15_min_ahead_w_img),
        (20, df_20_min_ahead_w_img),
        (25, df_25_min_ahead_w_img),
        (30, df_30_min_ahead_w_img),
    )
]

In [17]:
# df_20_min.head(21)

Use the function to keep and rename only the columns the models need:

In [18]:
# Reduce every merged frame to the modelling columns (target Y, features, earlier GHI).
(df_5_min, df_10_min, df_15_min, df_20_min, df_25_min, df_30_min) = [
    update_df_for_model(merged, f"{minutes_back}_min_before")
    for minutes_back, merged in (
        (5, df_5_min), (10, df_10_min), (15, df_15_min),
        (20, df_20_min), (25, df_25_min), (30, df_30_min),
    )
]

In [26]:
# Persist each horizon's modelling frame under ../data_rp/.
for minutes_back, frame in ((5, df_5_min), (10, df_10_min), (15, df_15_min),
                            (20, df_20_min), (25, df_25_min), (30, df_30_min)):
    save_pickle(f"../data_rp/df_{minutes_back}_min_data.pkl", frame)

In [53]:
# Derive the season code (see `seasons`) from each row's month.
for df in (df_5_min, df_10_min, df_15_min, df_20_min, df_25_min, df_30_min):
#     breakdown_dates(df,'timestamp')
    df['season'] = df['month'].apply(replace_m_w_season)

In [56]:
# Inspect the column layout after adding 'season'.
df_5_min.columns

Index(['Y', 'timestamp', '5_min_before', 'air_temp', 'relhum', 'press',
       'windsp', 'winddir', 'max_windsp', 'precipitation', 'file',
       '5_min_before_i', 'year', 'month', 'day', 'hour', 'min', 'sec',
       'season'],
      dtype='object')

In [64]:
# Remove the date parts that are not used as features.
for df in (df_5_min, df_10_min, df_15_min, df_20_min, df_25_min, df_30_min):
    df.drop(columns=['day', 'min', 'sec'], inplace=True)

### Data Exploration:

In [71]:
# Per-horizon column previews and summary statistics, in horizon order (5 -> 30 min).
_horizon_frames = (df_5_min, df_10_min, df_15_min, df_20_min, df_25_min, df_30_min)
previews = [preview_df(frame) for frame in _horizon_frames]
described = [frame.describe() for frame in _horizon_frames]

In [73]:
# Summary statistics of the last frame in the list (the 30-minute horizon).
described[5]

Unnamed: 0,Y,air_temp,relhum,press,windsp,winddir,max_windsp,precipitation,30_min_before_i,year,month,hour,season
count,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0,764846.0
mean,412.049214,21.337232,44.304767,1003.443749,1.567333,216.414405,2.51834,0.002666,412.803596,2015.023526,6.50635,14.907275,2.57509
std,296.527848,8.384509,21.283695,4.969679,0.890709,76.316673,1.312563,0.035558,295.533668,0.810692,3.215236,7.712866,1.052862
min,0.0,-2.9,4.86,983.0,0.0,0.0,0.0,0.0,0.0,2014.0,1.0,0.0,1.0
25%,142.2,14.7,26.94,1000.0,0.94,153.4,1.6,0.0,142.3,2014.0,4.0,14.0,2.0
50%,381.5,20.58,40.48,1003.0,1.42,228.5,2.3,0.0,381.5,2015.0,7.0,17.0,3.0
75%,658.3,27.84,59.64,1007.0,2.0,280.7,3.2,0.0,658.3,2016.0,9.0,20.0,3.0
max,1466.0,42.78,94.0,1021.0,9.3,360.0,13.5,4.77,1466.0,2016.0,12.0,23.0,4.0


In [81]:
# Show every column except the first one (the target 'Y').
df_5_min[df_5_min.columns[1:]]

Unnamed: 0,timestamp,5_min_before,air_temp,relhum,press,windsp,winddir,max_windsp,precipitation,file,5_min_before_i,year,month,hour,season
0,2014-01-02 15:33:00,2014-01-02 15:28:00,2.80,75.06,1010.0,2.00,199.6,2.6,0.0,2014/01/02/20140102_152808.jpg,4.87,2014,1,15,1
1,2014-01-02 15:34:00,2014-01-02 15:29:00,2.70,75.50,1010.0,1.74,190.4,2.4,0.0,2014/01/02/20140102_152907.jpg,5.59,2014,1,15,1
2,2014-01-02 15:35:00,2014-01-02 15:30:00,2.70,75.54,1010.0,1.78,193.6,2.3,0.0,2014/01/02/20140102_153008.jpg,1.23,2014,1,15,1
3,2014-01-02 15:36:00,2014-01-02 15:31:00,2.70,74.98,1010.0,1.72,192.2,2.1,0.0,2014/01/02/20140102_153108.jpg,1.62,2014,1,15,1
4,2014-01-02 15:37:00,2014-01-02 15:32:00,2.62,74.76,1010.0,1.66,188.2,2.4,0.0,2014/01/02/20140102_153208.jpg,2.04,2014,1,15,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764841,2016-12-31 00:48:00,2016-12-31 00:43:00,11.20,71.40,1001.0,2.12,136.8,3.1,0.0,2016/12/31/20161231_004300.jpg,6.14,2016,12,0,1
764842,2016-12-31 00:49:00,2016-12-31 00:44:00,11.20,71.58,1001.0,2.46,138.6,3.0,0.0,2016/12/31/20161231_004400.jpg,5.13,2016,12,0,1
764843,2016-12-31 00:50:00,2016-12-31 00:45:00,11.20,71.60,1001.0,2.02,133.4,2.9,0.0,2016/12/31/20161231_004500.jpg,4.22,2016,12,0,1
764844,2016-12-31 00:51:00,2016-12-31 00:46:00,11.20,71.64,1001.0,2.04,142.6,3.4,0.0,2016/12/31/20161231_004559.jpg,3.37,2016,12,0,1


In [106]:
# Fourth- and third-from-last column names plus the very last one.
cols = [*df_5_min.columns[-4:-2], df_5_min.columns[-1]]
print(cols)

['year', 'month', 'season']


In [108]:
# Column layout after dropping 'day', 'min' and 'sec'.
df_5_min.columns

Index(['Y', 'timestamp', '5_min_before', 'air_temp', 'relhum', 'press',
       'windsp', 'winddir', 'max_windsp', 'precipitation', 'file',
       '5_min_before_i', 'year', 'month', 'hour', 'season'],
      dtype='object')

### Pre-Processing and Scaling:

Train-test split: an 80/20 split, following the usual (Pareto-style) proportions.

In [122]:
# Build an 80/20 train/test split per horizon, keyed 'df_<N>_min'.
train_test_data = {}
_frames_by_horizon = zip(
    range(5, 35, 5),
    [df_5_min, df_10_min, df_15_min, df_20_min, df_25_min, df_30_min],
)
for t, df in _frames_by_horizon:
    df_name = f'df_{t}_min'
    rm_col = f"{t}_min_before"

    # Features: everything after the target column, minus the image path and
    # the interval timestamp.
    x_cols = df.columns[1:].to_list()
    x_cols.remove('file')
    x_cols.remove(rm_col)

    X_train, X_test, y_train, y_test = train_test_split(
        df[x_cols], df[df.columns[0]], test_size=0.20, random_state=42)
    train_test_data[df_name] = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }
    

['timestamp', 'air_temp', 'relhum', 'press', 'windsp', 'winddir', 'max_windsp', 'precipitation', '5_min_before_i', 'year', 'month', 'hour', 'season']
['timestamp', 'air_temp', 'relhum', 'press', 'windsp', 'winddir', 'max_windsp', 'precipitation', '10_min_before_i', 'year', 'month', 'hour', 'season']
['timestamp', 'air_temp', 'relhum', 'press', 'windsp', 'winddir', 'max_windsp', 'precipitation', '15_min_before_i', 'year', 'month', 'hour', 'season']
['timestamp', 'air_temp', 'relhum', 'press', 'windsp', 'winddir', 'max_windsp', 'precipitation', '20_min_before_i', 'year', 'month', 'hour', 'season']
['timestamp', 'air_temp', 'relhum', 'press', 'windsp', 'winddir', 'max_windsp', 'precipitation', '25_min_before_i', 'year', 'month', 'hour', 'season']
['timestamp', 'air_temp', 'relhum', 'press', 'windsp', 'winddir', 'max_windsp', 'precipitation', '30_min_before_i', 'year', 'month', 'hour', 'season']


In [87]:
# train_test_data['df_5_min']['X_test']

In [None]:
# TODO: `update_train_and_test` was left as an unfinished stub
# (`def update_train_and_test =`), which is a SyntaxError. It is commented out here;
# the per-horizon processing is done inline in the following cell.

In [127]:
# Scale / encode the features of every horizon and store them next to the raw splits.
for t in range(5, 35, 5):
    df_name = f'df_{t}_min'
    split = train_test_data[df_name]
    x_train, x_test = process_timeahead_attributes(t, split['X_train'], split['X_test'])
    split['X_train_p'] = x_train
    split['X_test_p'] = x_test

season
season
season
season
season
season


In [124]:
# pd.DataFrame(train_test_data['df_5_min']['X_train_p'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,0.0,0.567426,0.291900,0.526316,0.225806,0.689444,0.303704,0.000000,0.655525,0.0,0.363636,0.826087
1,0.0,0.0,1.0,0.362522,0.814225,0.578947,0.036559,0.200167,0.059259,0.000000,0.110505,0.0,0.818182,0.652174
2,1.0,0.0,0.0,0.427758,0.780121,0.263158,0.563441,0.414444,0.762963,0.006289,0.017347,1.0,0.181818,0.000000
3,0.0,1.0,0.0,0.727671,0.266323,0.394737,0.161290,0.695278,0.177778,0.000000,0.223874,0.0,0.545455,0.043478
4,0.0,0.0,0.0,0.490368,0.564057,0.631579,0.058065,0.578333,0.059259,0.000000,0.150341,0.5,0.090909,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611871,0.0,0.0,0.0,0.502627,0.371775,0.631579,0.058065,0.701111,0.111111,0.000000,0.324625,0.5,0.000000,0.956522
611872,0.0,1.0,0.0,0.747811,0.181063,0.473684,0.197849,0.555278,0.229630,0.000000,0.198363,0.5,0.454545,0.043478
611873,0.0,1.0,0.0,0.612960,0.460624,0.526316,0.075269,0.643889,0.103704,0.000000,0.004209,0.0,0.545455,0.130435
611874,0.0,1.0,0.0,0.380911,0.698676,0.500000,0.060215,0.395000,0.081481,0.000000,0.014379,1.0,0.636364,0.565217


In [125]:
# train_test_data['df_5_min']['X_train']

Unnamed: 0,timestamp,air_temp,relhum,press,windsp,winddir,max_windsp,precipitation,5_min_before_i,year,month,hour,season
75165,2014-05-03 19:48:00,23.02,30.88,1003.0,2.10,248.20,4.1,0.00,961.00,2014,5,19,2
197472,2014-10-16 15:44:00,13.66,77.44,1005.0,0.34,72.06,0.8,0.00,162.00,2014,10,15,4
543304,2016-03-06 00:32:00,16.64,74.40,993.0,5.24,149.20,10.3,0.03,25.43,2016,3,0,2
122329,2014-07-11 01:28:00,30.34,28.60,998.0,1.50,250.30,2.4,0.00,328.20,2014,7,1,3
262323,2015-02-04 00:04:00,19.50,55.14,1007.0,0.54,208.20,0.8,0.00,220.40,2015,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,2015-01-29 22:01:00,20.06,38.00,1007.0,0.54,252.40,1.5,0.00,475.90,2015,1,22,1
365838,2015-06-19 01:43:00,31.26,21.00,1001.0,1.84,199.90,3.1,0.00,290.80,2015,6,1,3
131932,2014-07-22 03:18:00,25.10,45.92,1003.0,0.70,231.80,1.4,0.00,6.17,2014,7,3,3
671155,2016-08-11 13:45:00,14.50,67.14,1002.0,0.56,142.20,1.1,0.00,21.08,2016,8,13,3


In [128]:
# train_test_data['df_5_min']['X_test']

Unnamed: 0,timestamp,air_temp,relhum,press,windsp,winddir,max_windsp,precipitation,5_min_before_i,year,month,hour,season
212504,2014-11-09 15:03:00,11.10,74.34,1004.0,1.46,55.0,2.1,0.0,12.83,2014,11,15,4
730167,2016-10-31 22:50:00,17.10,51.86,1004.0,0.30,128.9,0.5,0.0,163.50,2016,10,22,4
650333,2016-07-17 16:02:00,19.54,51.86,1000.0,1.46,144.2,3.2,0.0,514.10,2016,7,16,3
540543,2016-03-01 22:41:00,22.60,29.70,1007.0,1.66,175.0,2.8,0.0,568.50,2016,3,22,2
272258,2015-02-19 22:10:00,14.42,73.84,1008.0,1.50,300.2,2.0,0.0,262.70,2015,2,22,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127018,2014-07-16 16:55:00,23.30,51.42,1004.0,1.62,161.8,3.5,0.0,357.00,2014,7,16,3
162740,2014-08-28 23:07:00,35.80,19.30,1001.0,1.76,288.8,2.7,0.0,606.70,2014,8,23,3
666995,2016-08-06 02:46:00,27.50,32.10,998.0,0.66,163.6,1.8,0.0,48.22,2016,8,2,3
330566,2015-05-09 13:56:00,8.64,86.20,1007.0,1.08,122.5,1.4,0.0,94.60,2015,5,13,2


In [132]:
# Persist all raw and processed splits for the modelling notebooks.
save_pickle('model_data_dict.pkl',train_test_data)

In [None]:
# for i in img_file_names['2014']['2014/01']:
#     print(i,i.find('20140101'))
# img_file_names['2014']['2014/01']['2014/01/02']

# img_file_names['2014']['2014/01']['2014/01/02'][0][22:26]
# img_file_names['2014']['2014/01']['2014/01/02']

# tester_1 = df_merge_1.iloc[448:500]
# tester_1['file'] = tester_1.apply(lambda row: get_correct_file(row,img_file_names),axis=1)
# tester_1[['timestamp','file']]
# tester_1[tester_1.file.isnull()]['files'].iloc[0]

# df_merge_1.iloc[448:500].apply(lambda row: get_correct_file(row,img_file_names),axis=1)

In [None]:
# df_merge_1_w_imgs[df_merge_1_w_imgs['file'] == 0]
# 18720 - 16848
# c - df_merge_1_w_imgs.shape[0]
# 16848
# c

In [None]:
# NOTE(review): `c` is not defined in any visible cell; presumably a count of image
# files from an earlier scratch cell -- confirm before re-running.
c - 764855

Check which image files are not being captured by the script:

In [None]:
# Collect, per year / month / day, every image file that is NOT matched by
# exactly one row of df_merge_1_w_imgs.
skipped = {}

# Count each matched file once up front; the previous per-file
# `isin([...]).sum()` rescanned the whole frame for every image.
matched_counts = df_merge_1_w_imgs.file.value_counts()

for yr, months in img_file_names.items():
    skipped[yr] = {}
    for yrmn, days in months.items():
        skipped[yr][yrmn] = {}
        for yrmnday, files in days.items():
            skipped[yr][yrmn][yrmnday] = [
                path for path in files if matched_counts.get(path, 0) != 1
            ]

Group the skipped files by the seconds value in their filename:

In [None]:
# seconds string (characters 24-25 of the path) -> skipped image paths
seconds_skipped = {}

for months in skipped.values():
    for days in months.values():
        for files in days.values():
            for path in files:
                seconds_skipped.setdefault(path[24:26], []).append(path)

Group the skipped files by the hour value in their filename:

In [None]:
# hour string (characters 20-21 of the path) -> skipped image paths
hours_skipped = {}

for months in skipped.values():
    for days in months.values():
        for files in days.values():
            for path in files:
                hours_skipped.setdefault(path[20:22], []).append(path)

In [None]:
#get counts for the seconds below:
# seconds_skipped.keys()

# for i in ['42','41', '40', '44', '47', '46', '51', '50', '54', '55','59','56','58','53','57']:
#     print(i,len(seconds_skipped[i]))
    
#look into the groups, figure out why it's not being caught:
# seconds_skipped['42']

In [None]:
# skipped['2014']['2014/01'].keys()
# skipped['2014']['2014/01']['2014/01/02']
# '2014/01/02/20140102_004912.jpg'[24:26]
# df_merge_1[(df_merge_1['hour'] ==0)&(df_merge_1['day'] == 12)&(df_merge_1['year'] == 2014)&(df_merge_1['month'] == 11)][['timestamp','files','file']]
# Spot check: the 'files' value of the first row at hour 0 on 2014-11-12.
df_merge_1[(df_merge_1['hour'] ==0)&(df_merge_1['day'] == 12)&(df_merge_1['year'] == 2014)&(df_merge_1['month'] == 11)]['files'].iloc[0]


In [None]:
# c
# df_merge_1[(df_merge_1['hour'] ==19)&(df_merge_1['day'] == 14)&(df_merge_1['year'] == 2015)&(df_merge_1['month'] == 12)][['files']].iloc[0][0]

In [None]:
# NOTE(review): df_merge_1_w_imgs is not created in any visible cell -- confirm its origin.
df_merge_1_w_imgs[['timestamp','file']]

In [None]:
# Same display as the previous cell; the two counts noted below it were compared by hand.
df_merge_1_w_imgs[['timestamp','file']]

765159
vs.
279863

In [None]:
# df_5_min.describe()
# df_10_min.describe()
# df_15_min.describe()
# df_20_min.describe()
# df_25_min.describe()
# df_30_min.describe()

# sns.distplot(df_5_min['5_min_before_i']);
# sns.distplot(df_5_min['Y']);
# sns.distplot(df_5_min['air_temp']);
# g = sns.PairGrid(df_5_min, height=3.5)
# g.map(sns.scatterplot)

# corr = df_5_min.corr()
# plt.figure(figsize=(30,20))
# mask = np.zeros_like(corr)
# mask[np.triu_indices_from(mask)] = True
# with sns.axes_style("white"):
#     ax = sns.heatmap(corr,mask=mask,center=0,cmap="coolwarm",annot=True,linewidths=.5)

# corr = df_30_min.corr()
# plt.figure(figsize=(30,20))
# mask = np.zeros_like(corr)
# mask[np.triu_indices_from(mask)] = True
# with sns.axes_style("white"):
#     ax = sns.heatmap(corr,mask=mask,center=0,cmap="coolwarm",annot=True,linewidths=.5)

In [None]:
# Scratch check: left-join irradiance to the sky-image features on 'timestamp' and
# display the rows that have no missing values.
merge_test_2 = pd.merge(fol_irr, fol_sky_img,
                        how="left", on="timestamp")
merge_test_2.dropna()

In [None]:
# Scratch check: the same join, keyed on the individual date-part columns instead.
merge_test = pd.merge(fol_irr, fol_sky_img,
                        how="left", on=["year","month","day","hour","min"])

In [None]:
# Rows of the date-part join without any missing values.
merge_test.dropna()

In [None]:
# Full result of the date-part join, including unmatched rows.
merge_test

In [None]:
# First rows of the irradiance frame.
fol_irr.head()

In [None]:
# First rows of the satellite frame.
fol_sat.head()

In [None]:
# fol_sky_img.rename(columns={'timestamp':'timeStamp'},inplace=True)
# fol_sky_img.head()
# fol_sky_img[fol_sky_img['timeStamp']=='2014-01-02 08:00:00']
# fol_waether.iloc[0,0]