# Import Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import requests
import os
import pandas as pd
import seaborn as sns
import pickle
import random

import tensorflow as tf
from tensorflow import keras
from keras import metrics


from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Setting up files

In [None]:
# Root folder that contains the per-year image directories.
path = ''  # file path to folder where the images are

# Load one example image; only its colour scale strip is used below.
ex_im_path = os.path.join(path, '2007/Jan/2007_1_1.png')
im = Image.open(ex_im_path)
pix = np.array(im.getdata())
pix2 = pix.reshape((430, 400, 4))  # 430 rows x 400 columns x 4 channels

# find sea ice concentration values:
# crop rows 400-429 / columns 40-379 (presumably the colour legend) and show
# the crop so it can be checked by eye
pix4 = pix2[400:430, 40:380]
plt.imshow(pix4)
plt.show()

# Take row 15 of the crop, keep each distinct colour once (in order of first
# appearance) and discard the colour stored under row label 0.
df_sc_color = pd.DataFrame(data=pix4[15]).drop_duplicates().drop([0])

# Pair the remaining colours with 20, 25, ..., 100 percent.
percent_list = np.arange(20, 101, 5)
df_sc_color['percent'] = percent_list
df_sc_color = df_sc_color.reset_index(drop=True)

sic_color_mapping_np = df_sc_color.to_numpy()

### a folder named saved_data must already exist in the main folder
# save the colour -> percent reference table for the later cells
sic_path = os.path.join(path, 'saved_data/sic_mapping.npy')
np.save(sic_path, sic_color_mapping_np)

# Functions

In [None]:
# draw a reproducible random subset of dates
def generate_dates(dates_list, num_dates):
    """
    take a data frame of dates and the number of rows to draw
    return a data frame with num_dates of those rows, picked at random

    the seed is fixed (random_state = 0), so calling again with the same
    arguments returns the same rows; the original row labels are kept
    """
    return dates_list.sample(n=num_dates, random_state=0)


In [None]:
# dictionary for turning month numbers into the month folder names used on disk
months_dict = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}


def _read_cropped_png(file_path):
    """Read one image file and return its cropped RGB pixels, shape (338, 280, 3)."""
    # the context manager closes the file handle once the pixels are copied out
    with Image.open(file_path) as im:
        im_dat = np.array(im.convert('RGB').getdata())
    # getdata() is flat, one entry per pixel: rebuild 430 rows x 400 columns
    im_dat = np.reshape(im_dat, (430, 400, 3))
    # keep rows 2-339 and columns 50-329
    return im_dat[2:340, 50:330]


# call images from google folder and create list of input image data
def open_images(list_dates, miss_idx_list=None, missing_images=False):
    """
    take a data frame of dates and load one image array per row

    list_dates     : data frame with 'Year', 'Month' and 'Day' columns; rows
                     are processed in order
    miss_idx_list  : row positions (0-based, counted in iteration order, not
                     index labels) whose image is read from a saved
                     "<year>_<month>_<day>_im_dat.npy" file instead of a png;
                     only used when missing_images is True
    missing_images : False -> every row is read from its png and only the
                     image array is returned
                     True  -> rows listed in miss_idx_list come from .npy
                     files and a (images, sic_paths) tuple is returned

    files are looked up under <path>/<year>/<month name>/, where path is the
    module level folder variable

    return array of shape (number of rows, 338, 280, 3); with
    missing_images = True also a dict {row position: path of the matching
    "<year>_<month>_<day>_sic.npy" file} for the rows loaded from .npy
    """
    if missing_images and miss_idx_list is not None:
        # set gives constant-time membership tests inside the loop
        missing_positions = set(miss_idx_list)
    else:
        missing_positions = set()

    image_data = []
    sic_paths = {}

    # iterate the columns directly so duplicate index labels cannot break lookups
    rows = zip(list_dates['Year'], list_dates['Month'], list_dates['Day'])
    for count, (year, month_num, day) in enumerate(rows):
        folder = os.path.join(path, str(year), str(months_dict[month_num]))
        stem = f"{year}_{month_num}_{day}"

        if count in missing_positions:
            # the result of astype must be kept: the original call discarded
            # it, so the cast never happened (fractional values are truncated)
            im_dat = np.load(os.path.join(folder, f"{stem}_im_dat.npy")).astype(int)
            sic_paths[count] = os.path.join(folder, f"{stem}_sic.npy")
        else:
            im_dat = _read_cropped_png(os.path.join(folder, f"{stem}.png"))

        image_data.append(np.reshape(im_dat, (338, 280, 3)))

    image_data = np.reshape(image_data, (len(image_data), 338, 280, 3))

    if missing_images:
        return image_data, sic_paths
    return image_data

In [None]:
# load the saved reference table: one row per colour, percent in the last column
sic_color_mapping = np.load('207_data/saved_data/sic_mapping.npy')

# keep the first three columns (red, green, blue) plus the last column
# (percent); any column in between is dropped
sic_color_mapping = np.hstack([sic_color_mapping[:, :3], sic_color_mapping[:, -1:]])

sic_color_df = pd.DataFrame(sic_color_mapping, columns = ["Red", "Green", "Blue", "Percent"])

# nested list of the [red, green, blue] triples, in the same order as the table
list_mst = sic_color_df.loc[:, ['Red', 'Green', 'Blue']].to_numpy().tolist()

def _colors_to_percent(im, colors, percents):
    """Map each pixel of one image to a percentage; unmatched pixels become 0."""
    pixels = np.asarray(im)
    if len(colors) == 0:
        return np.zeros(pixels.shape[:2], dtype=int)
    # (rows, cols, n_colors): True where all channels equal a reference colour
    matches = (pixels[:, :, None, :] == colors[None, None, :, :]).all(axis=-1)
    # argmax picks the first matching reference colour, like a first-hit search
    first_hit = matches.argmax(axis=-1)
    return np.where(matches.any(axis=-1), percents[first_hit], 0)


# sea ice concentration image array
def sic_output(x_data, sic_paths=None, miss_idx_list=None, missing_images=False, color_map=None):
    """
    take image data with 3 channels and return sea ice concentration arrays

    x_data         : iterable of images, each of shape (338, 280, 3)
    sic_paths      : {image position: path of a saved .npy concentration
                     array}; read only for positions in miss_idx_list
    miss_idx_list  : 0-based positions in x_data whose concentration array is
                     loaded from sic_paths instead of being derived from the
                     pixels; only used when missing_images is True
    missing_images : False -> every image is converted from its pixel colours
    color_map      : optional data frame with 'Red', 'Green', 'Blue' and
                     'Percent' columns; when omitted the module level
                     list_mst / sic_color_df tables are used

    a pixel whose colour equals a reference colour gets that colour's
    percent value (first match wins); every other pixel gets 0

    return array of shape (number of images, 338, 280)
    """
    if color_map is None:
        colors = np.asarray(list_mst)
        percents = sic_color_df['Percent'].to_numpy()
    else:
        colors = color_map[['Red', 'Green', 'Blue']].to_numpy()
        percents = color_map['Percent'].to_numpy()

    if missing_images and miss_idx_list is not None:
        missing_positions = set(miss_idx_list)
    else:
        missing_positions = set()

    sic_images_arr = []
    # enumerate advances the position exactly once per image; the previous
    # hand-maintained counter was incremented once per pixel row, so positions
    # after the first png-derived image no longer matched miss_idx_list
    for count, im in enumerate(x_data):
        if count in missing_positions:
            sc_image = np.load(sic_paths[count])
        else:
            sc_image = _colors_to_percent(im, colors, percents)
        sic_images_arr.append(np.reshape(sc_image, (338, 280)))

    return np.reshape(sic_images_arr, (len(sic_images_arr), 338, 280))


# Data Augmentation 

In [None]:
list_im_date_path = '' # path to N_seaice_extent_daily_v3.0.csv
                    # my path is '207_data/saved_data/N_seaice_extent_daily_v3.0.csv'
                    # csv file in saved_data folder on google drive

# Read only the first three columns and label them Year / Month / Day.
# skiprows drops the first line of the file and header = 0 consumes the next
# one, so neither of the first two lines is treated as data.
dates_data = pd.read_csv(
    list_im_date_path,
    usecols=[0, 1, 2],
    header=0,
    skiprows=1,
    names=['Year', 'Month', 'Day'],
)

In [None]:
# create an all dates pandas data frame:
# one row per calendar day from 1978-01-01 to 2023-12-31, columns Year / Month / Day

import calendar

days_30 = np.arange(1, 31)
days_31 = np.arange(1, 32)
days_28 = np.arange(1, 29)
days_29 = np.arange(1, 30)

years = np.arange(1978, 2024)

months = np.arange(1, 13)

# day-number arrays keyed by the length of the month
days_by_length = {28: days_28, 29: days_29, 30: days_30, 31: days_31}

month_frames = []
for year in years:
    for month in months:
        # monthrange applies the full Gregorian leap-year rule, so the table
        # stays correct if the year range is ever extended past a century year
        n_days = calendar.monthrange(int(year), int(month))[1]
        month_frames.append(pd.DataFrame({
            'Year': np.repeat(year, n_days),
            'Month': np.repeat(month, n_days),
            'Day': days_by_length[n_days],
        }))

# a single concat keeps integer dtypes and avoids re-copying the growing
# frame on every month; each month keeps its own 0..n-1 row labels
all_dates = pd.concat(month_frames)

In [None]:
# keep positions 298 (1978/oct/26) up to, but not including, the last 74 rows,
# so the final kept row is 2023/oct/18; then renumber the rows from 0
all_dates = all_dates[298:-74].reset_index(drop=True)

# dates that appear both in the csv and in the calendar table
intersection = dates_data.merge(all_dates)

# stack the calendar on top of the shared dates and keep only rows that occur
# exactly once, i.e. calendar days that have no entry in the csv
missing_days = pd.concat([all_dates, intersection]).drop_duplicates(keep=False)

In [None]:
# flag every calendar day that has no row in the csv
missing_days['Missing'] = True

# un-flag the last 10 missing days with ONE positional assignment; the chained
# form missing_days['Missing'].iloc[-10:] = False writes to a temporary object
# when pandas copy-on-write is active and then silently changes nothing
# NOTE(review): the reason these 10 days are treated as not missing is not
# visible in this notebook - confirm
missing_days.iloc[-10:, missing_days.columns.get_loc('Missing')] = False

# attach the flag to the full calendar; days absent from missing_days get NaN
all_dates = all_dates.merge(missing_days, how = 'left', on = ['Year', 'Month', 'Day'])

# row positions in all_dates whose flag is True (NaN and False are skipped)
miss_idx_list = [i for i, flagged in enumerate(all_dates['Missing'] == True) if flagged]

In [None]:
# create x and y data for days that are missing:
# each missing day is replaced by the average of the day before and the day
# after, and the result is saved next to the png files so open_images /
# sic_output can load it later
# NOTE(review): the last 51 missing days are skipped; the reason is not
# visible here - confirm. Both neighbouring days must exist as png files.

for i in missing_days[:-51].index:

    m_year = missing_days['Year'][i]
    m_month = missing_days['Month'][i]
    m_day = missing_days['Day'][i]

    # position of the missing day inside the full calendar
    get_idx = all_dates[(all_dates['Year'] == m_year) & (all_dates['Month']== m_month) & (all_dates['Day']== m_day)].index[0]
    previous_day = all_dates.iloc[get_idx - 1].to_frame().T
    next_day = all_dates.iloc[get_idx + 1].to_frame().T
    input_days = pd.concat([previous_day, next_day])

    # both neighbours are read from png files, so no missing-image positions
    # are passed (the calls previously omitted these required arguments)
    x_blend = open_images(input_days, [], missing_images = False)
    y_blend = sic_output(x_blend, {}, [], missing_images = False)

    new_x = (x_blend[0] + x_blend[1])/2
    new_y = (y_blend[0] + y_blend[1])/2

    # months_dict maps the month number to its folder name; `months` is the
    # plain 1..12 number array and cannot be used for that
    month_name = months_dict[m_month]
    # same folder layout that open_images reads from
    save_path = os.path.join(path, str(m_year), month_name)
    fname_x = f'{str(m_year)}_{str(m_month)}_{str(m_day)}_im_dat.npy'
    fname_y = f'{str(m_year)}_{str(m_month)}_{str(m_day)}_sic.npy'

    # write the blended arrays; without this the files expected by
    # open_images (missing_images = True) are never created
    np.save(os.path.join(save_path, fname_x), new_x)
    np.save(os.path.join(save_path, fname_y), new_y)

In [None]:
# Pick the training dates and, for each one, the target date four years later.
# Requires all_dates to already carry the 'Missing' column.

# candidates are all dates except the last 2961 rows of all_dates (those rows
# are used by the test-set cell); draw 3500 of them and sort chronologically
train_inputs = generate_dates(all_dates.iloc[:-2961], 3500)
train_inputs_sorted = train_inputs.sort_values(by =['Year', 'Month', 'Day']).reset_index(drop=True)

# remove 16 rows by label (842-850 and 467-473 after sorting)
# NOTE(review): why these particular rows are removed is not visible here - confirm
train_inputs_sorted = train_inputs_sorted.drop(np.arange(842, 851).tolist())
train_inputs_sorted = train_inputs_sorted.drop(np.arange(467, 474).tolist())

# refill to 3500 rows with the last 16 rows of a 3516-row draw
# NOTE(review): presumably relies on the fixed seed making the first 3500 rows
# of this draw equal to the draw above, so these 16 are new dates - confirm
train_add_in = generate_dates(all_dates.iloc[:-2961], 3516)[-16:]
train_inputs_f = pd.concat([train_inputs_sorted, train_add_in])
train_inputs_f_sorted = train_inputs_f.sort_values(by = ['Year', 'Month', 'Day']).reset_index(drop=True)

# target dates: same month and day, four years after each input date
transform_to_sic_train = pd.DataFrame(columns = ['Year', 'Month', 'Day', 'Missing'])

# NOTE(review): if a date four years later is not present in all_dates, no row
# is appended and inputs/targets stop lining up position by position
for i in range(len(train_inputs_f_sorted)):
    transform_row = all_dates[(all_dates['Year'] == train_inputs_f_sorted['Year'][i] + 4) & 
                     (all_dates['Month'] == train_inputs_f_sorted['Month'][i]) &
                     (all_dates['Day'] == train_inputs_f_sorted['Day'][i])]
    transform_to_sic_train = pd.concat([transform_to_sic_train, transform_row])

# train_sic_transform = train_inputs_f_sorted[['Month', 'Day']]
# train_sic_transform['Year'] = train_inputs_f_sorted['Year'] + 4

# row positions (not index labels) of the input / target dates flagged as missing
miss_idx_list_sample_x = [i for i, x in enumerate(train_inputs_f_sorted['Missing'] == True) if x]
miss_idx_list_sample_y = [i for i, x in enumerate(transform_to_sic_train['Missing'] == True) if x]

In [None]:
# run to check whether any file the loading code needs is missing/inaccessible
# every path that cannot be found is printed; if nothing is printed there is no problem

for i in train_inputs_f_sorted.index:
    year = str(train_inputs_f_sorted['Year'][i])
    month_num = train_inputs_f_sorted['Month'][i]
    # months_dict gives the month folder name; indexing the 1..12 `months`
    # array returned a number (and the wrong one) instead of a name
    month = months_dict[month_num]
    day = train_inputs_f_sorted['Day'][i]

    # days flagged as missing are stored as blended .npy arrays, the rest as png
    # (== True on purpose: unflagged rows hold NaN after the left merge)
    if train_inputs_f_sorted['Missing'][i] == True:
        file_name = f"{year}_{month_num}_{day}_im_dat.npy"
    else:
        file_name = f"{year}_{month_num}_{day}.png"

    # look in the same root folder that open_images reads from
    file_path = os.path.join(path, year, month, file_name)

    if not os.path.exists(file_path):
        print(file_path)

In [None]:
# build training dataset
# inputs: images for the sampled training dates (blended .npy files are used
# for the positions listed in miss_idx_list_sample_x); saved before the
# targets are computed
x_train, sic_paths = open_images(train_inputs_f_sorted, miss_idx_list_sample_x, missing_images = True)
x_train_path = os.path.join(path, 'saved_data/x_train_3500_balanced.npy')
np.save(x_train_path, x_train)
# targets: images for the dates four years later, converted to sea ice
# concentration; sic_paths is deliberately overwritten here so it refers to
# the target dates when passed to sic_output
y_input, sic_paths = open_images(transform_to_sic_train, miss_idx_list_sample_y, missing_images = True)
y_train = sic_output(y_input, sic_paths, miss_idx_list_sample_y, missing_images = True)
y_train_path = os.path.join(path, 'saved_data/y_train_3500_balanced.npy')
np.save(y_train_path, y_train)

In [None]:
# build testing dataset
# no missing-day handling is used for the test set, so these stay empty
# NOTE(review): sic_paths is a list here but a dict elsewhere; it is not read
# when missing_images = False
missing_test = []
sic_paths = []
# the last 2961 rows of all_dates are reserved for testing
test_dates = all_dates.iloc[-2961:]
# inputs are the first 1500 rows; targets are the 1500 rows starting at
# position 1461, so each target lies 1461 rows (4 * 365 + 1 days) after its input
test_dates_im = test_dates[:1500]
test_dates_sic = test_dates[1461:]

# NOTE(review): every date is read from a png file, including any date flagged
# as missing - confirm that all of these png files exist
test_im_dat = open_images(test_dates_im, missing_test, missing_images = False)
x_test_path = os.path.join(path, 'saved_data/x_test_1500_balanced.npy')
np.save(x_test_path, test_im_dat)
test_sic_y = sic_output(open_images(test_dates_sic, missing_test, missing_images = False),
                        sic_paths, missing_test, missing_images = False)
y_test_path = os.path.join(path, 'saved_data/y_test_1500_balanced.npy')
np.save(y_test_path, test_sic_y)

In [None]:
# load in dataset for future use after saving
# (same file names the build cells write, all located under path/saved_data)
x_train_path = os.path.join(path, 'saved_data/x_train_3500_balanced.npy')
y_train_path = os.path.join(path, 'saved_data/y_train_3500_balanced.npy')
x_test_path = os.path.join(path, 'saved_data/x_test_1500_balanced.npy')
y_test_path = os.path.join(path, 'saved_data/y_test_1500_balanced.npy')

# training inputs and targets
x_train = np.load(x_train_path)
y_train = np.load(y_train_path)

# testing inputs and targets
x_test = np.load(x_test_path)
y_test = np.load(y_test_path)

In [2]:
# reload the training arrays from a fixed location
# NOTE(review): the folder is hard-coded to '207_data' instead of using `path`
x_train = np.load('207_data/saved_data/x_train_3500_balanced.npy')
y_train = np.load('207_data/saved_data/y_train_3500_balanced.npy')

In [52]:
# hold out one tenth of the training samples (350 of 3500) for validation
random.seed(10)  # fixed seed so the same rows are held out on every run
n_samples = len(x_train)
rows_id = random.sample(range(0, n_samples), n_samples // 10)
all_rows = np.arange(0, n_samples)

# every row that was not drawn for validation stays in the training set
train_rows_id = np.delete(all_rows, rows_id)

# take the validation rows FIRST, while x_train / y_train still hold every
# sample; selecting them after the arrays were shrunk applied indices from the
# full range to the reduced arrays (wrong rows or an IndexError)
x_val = x_train[rows_id]
y_val = y_train[rows_id]

x_train = x_train[train_rows_id]
y_train = y_train[train_rows_id]

(350, 338, 280)
(3150, 338, 280)
