
*   Data: Sentinel-2 satellite images of shape: (740, 740, 12)
*   Dimensionality reduction PCA

### PCA Code from: 
Mimi Kuan Ting Kuo - [Code link](https://github.com/MITCriticalData-Colombia/Dengue-Prediction-with-Satellite-Images/blob/main/Dimensionality_reduction_with_statistical_model.ipynb): [mimikuo365](https://github.com/mimikuo365)

# Setup Environment and Load Functions

In [1]:
#!pip install plotly
#!pip install epiweeks

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# sklearn library
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.metrics import mean_absolute_percentage_error
from sklearn import linear_model  
from sklearn.svm import SVR

In [4]:
from random import randint, randrange
from skimage import io
from skimage.transform import rescale, resize, downscale_local_mean

import skimage
import os
import pandas as pd
import numpy as np
import pickle
import plotly.express as px
import matplotlib.pyplot as plt

In [5]:
skimage.__version__

'0.19.2'

In [6]:
from epiweeks import Week, Year
from datetime import date

# Get list of images

In [7]:
# Main Directory
main_dir = 'Dataset/5001'
print(f'The total number of images is {len(os.listdir(main_dir))}')

The total number of images is 165


In [8]:
# specify the img directory path
path = main_dir

# list files in img directory
files = os.listdir(path)

image_list = []

for file in files:
    # make sure file is an image
    if file.endswith(('.jpg', '.png', 'jpeg', 'tiff')):
        #img_path = path + file
        image_list.append(os.path.join(path, file))
        
print(f'Image list top 5 examples:')
image_list.sort(reverse=False)
image_list[:5]

Image list top 5 examples:


['Dataset/5001/image_2015-11-01.tiff',
 'Dataset/5001/image_2015-11-08.tiff',
 'Dataset/5001/image_2015-11-15.tiff',
 'Dataset/5001/image_2015-11-22.tiff',
 'Dataset/5001/image_2015-11-29.tiff']

## Read Images

In [9]:
def read_image(path, target_size = (740, 740, 12), verbose = True):
    image = io.imread(path)
    
    if target_size:
        image_test = resize(image, (target_size[0], target_size[1]),
                               anti_aliasing=True)
        
    if verbose:
        print(os.path.basename(path), '(origin shape:', image.shape, '-> rescale:', str(image_test.shape) + ')')
    return image_test

In [10]:
def get_image_name(path):
    image_name = path[path.index('/image')+7:path.index('.tiff')]
    return image_name

In [11]:
def get_images(image_list, target_size = (740, 740, 12)):
    
    dates = list()
    images = list()
    
    for image_path in image_list:
        image = read_image(image_path, target_size, verbose=False)
        date = get_image_name(image_path)
        images.append(image)
        dates.append(date)
        
    images = np.array(images)
    dates = np.array(dates)
    return images, dates

In [12]:
target_size = (740, 740, 12)
images, dates = get_images(image_list, target_size)

In [13]:
images.shape

(165, 740, 740, 12)

In [14]:
dates.shape

(165,)

## Dimensionality reduction - PCA

In [15]:
def dimension_reduct_with_PCA(images, components):
    print(' PRINCIPAL COMPONENT ANALYSIS  '.center(100, '='))

    reshape_images = images.reshape(images.shape[0], -1)

    #pca = PCA(n_components=0.95) 
    pca = PCA(n_components=components)
    pca_images = pca.fit_transform(reshape_images)

    print('Origin shape'.ljust(15), reshape_images.shape)
    print('Resize shape'.ljust(15), pca_images.shape)  

    return pca_images

In [16]:
def pca_per_band(images, components_per_band = 10):
    pca_images = None

    for band in range(images.shape[3]):
        pca_band = dimension_reduct_with_PCA(images[:, :, band], components = components_per_band)
        
        if pca_images is None:
            pca_images = pca_band
        else:
            pca_images = np.append(pca_images, pca_band, axis = 1)

    print(pca_images.shape)
    return pca_images

### PCA per band

In [17]:
pca_images_per_band = pca_per_band(images, components_per_band = 10)

Origin shape    (165, 8880)
Resize shape    (165, 10)
Origin shape    (165, 8880)
Resize shape    (165, 10)
Origin shape    (165, 8880)
Resize shape    (165, 10)
Origin shape    (165, 8880)
Resize shape    (165, 10)
Origin shape    (165, 8880)
Resize shape    (165, 10)
Origin shape    (165, 8880)
Resize shape    (165, 10)
Origin shape    (165, 8880)
Resize shape    (165, 10)
Origin shape    (165, 8880)
Resize shape    (165, 10)
Origin shape    (165, 8880)
Resize shape    (165, 10)
Origin shape    (165, 8880)
Resize shape    (165, 10)
Origin shape    (165, 8880)
Resize shape    (165, 10)
Origin shape    (165, 8880)
Resize shape    (165, 10)
(165, 120)


In [17]:
def split_columns(df):
    df_aux = pd.DataFrame(df['PCA'].tolist())
    df_aux = pd.concat( [df['Date'], df_aux], axis=1)
    return df_aux

In [18]:
def generate_pca_dataframe(image_list, dates):
    
    pca_df = pd.DataFrame(columns=['Date', 'PCA'])
    
    for i in range(len(image_list)):
        image = image_list[i]
        date = dates[i]
        pca_df = pca_df.append({'Date': date, 'PCA': image}, ignore_index=True )

    # new df from the column of lists
    pca_df = split_columns(pca_df)
    
    return pca_df

In [20]:
# new df from the column of lists
pca_df = generate_pca_dataframe(pca_images_per_band, dates)

pca_df.to_csv('Embeddings/pca_medellin_120features(10_per_band).csv',index=False)
# display the resulting df
pca_df

Unnamed: 0,Date,0,1,2,3,4,5,6,7,8,...,110,111,112,113,114,115,116,117,118,119
0,2015-11-01,2.294995,0.188790,-2.814999,-4.106509,-4.826127,-1.931590,2.110046,2.169669,-3.337853,...,4.991791,-1.130083,-4.237726,-8.811673,2.755142,-2.230883,1.070571,3.062169,-2.531766,-0.283356
1,2015-11-08,2.294995,0.188790,-2.814999,-4.106509,-4.826127,-1.931590,2.110046,2.169669,-3.337853,...,4.991791,-1.130083,-4.237726,-8.811673,2.755142,-2.230883,1.070571,3.062169,-2.531766,-0.283356
2,2015-11-15,-2.992776,-4.160585,-1.680303,0.850790,0.399466,-0.382515,1.082947,0.885324,0.939764,...,-2.651444,-4.150722,-1.211129,0.537388,1.388784,-0.833611,1.429209,-0.413522,0.802893,2.269096
3,2015-11-22,-2.992776,-4.160585,-1.680303,0.850790,0.399466,-0.382515,1.082947,0.885324,0.939764,...,-2.651444,-4.150722,-1.211129,0.537388,1.388784,-0.833611,1.429209,-0.413522,0.802893,2.269096
4,2015-11-29,-2.958983,-0.799012,0.408298,-0.275674,0.540541,-0.400654,0.018210,-0.093618,0.140624,...,-2.709884,-0.841614,0.306946,0.147025,-0.587185,-0.286853,-0.149727,0.027457,0.067555,0.085697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,2018-11-25,-5.522933,0.242564,-0.107411,-0.137835,0.082961,-0.152577,0.088666,-0.176468,0.371281,...,-5.691023,0.108531,-0.000158,-0.193409,-0.068230,-0.285634,0.164417,-0.391486,-0.267216,-0.075663
161,2018-12-02,9.346026,2.523671,-4.925394,3.703968,-5.341458,-1.085077,1.720805,2.278464,-0.258981,...,7.905108,4.050743,-2.794811,-0.623507,6.047697,-1.914679,-0.644306,-0.847323,0.325168,2.051028
162,2018-12-09,-2.652909,-0.563147,0.440027,-0.110709,-0.128444,-0.195813,0.060844,-0.127574,0.285074,...,-2.629335,-0.678518,0.398805,-0.135323,0.174343,-0.425002,0.079115,-0.410194,-0.122415,-0.016983
163,2018-12-16,0.993320,0.777557,14.432274,2.801776,1.415641,0.281821,0.892250,-1.261930,-2.372967,...,2.240127,2.318474,14.520888,1.314388,0.167148,-0.415064,0.581475,0.089534,-1.565607,-1.747052


### PCA in the overall image

In [19]:
pca_images = dimension_reduct_with_PCA(images, components=100)

Origin shape    (165, 6571200)
Resize shape    (165, 100)


In [20]:
# new df from the column of lists
pca_df = generate_pca_dataframe(pca_images, dates)

pca_df.to_csv('Embeddings/pca_medellin_100features.csv',index=False)
# display the resulting df
pca_df

Unnamed: 0,Date,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,2015-11-01,98.435754,108.962228,-40.915729,39.468939,47.325778,70.472168,187.431726,-229.801740,43.888213,...,0.587965,-0.365837,-2.923909,0.096117,-0.608161,0.119047,1.095555,-1.305447,-1.034465,-0.486601
1,2015-11-08,98.435754,108.962228,-40.915729,39.468939,47.325778,70.472168,187.431726,-229.801740,43.888213,...,0.587965,-0.365837,-2.923909,0.096117,-0.608161,0.119047,1.095555,-1.305447,-1.034465,-0.486601
2,2015-11-15,-149.578506,118.675591,-28.032221,-37.096690,-3.305299,-67.746083,37.584909,-11.838416,-15.613838,...,-13.746142,13.683615,-1.131025,-3.349845,-10.681103,-6.437450,12.024817,12.829025,12.699458,0.475610
3,2015-11-22,-149.578506,118.675591,-28.032221,-37.096690,-3.305299,-67.746083,37.584909,-11.838416,-15.613838,...,-13.746142,13.683615,-1.131025,-3.349845,-10.681103,-6.437450,12.024817,12.829025,12.699458,0.475610
4,2015-11-29,-28.099436,34.715737,47.153387,-50.761985,-63.137078,-58.088638,-6.508121,18.149721,-29.781310,...,-6.670381,-5.520467,-4.101447,1.232668,-2.903102,2.551955,-3.338012,-5.490846,-2.383766,-1.712792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,2018-11-25,-154.158021,-40.385477,14.959814,-16.314940,32.494744,32.049769,-110.123837,-32.925602,70.966415,...,-24.752246,-0.418285,-13.387984,11.733950,22.422833,-23.083992,-1.774827,-12.985468,-0.344016,-11.627382
161,2018-12-02,71.530670,-194.253970,-60.854103,59.359774,61.782572,136.246484,-189.902994,-27.714046,46.346134,...,2.033835,-3.499518,1.421727,0.473346,-3.668544,3.775593,0.595169,9.517483,1.630138,-4.106867
162,2018-12-09,-83.791680,26.428490,19.535233,-9.675937,22.466795,24.155634,-16.545517,8.703227,-4.421293,...,5.490728,13.986629,-1.157662,9.442495,17.961227,5.249652,-13.657355,17.829749,10.827447,-14.438771
163,2018-12-16,101.759249,-100.737490,-118.204529,-52.278016,58.832020,212.931482,27.047104,-18.487913,-129.743708,...,2.554147,4.894150,-3.007075,0.121323,-0.454753,7.455948,2.480901,4.855074,4.578702,-5.699132
