This code was adapted from the following github: https://github.com/justjoshtings/satellite_image_segmentation

In [3]:
!pip install pystac pystac_client

Collecting pystac
  Downloading pystac-1.8.3-py3-none-any.whl (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.6/175.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pystac_client
  Downloading pystac_client-0.7.2-py3-none-any.whl (33 kB)
Installing collected packages: pystac, pystac_client
Successfully installed pystac-1.8.3 pystac_client-0.7.2


##Setup:

Change the abspath_curr to reflect your drive/directories, which hase the data

In [1]:
from google.colab import drive
import sys

# Mount Google Drive
drive.mount('/content/drive')

# Get the absolute path of the current folder
abspath_curr = '/content/drive/My Drive/Application Development Final proj'

Mounted at /content/drive


In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras

import numpy as np
import pandas as pd
import os
import shutil
import math

from PIL import Image
import glob
import cv2
import pickle
import copy

import warnings
import random


import requests

import tempfile
from pathlib import Path
import itertools as it
from urllib.parse import urljoin

from pystac import Item
from pystac.extensions.eo import EOExtension
from pystac.extensions.label import LabelRelType
from pystac.extensions.scientific import ScientificExtension
from pystac_client import Client

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
# Ignore warnings
warnings.filterwarnings('ignore')

In [5]:
# Set matplotlib sizes
plt.rc('font', size=20)
plt.rc('axes', titlesize=20)
plt.rc('axes', labelsize=20)
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
plt.rc('legend', fontsize=20)
plt.rc('figure', titlesize=20)

In [7]:
# Random Seed

# The random seed
random_seed = 42

# Set random seed in tensorflow
tf.random.set_seed(random_seed)

# Set random seed in numpy
np.random.seed(random_seed)

In [8]:
# Check what version of TF we are using
print(tf.version.VERSION)

2.12.0


In [6]:
# Print the number of GPUs available
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Test to see if GPU is found and connected
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print('GPU device not found')
else:
  print('\nFound GPU at: {}'.format(device_name))
  print('\nCurrently using:')
  !nvidia-smi -L

Num GPUs Available:  0
GPU device not found


## TO-DO:

Organize data based on Emma's directoires

#### **Set Up Data Directories**

The data is aviable for download in the following link:
https://www.kaggle.com/datasets/balraj98/deepglobe-land-cover-classification-dataset?resource=download
it is a litle bit heavy but the directories are already organized. It is necessary to make sure the directories of images and masks between **train**, **valid**, and **test** are setup correctly.




In [9]:
meta_df = pd.read_csv(abspath_curr + "/data/metadata.csv")
class_df = pd.read_csv(abspath_curr + "/data/class_dict.csv")

print(class_df.head(10))

print(meta_df.head())

               name    r    g    b
0        urban_land    0  255  255
1  agriculture_land  255  255    0
2         rangeland  255    0  255
3       forest_land    0  255    0
4             water    0    0  255
5       barren_land  255  255  255
6           unknown    0    0    0
   image_id  split        sat_image_path              mask_path
0    100694  train  train/100694_sat.jpg  train/100694_mask.png
1    102122  train  train/102122_sat.jpg  train/102122_mask.png
2     10233  train   train/10233_sat.jpg   train/10233_mask.png
3    103665  train  train/103665_sat.jpg  train/103665_mask.png
4    103730  train  train/103730_sat.jpg  train/103730_mask.png


In [10]:
# 'Valid' does not have masks so let's merge that with the train and take a subset of train for validation
meta_df[meta_df['split'] == 'valid'].head()

Unnamed: 0,image_id,split,sat_image_path,mask_path
803,105036,valid,valid/105036_sat.jpg,
804,107780,valid,valid/107780_sat.jpg,
805,108490,valid,valid/108490_sat.jpg,
806,127801,valid,valid/127801_sat.jpg,
807,128240,valid,valid/128240_sat.jpg,


In [13]:
# Update meta_df => with the actual path
meta_df['sat_image_path'] = meta_df['sat_image_path'].str.replace('valid/', 'test/')
meta_df['split'] = meta_df['split'].str.replace('valid', 'test')

Here we can change the variable *0.9 for a one thet the user can change
(Christina)

In [20]:
# Create divides for train, test, val
train_df = meta_df[meta_df['split'] == 'train']

#### Here we can change the variable *0.9 for a one that the user can change
###########Christina
train_num_samples = round(len(train_df)*0.75)
val_num_samples = len(train_df) - train_num_samples

In [21]:
#shuffle all rows of DataFrame
train_df = train_df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
train_df.head()

Unnamed: 0,image_id,split,sat_image_path,mask_path
0,462612,train,train/train_images/images/462612_sat.jpg,train/train_masks/masks/462612_mask.png
1,935318,train,train/train_images/images/935318_sat.jpg,train/train_masks/masks/935318_mask.png
2,58910,train,train/train_images/images/58910_sat.jpg,train/train_masks/masks/58910_mask.png
3,471187,train,train/train_images/images/471187_sat.jpg,train/train_masks/masks/471187_mask.png
4,548686,train,train/train_images/images/548686_sat.jpg,train/train_masks/masks/548686_mask.png


these two lines of code split the shuffled training dataset into a training portion and a validation portion by updating the 'split' column values accordingly. The first train_num_samples rows are marked as 'train', and the remaining rows are marked as 'valid'.

In [22]:
train_df['split'].iloc[:train_num_samples] = 'train'
train_df['split'].iloc[train_num_samples:] = 'valid'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['split'].iloc[:train_num_samples] = 'train'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['split'].iloc[train_num_samples:] = 'valid'


In [23]:
non_train_df = meta_df[meta_df['split'] != 'train']

train_df['sat_image_path'] = train_df['sat_image_path'].str.replace('train/', 'train/train_images/images/')
train_df['mask_path'] = train_df['mask_path'].str.replace('train/', 'train/train_masks/masks/')

meta_df = pd.concat([train_df, non_train_df], axis=0)
meta_df.head()

Unnamed: 0,image_id,split,sat_image_path,mask_path
0,462612,train,train/train_images/images/train_images/images/...,train/train_masks/masks/train_masks/masks/4626...
1,935318,train,train/train_images/images/train_images/images/...,train/train_masks/masks/train_masks/masks/9353...
2,58910,train,train/train_images/images/train_images/images/...,train/train_masks/masks/train_masks/masks/5891...
3,471187,train,train/train_images/images/train_images/images/...,train/train_masks/masks/train_masks/masks/4711...
4,548686,train,train/train_images/images/train_images/images/...,train/train_masks/masks/train_masks/masks/5486...


Now we have labeled validation data to work with.

The code snippet is extracting and displaying the initial rows of the DataFrame meta_df that correspond to the validation split,

In [24]:
meta_df[meta_df['split'] == 'valid'].head()

Unnamed: 0,image_id,split,sat_image_path,mask_path
542,499266,valid,train/train_images/images/train_images/images/...,train/train_masks/masks/train_masks/masks/4992...
543,748225,valid,train/train_images/images/train_images/images/...,train/train_masks/masks/train_masks/masks/7482...
544,291214,valid,train/train_images/images/train_images/images/...,train/train_masks/masks/train_masks/masks/2912...
545,129298,valid,train/train_images/images/train_images/images/...,train/train_masks/masks/train_masks/masks/1292...
546,137499,valid,train/train_images/images/train_images/images/...,train/train_masks/masks/train_masks/masks/1374...


Final dataset split redy to process will be this:

In [25]:
print('Samples in train: ', len(meta_df[meta_df['split'] == 'train']))
print('Samples in validation: ', len(meta_df[meta_df['split'] == 'valid']))
print('Samples in test: ', len(meta_df[meta_df['split'] == 'test']))

Samples in train:  542
Samples in validation:  261
Samples in test:  343


The levels have the following values
This is what our classes look like.

In [26]:
class_df = pd.read_csv(abspath_curr + "/data/class_dict.csv")

class_df.head(10)

Unnamed: 0,name,r,g,b
0,urban_land,0,255,255
1,agriculture_land,255,255,0
2,rangeland,255,0,255
3,forest_land,0,255,0
4,water,0,0,255
5,barren_land,255,255,255
6,unknown,0,0,0


#### **Move Files**

Move images to their correct directories based on train, validation, and testing splits.

In [27]:
def make_directory(path):
  """
  Function to make directory if not exits

  Paramater:
    path - path of directory

  Return:
    None
  """
  directory = os.path.dirname(path)
  if not os.path.exists(directory):
    os.makedirs(directory)

  return


In [37]:
# Make directories => inside the directories allready have
make_directory(abspath_curr + '/data/train/train_images/images/')
make_directory(abspath_curr + '/data/train/train_masks/masks/')

make_directory(abspath_curr + '/data/val/val_images/images/')
make_directory(abspath_curr + '/data/val/val_masks/masks/')

In [41]:
def move_files(list_filenames, source_path, target_path):
  """
  Function to move files from source to target directory based on a list of filenames within source

  Parameters:
    list_filesname - list of filenames
    source_path - path of source directory
    target_path - path of target directory

  Return:
    None
  """
  for file_name in list_filenames:
      try:
        shutil.move(os.path.join(source_path, file_name), os.path.join(target_path, file_name))
      except FileNotFoundError:
        continue

In [39]:
def folder_is_empty(path, img_types=['.png', '.jpg', '.tif']):
  """
  Function to check if a filepath is empty

  Parameters:
    path - filepath to image or mask data

  Return:
    True if empty, False if not

  We can use this function like this:
    if not False:
      do X
    else:
      do Y

  this will do X.
  """

  if any(list(map(lambda x: True if x in ''.join(os.listdir(path)) else False, img_types))):
    return False
  else:
    return True

In [40]:
# Moves files from main folders of train, val to sub-directories

# Old validation goes to test
if not folder_is_empty(abspath_curr + '/data/valid'):
  # image
  move_files(old_val_img_path, abspath_curr + '/data/valid/', abspath_curr + '/data/test')

# Split old train to new train and new val
if not folder_is_empty(abspath_curr + '/data/train'):
  # train_image
  move_files(train_sat_img_path, abspath_curr + '/data/train/', abspath_curr + '/data/train/train_images/images')
  # train_mask
  move_files(train_msk_img_path, abspath_curr + '/data/train/', abspath_curr + '/data/train/train_masks/masks')

  # val_image
  move_files(new_val_sat_img_path, abspath_curr + '/data/train/', abspath_curr + '/data/val/val_images/images')
  # val_mask
  move_files(new_val_msk_img_path, abspath_curr + '/data/train/', abspath_curr + '/data/val/val_masks/masks')



NameError: ignored