<a href="https://colab.research.google.com/github/gbessardon/AI_soil_testing/blob/master/Preparation_segmentation_data/Prepare_corine_sentinel_tiles_for_segmentationwgs84_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparing the running environment
If you are running on Kaggle, set the variable `kaggleenv` to 1.

In [1]:
local=0 # 1 = running locally (skips the Kaggle / Google Drive setup cell)
kaggleenv=0 # 1 = running inside a Kaggle environment
savekaggle=0 # 1 = write the output to /kaggle/temp/ and upload it as a Kaggle dataset, 0 = do not

In [2]:
"""
CORINEtxtfile: file containing the corine labels in the original corine files
should be in the form 
$CORINEfolder/u2018_clc2018_v2020_20u1_raster100m/Legend/CLC2018_CLC2018_V2018_20_QGIS.txt

cover_definition_directory: directory where the label definition is stored
(one .txt file per desired class)

corine_path: folder where corine tiles and sentinel tiles are stored 
i.e the output of the previous script

savefn: path of the zip archive created from the saving directory

resolution: resolution setting of the satellite data
"""

CORINEtxtfile='/gdrive/MyDrive/CORINE/u2018_clc2018_v2020_20u1_raster100m/Legend/CLC2018_CLC2018_V2018_20_QGIS.txt' #CORINE2018 file

cover_definition_directory='/gdrive/MyDrive/Tertiary_Cover_Types'

corine_path = '/gdrive/MyDrive/tilessentinelcorine'

savefn= '/gdrive/MyDrive/Segmentsentinelcorine92overlap.zip'

resolution=10  # depends on the selected size of the satellite data (NOTE(review): not used in the cells shown here)

In [3]:
import os
import shutil

In [4]:
# Default output location: a 'temp' folder under the current working directory.
savedir=os.path.join(os.getcwd(),'temp')
if savekaggle==1:
  # When uploading to Kaggle, write to the Kaggle scratch folder instead and
  # define the values later passed to Create_kaggle_dataset.
  savedir='/kaggle/temp/'
  ownername='geoffreybessardon'   # Kaggle account that owns the dataset
  datasetslug='segmenteirefiloverlap2020'   # replaces INSERT_SLUG_HERE in the metadata file
  datasetTitle='Segment ireland filter 2020'   # replaces INSERT_TITLE_HERE in the metadata file
  vnotes='090221v3'   # version notes used when a new dataset version is created

In [5]:
# Sub-folders of savedir that receive the sentinel (.png) and corine (.tif) segments.
sentinel_save_path = os.path.join(savedir,'sentinel')
corine_save_path = os.path.join(savedir,'corine')

In [6]:
if local==0:
    # Scratch area used as savedir when savekaggle==1 (also creates '/kaggle').
    os.makedirs('/kaggle/temp',exist_ok=True)

    # Work out where the Kaggle API token comes from and where it has to go.
    if 'google.colab' in str(get_ipython()):
        print('Running on CoLab')
        from google.colab import drive
        drive.mount('/gdrive') # link to google drive
        kaggle_token_source='/gdrive/MyDrive/kaggle.json'
        kaggle_config_dir='/root/.kaggle'
    elif kaggleenv==1:
        kaggle_token_source='/kaggle/input/apitoken/kaggle.json'
        # '~' is not expanded by os.path / os.mkdir, so resolve it explicitly.
        kaggle_config_dir=os.path.expanduser('~/.kaggle')
    else:
        kaggle_token_source=None
        kaggle_config_dir=None

    if kaggle_token_source is not None:
        kaggle_token=os.path.join(kaggle_config_dir,'kaggle.json')
        # Install the token only when it is missing; an existing token is left untouched.
        if not os.path.isfile(kaggle_token):
            os.makedirs(kaggle_config_dir,exist_ok=True)
            shutil.copyfile(kaggle_token_source,kaggle_token)
            # Octal literal: owner read/write only (decimal 600 would be mode 0o1130).
            os.chmod(kaggle_token,0o600)

    os.system('pip install -q kaggle')
    from kaggle import api
    

Running on CoLab
Mounted at /gdrive


In [7]:
import numpy as np
import matplotlib.pyplot as plt
import tifffile as tif 
#import corine as cor
from tqdm import tqdm
import os
import pandas as pd 
import difflib
import random


In [8]:
import zipfile

# Declare functions

In [9]:
def create_folder(path):
    """Create the directory `path` if nothing exists at that location yet.

    Missing parent directories are created as well. If `path` already exists
    (as a directory or as a file) the function does nothing.
    """
    if not os.path.exists(path):
        # exist_ok guards against the directory appearing between the check and the call.
        os.makedirs(path, exist_ok=True)
        

In [10]:
def listlines(fn,path):
    """Read the text file `fn` located in `path` and return its lines.

    Newline characters are removed from every line; all other whitespace is kept.
    """
    full_name = os.path.join(path,fn)
    with open(full_name,'r') as handle:
        return [raw.replace('\n','') for raw in handle]

In [11]:
def cover_classes(path):
    """Build the class-definition dictionary from the .txt files in `path`.

    path = directory containing the text files which list, one per line, the
           tertiary corine labels that belong to each desired class

    Returns: dictionary whose keys are the lower-cased file names without the
             '.txt' extension and whose values are the lists of lines of the
             corresponding files (one entry per desired class).
    """
    definition_files = [name for name in os.listdir(path) if name.endswith('.txt')]
    return {name.replace('.txt','').lower(): listlines(name,path)
            for name in definition_files}

In [12]:
def label_anyclass(labelss,dictionary):
    """Replace tertiary corine labels by the name of the class they belong to.

    labelss = list of tertiary label names; it is modified in place

    dictionary = dictionary mapping each desired class name to the list of
                 tertiary labels it contains, as returned by cover_classes()

    A label is assigned to a class when the class list contains a close match
    (difflib ratio >= 0.8) whose capitalised form starts with the first
    character of the capitalised label. If several classes qualify, the last
    one in dictionary order wins. Labels without any match, and empty labels,
    are left unchanged.

    Returns: the same list object `labelss`, with matched entries replaced by
             their class name.
    """
    for i,na in enumerate(labelss):
        if not na:
            # An empty label has no first character to compare against.
            continue
        first_char=na.capitalize()[0]
        for key, value in dictionary.items():
            # Run the fuzzy match once and reuse the result.
            matches=difflib.get_close_matches(na,value,n=1,cutoff=0.8)
            if matches and matches[0].capitalize().startswith(first_char):
                labelss[i]=key
    return labelss

In [13]:
def convertion_dataframe(legend_file,cover_definition_dir):
    """Build the table that converts tertiary corine values to the new classes.

    legend_file = file containing the legend and the color of the corine labels
    for example'/kaggle/input/corine2018/u2018_clc2018_v2020_20u1_raster100m/Legend/CLC2018_CLC2018_V2018_20_QGIS.txt'
    Each non-blank line must have at least six comma-separated fields; the
    sixth field is used as the label name.
    NOTE(review): a label name that itself contains a comma is cut at that
    comma, exactly as before - confirm against the legend file.

    cover_definition_dir=  directory containing the text files which contain info on which
          tertiary corine labels belong to which desired classes

    returns df a dataframe with the columns
        'Name'      tertiary label name read from the legend
        'value'     1-based position of the label among the non-blank legend lines
        'newlabels' class name given by label_anyclass (the original name when unmatched)
        'newvalues' 1-based rank of the class among the alphabetically sorted
                    definition files, skipping those containing 'no data';
                    0 for labels that belong to no ranked class

    raises IndexError when a non-blank legend line has fewer than six fields.
    """
    # The context manager closes the legend even when reading fails.
    with open(legend_file,'r') as legend:
        # Blank lines (typically a trailing newline) carry no label.
        content=[line for line in legend if line.strip()]

    names=[c.split(',')[5].strip() for c in content]
    values=list(range(1,len(names)+1))
    df = pd.DataFrame(list(zip(names, values)),
                   columns =['Name', 'value'])

    main_labels=[o.replace('.txt','').lower() for o in sorted(os.listdir(cover_definition_dir)) if o.endswith('.txt')]
    classes=cover_classes(cover_definition_dir)

    # label_anyclass edits its argument in place, so hand it a copy of the names.
    df['newlabels']=label_anyclass(list(names),classes)

    newvalues=np.zeros(len(df))
    next_value=1
    for label in main_labels:
        if 'no data' not in label:
            newvalues[(df['newlabels']==label).to_numpy()]=next_value
            next_value+=1
    df['newvalues']=newvalues
    return df
    

In [14]:
def convert_corine_new_labels(cpath,df):
    """
    Read a corine tile and recode its pixel values with the conversion dataframe.

    cpath=corine file path
    
    df= the conversion dataframe obtained with convertion_dataframe 
        (the columns 'Name', 'value' and 'newvalues' are read)
    
    returns newimages the array with the desired labels, same shape as the tile read from cpath
    """
    corine = tif.imread(cpath)
    # Work on the distinct codes only; `indices` maps every pixel back to its code.
    numbers,indices = np.unique(corine,return_inverse=True)
    # Every code starts at 1 and keeps the dtype of `numbers`; codes that no row
    # of df matches therefore stay at 1.
    newnumbers=numbers*0+1
    for _,row in df.iterrows():
        # New classes are stored zero-based: newvalues - 1.
        # NOTE(review): a row with newvalues == 0 yields -1 here - confirm this is intended.
        newnumbers[numbers==row['value']]=row['newvalues']-1
        if row['Name']=='Sea and ocean':
            # Codes above 44 and code 0 are given the 'Sea and ocean' class as well.
            # Rows processed after this one can still overwrite the codes they match.
            newnumbers[(numbers>44)] =row['newvalues']-1 
            newnumbers[(numbers==0)] =row['newvalues']-1
    
    
    newimages = newnumbers[indices].reshape(corine.shape)#reshape the data to original image dimensions
    
    return newimages

In [15]:
def Nmaxelements(list1, N):
    """Return the N-th largest element of list1 (N=1 gives the maximum).

    Note that N=0 returns the smallest element.
    """
    descending = np.sort(list1)[::-1]
    return descending[N-1]

In [16]:
def Nminelements(list1, N):
    """Return the element at zero-based rank N of list1 in ascending order.

    N=0 gives the minimum, N=1 the second smallest value, and so on.
    """
    ascending = np.sort(list1)
    value_at_rank = ascending[N]
    return value_at_rank

In [17]:
def Create_kaggle_dataset(dirname,datasetslug,datasetTitle,ownername,vnotes):
    """Upload the directory `dirname` to Kaggle as a dataset.

    dirname: the directory you want to upload in Kaggle
    datasetslug: slug written in place of INSERT_SLUG_HERE in the metadata file
    datasetTitle: title written in place of INSERT_TITLE_HERE in the metadata file
    ownername: Kaggle user name; ownername/datasetslug identifies the dataset
    vnotes: version notes used when a new version of the dataset is created

    Uses the module-level `api` object imported from the kaggle package.
    """
    # initialize dataset: creates the json metadata file inside dirname
    api.dataset_initialize(dirname)
    metadata_path=os.path.join(dirname,'dataset-metadata.json')

    # Read the template; the context manager closes the file even on error.
    with open(metadata_path,'r') as f:
        metadata=f.read()

    # Fill in the slug first and the title second.
    metadata=metadata.replace('INSERT_SLUG_HERE',datasetslug)
    metadata=metadata.replace('INSERT_TITLE_HERE',datasetTitle)

    with open(metadata_path,'w') as f:
        f.write(metadata)

    # NOTE(review): confirm how api.dataset_status behaves for a dataset that
    # does not exist yet (return value versus exception).
    if api.dataset_status(ownername+'/'+datasetslug)=='ready':
        api.dataset_create_version(dirname,version_notes=vnotes,dir_mode='zip')
    else:
        api.dataset_create_new(dirname,dir_mode='zip')

In [18]:
def _strict_plus_random_ties(strict_mask,tied_mask,wanted):
    """Return the positions where strict_mask holds, topped up to `wanted` positions with a random draw from tied_mask."""
    strict=[int(p) for p in np.flatnonzero(strict_mask)]
    tied=[int(p) for p in np.flatnonzero(tied_mask)]
    # Plain Python lists keep the positions integral even when the draw is
    # empty (np.hstack with an empty list would promote them to float).
    return strict+random.sample(tied,wanted-len(strict))


def selecttrainingtile(dft,tilen,nfpt):
    """Select training segments of one tile, favouring extreme cover counts.

    dft   = dataframe with the columns 'sentinel' (segment identifier),
            'corine' (collection of covers found in the segment) and 'tile'
    tilen = name of the tile to select segments from
    nfpt  = number of segments wanted for the tile

    To ensure a variety of covers, half of the selection comes from the
    segments with the largest number of covers and the other half from those
    with the smallest number. With half = int(nfpt/2):
      minUmax = the half-th largest cover count
      maxUmin = the cover count at zero-based rank `half` in ascending order
                (the same values Nmaxelements / Nminelements return)
    Segments strictly beyond a threshold are always kept; the remaining places
    are drawn at random among the segments equal to the threshold. When both
    thresholds are equal, nfpt segments are drawn at random among the segments
    with that cover count.

    Returns: list of 'sentinel' values of the selected segments (largest
             counts first). It has 2*half entries when the thresholds differ.

    Raises: IndexError when the tile has no more than `half` segments,
            ValueError when a random draw asks for more segments than are tied.
    """
    in_tile=dft.tile==tilen
    #number of covers contained in each segment of the tile
    covernumbers=np.array([len(covers) for covers in dft.corine[in_tile]])
    tilefname=list(dft.sentinel[in_tile])

    half=int(nfpt/2)
    ordered=np.sort(covernumbers)
    minUmax=ordered[-half]
    maxUmin=ordered[half]

    if minUmax==maxUmin:
        tied=[int(p) for p in np.flatnonzero(covernumbers==minUmax)]
        savelist=random.sample(tied,nfpt)
    else:
        #select files in the maximums
        savelistmax=_strict_plus_random_ties(covernumbers>minUmax,covernumbers==minUmax,half)
        #select files in the minimums
        savelistmin=_strict_plus_random_ties(covernumbers<maxUmin,covernumbers==maxUmin,half)
        savelist=savelistmax+savelistmin

    training_files=[tilefname[sl] for sl in savelist]
    return training_files

# MAIN

## Define conversion dataframe

In [19]:
# Conversion table from the tertiary corine values to the desired classes.
df=convertion_dataframe(CORINEtxtfile,cover_definition_directory)

In [20]:
# Display the conversion table to check the label mapping.
df

Unnamed: 0,Name,value,newlabels,newvalues
0,Continuous urban fabric,1,continuous urban fabric,12.0
1,Discontinuous urban fabric,2,discontinuous urban fabric,13.0
2,Industrial or commercial units,3,industrial or commercial units,19.0
3,Road and rail networks and associated land,4,road and rail networks and associated land,34.0
4,Port areas,5,port areas,32.0
5,Airports,6,airports,2.0
6,Mineral extraction sites,7,mineral extraction sites,23.0
7,Dump sites,8,dump sites,14.0
8,Construction sites,9,construction sites,11.0
9,Green urban areas,10,green urban areas,18.0


## Define tiles path

In [21]:
# Corine tiles: every file of corine_path whose name starts with 'corine'.
# Sorted so that the segment numbering is reproducible between runs.
corine_files=[os.path.join(corine_path,o) for o in sorted(os.listdir(corine_path)) if o.startswith('corine')]
# Tile id = the five characters that follow the 'corine' prefix of the file name.
# The id is taken from the base name because the directory itself may contain
# the word 'corine' (as in '.../tilessentinelcorine').
tiles=[os.path.basename(cf)[len('corine'):][0:5] for cf in corine_files ]

## Create saving directories 

In [22]:
#All data directories: the root saving directory first, then its sentinel and corine sub-folders
create_folder(savedir)
create_folder(sentinel_save_path)
create_folder(corine_save_path)

#Training data directories must contain /kaggle/working to be saved with the directory otherwise needs to create an independant dataset
#tr_sentinel_save_path = '/kaggle/working/sentinel'
#create_folder(tr_sentinel_save_path)
#tr_corine_save_path = '/kaggle/working/corine'
#create_folder(tr_corine_save_path)

In [None]:
imsize=120 #side length, in pixels, of the square segments
count = 0 #for saving the segments

corine_list = [] #covers present in each segment

sentinel_list = [] #segment number, i.e. the name of the saved files

sentinel_tilefn_segment=[] #sentinel tile file name each segment comes from

tile_list=[] #tile id of each segment

unique_cover_tile=[] #covers present in each whole tile

mean_segment=[] #mean value of each normalised sentinel segment

cover_tile=[] #tile id matching each entry of unique_cover_tile
for i,cf in enumerate(corine_files):
    corine=convert_corine_new_labels(cf,df)

    #the sentinel tile sits next to the corine tile, with 'sentinel' in place of 'corine'
    sentinel_fn=cf.replace('/corine','/sentinel')
    sentinel = tif.imread(sentinel_fn)
    sentinel_tilefn='S2'+sentinel_fn.split('/sentinel')[1]

    unique_cover_tile.append(np.unique(corine))
    cover_tile.append(tiles[i])

    # zero-pad corine and sentinel so that both dimensions are multiples of imsize;
    # rows and columns are padded independently so that non-square tiles work too
    padded_rows=(sentinel.shape[0]//imsize+1)*imsize
    padded_cols=(sentinel.shape[1]//imsize+1)*imsize
    sentinel120=np.zeros((padded_rows,padded_cols,sentinel.shape[2]))
    sentinel120[0:sentinel.shape[0],0:sentinel.shape[1],:]=sentinel
    corine120=np.zeros((padded_rows,padded_cols))
    corine120[0:corine.shape[0],0:corine.shape[1]]=corine

    #iterate through the rows and columns of imsize x imsize segments
    for j in tqdm(range(padded_rows//imsize)):
        row_slice=slice(j*imsize,(j+1)*imsize)

        for k in range(padded_cols//imsize):
            col_slice=slice(k*imsize,(k+1)*imsize)

            #sentinel segment
            sentinel_segment = sentinel120[row_slice,col_slice]

            #corine segment
            corine_segment = corine120[row_slice,col_slice]

            #replace values greater than 80 by the segment mean (consistent with classifier training)
            sentinel_segment = np.where(sentinel_segment>80,np.mean(sentinel_segment),sentinel_segment)

            #normalise the sentinel image
            sentinel_segment = (sentinel_segment/80).astype('float32')

            #save the sentinel segment
            plt.imsave(sentinel_save_path + '/'+str(count) + '.png',sentinel_segment)

            #save the corine segment
            tif.imwrite(corine_save_path + '/'+str(count) + '.tif',corine_segment)

            #average segment_value
            mean_segment.append(np.mean(sentinel_segment))

            #append sentinel to list
            sentinel_list.append(count)

            #append corine to list
            corine_list.append(np.unique(corine_segment))

            tile_list.append(tiles[i])

            sentinel_tilefn_segment.append(sentinel_tilefn)

            #increase the count
            count = count + 1

#corine_stuff = np.array(corine_list) #turn corine list into numpy array

#sentinel_stuff = np.array(sentinel_list) #turn sentinel list into numpy array

100%|██████████| 93/93 [00:36<00:00,  2.55it/s]
100%|██████████| 93/93 [00:46<00:00,  1.98it/s]
100%|██████████| 93/93 [00:41<00:00,  2.25it/s]
100%|██████████| 93/93 [00:58<00:00,  1.60it/s]
 46%|████▌     | 43/93 [00:25<00:29,  1.72it/s]

## Training files selection

In [None]:
#Create dataframe with one row per saved segment: the segment number ('sentinel'),
#the covers it contains ('corine'), its tile id ('tile'), the sentinel tile file
#name it comes from ('sentinel_tile_fn') and its mean normalised value ('segment')
dft = pd.DataFrame(list(zip(sentinel_list, corine_list, tile_list,sentinel_tilefn_segment,mean_segment)), 
               columns =['sentinel', 'corine', 'tile','sentinel_tile_fn','segment'])

#Write the table next to the segments so that it is included in the archive.
#NOTE(review): to_csv is given a path here, so csvdftraining is expected to be None - confirm it is not used later
csvdftraining = dft.to_csv(os.path.join(savedir,'all_file_details.csv'), index = True)

In [None]:
# Zip the content of savedir; the '.zip' suffix is removed from savefn to form the archive base name.
shutil.make_archive(savefn.split('.zip')[0],'zip',savedir)

# Create kaggle dataset

In [None]:
# Upload savedir to Kaggle only when requested; the dataset metadata
# variables are only defined when savekaggle==1.
if savekaggle==1:
  Create_kaggle_dataset(savedir,datasetslug,datasetTitle,ownername,vnotes)