In [132]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset, DataLoader
import re
import sys
from datetime import timedelta
# from torch.nn.functional import normalize

In [7]:

proj_paths = ["/Users/gopal/Google Drive/_Research/Research projects/ML/manclassify/app_data/Thailand",
              "/Users/gopalpenny/Library/CloudStorage/GoogleDrive-gopalpenny@gmail.com/My Drive/_Research/Research projects/ML/manclassify/app_data/Thailand"]

proj_path = [path for path in proj_paths if os.path.exists(path)][0]

class_path = os.path.join(proj_path,"Thailand_classification")
ts_path = os.path.join(proj_path,"Thailand_download_timeseries")
# pd.read_csv("
os.listdir(class_path)

loc_id = 0

s2_csv_name = f"pt_ts_loc{loc_id}_s2.csv"
s2_csv_name

class_colname = 'Subclass2019'

proj_normpath = os.path.normpath(proj_path)
proj_dirname = proj_normpath.split(os.sep)[-1]
proj_name = re.sub("_classification$","",proj_dirname)
class_path = os.path.join(proj_path, proj_name + "_classification")
ts_path = os.path.join(proj_path, proj_name + "_download_timeseries")
pt_classes = pd.read_csv(os.path.join(class_path,"location_classification.csv"))
pt_classes = pt_classes[['loc_id', class_colname]].dropna()

pt_classes

Unnamed: 0,loc_id,Subclass2019
0,0,Plantation
1,1,Crop(Single)
2,2,Crop(Single)
3,3,Crop(Single)
4,4,Plantation
...,...,...
496,496,Crop(Single)
497,497,Crop(Single)
498,498,Plantation
499,499,Plantation


## Generate the torch tensor dataset

### Define function to read timeseries

* Read timeseries
* Filter timeseries to date range (+/- 60 days)
* Remove observations with clouds
* Take the mean value for each day (occurs when multiple overpasses happen on the same day)

In [137]:
# prep dataset
date_range = pd.to_datetime(['2019-06-01','2020-05-31'])

def prep_s2_loc(loc_id, date_range, proj_path):
    ts_path = os.path.join(proj_path,"Thailand_download_timeseries")
    s2_csv_name = f"pt_ts_loc{loc_id}_s2.csv"
    s2_csv_path = os.path.join(ts_path, s2_csv_name)
    s2_ts = pd.read_csv(s2_csv_path)

    # extract dates from image ids
    s2_ts['datestr'] = [re.sub("(^[0-9]+)[a-zA-Z].*","\\1",x) for x in s2_ts.image_id]
    s2_ts['date'] = pd.to_datetime(s2_ts.datestr, format = "%Y%m%d")

    # subset to cloud-free days AND within date_range
    s2_ts = s2_ts[(s2_ts.date >= date_range[0] - timedelta(days = 60)) & 
                  (s2_ts.date <= date_range[1] + timedelta(days = 60)) & 
                  (s2_ts.cloudmask == 0)]

    # calculate day from startday
    date_diff = (s2_ts.date - date_range[0])
    s2_ts['day'] = [x.days for x in date_diff]
    s2_ts['loc_id'] = loc_id

    # select only predictor and position columns, return tensor
    s2_ts_x = s2_ts[['loc_id','day','B8','B4','B3','B2']]
    return s2_ts_x

# s2_ts_loc125 = prep_s2_loc(125, date_range, proj_path)
# s2_ts_loc125.groupby(['loc_id','day'],as_index = False).mean()

### Get the torch tensor dataset (prep and save OR read)

In [151]:
from ipywidgets import IntProgress
from IPython.display import display

if os.path.exists(os.path.join(proj_path, 's2_ts_prepped.pt')):
    loc_ts_tor = torch.load(os.path.join(proj_path, 's2_ts_prepped.pt'))
    
else:
    f = IntProgress(min=0, max=pt_classes.shape[0]) # instantiate the bar
    display(f) # display the bar
    
    s2_ts_list = []
    loc_id_list = []
    for i in np.arange(pt_classes.shape[0]):
        # loc_id = 499
        # print(loc_id)
        loc_id = pt_classes.loc_id.iloc[i]
        # loc_id_list.append(loc_id)
        s2_ts_loc = prep_s2_loc(loc_id, date_range, proj_path)
        s2_ts_loc = s2_ts_loc.groupby(['loc_id','day'],as_index = False).mean()
        s2_ts_tor = torch.tensor(s2_ts_loc.to_numpy())
        s2_ts_list.append(s2_ts_tor)
        f.value += 1
        
    loc_ts_tor = torch.cat(s2_ts_list)

    torch.save(loc_ts_tor, os.path.join(proj_path, 's2_ts_prepped.pt'))

sys.getsizeof(loc_ts_tor)

72

### Prep the dataset tensors

* Subset to training classes (crops & plantations)
* Check max number of rows
* Normalize & center
* Split loc_id into training and test datasets

In [197]:
print('All classes')
print(pt_classes.groupby('Subclass2019').count())

train_classes = ['Crop(Double)','Crop(Single)','Plantation']
pt_classes_ag = pt_classes[pt_classes['Subclass2019'].isin(train_classes)]
print('\nTraining dataset\n',pt_classes_ag)

All classes
              loc_id
Subclass2019        
Crop(Double)      68
Crop(Single)     278
Forest             3
Golf               1
Mixed             20
Plantation       109
Unsure            17
Urban              1
Water              4

Training dataset
      loc_id  Subclass2019
0         0    Plantation
1         1  Crop(Single)
2         2  Crop(Single)
3         3  Crop(Single)
4         4    Plantation
..      ...           ...
496     496  Crop(Single)
497     497  Crop(Single)
498     498    Plantation
499     499    Plantation
500     500  Crop(Double)

[455 rows x 2 columns]


In [192]:
loc_ts_tor = loc_ts_tor[(loc_ts_tor[:,1] >= -30) & (loc_ts_tor[:,1] <= 395)]

row_means= loc_ts_tor.mean(dim = 1)#.shape #.unsqueeze(0).repeat(5,1)
loc_ts_tor = loc_ts_tor[~torch.isnan(row_means)]
col_means= loc_ts_tor.mean(dim = 0)#.shape #.unsqueeze(0).repeat(5,1)
col_std= loc_ts_tor.std(dim = 0)#.shape #.unsqueeze(0).repeat(5,1)
col_means[[0,1]] = 0
col_std[[0,1]] = 1

loc_ts_tor_std = col_std.unsqueeze(0).repeat(loc_ts_tor.shape[0],1)
loc_ts_tor_mean = col_means.unsqueeze(0).repeat(loc_ts_tor.shape[0],1)

loc_ts_norm = (loc_ts_tor - loc_ts_tor_mean) / loc_ts_tor_std

# get max of number of observations per location
# idx = np.arange(loc_ts_norm.shape[0])
loc_id = np.unique(loc_ts_norm[:,0])
num_obs = pd.DataFrame({'loc_id' : np.unique(loc_ts_norm[:,0]).astype('int')})
num_obs['num_obs'] = [loc_ts_norm[loc_ts_norm[:,0]==i,:].shape[0] for i in num_obs['loc_id']]
print("Max number of observations for any loc_id")
print(num_obs.iloc[[num_obs['num_obs'].idxmax()]])

Max number of observations for any loc_id
     loc_id  num_obs
481     481       94


In [198]:
loc_train = pt_classes_ag.groupby('Subclass2019', group_keys = False).apply(lambda x: x.sample(frac = 0.8))
loc_test = pt_classes_ag[~pt_classes_ag['loc_id'].isin(loc_train.loc_id)]
print('Training\n', loc_train.groupby('Subclass2019').count())
print('\nTesting\n', loc_test.groupby('Subclass2019').count())

Training
               loc_id
Subclass2019        
Crop(Double)      54
Crop(Single)     222
Plantation        87

Testing
               loc_id
Subclass2019        
Crop(Double)      14
Crop(Single)      56
Plantation        22


In [199]:
loc_train

Unnamed: 0,loc_id,Subclass2019
57,57,Crop(Double)
162,162,Crop(Double)
335,335,Crop(Double)
48,48,Crop(Double)
449,449,Crop(Double)
...,...,...
478,478,Plantation
39,39,Plantation
422,422,Plantation
99,99,Plantation


In [280]:
class s2Dataset(Dataset):
    """Sentinel 2 dataset"""
    
    def __init__(self, x_train, y_train):
        """
        Args:
            proj_path (string): path to manclassify project
        """
        self.x_train = x_train
        self.y_train = y_train
        # self.proj_path = proj_path
        # proj_normpath = os.path.normpath(proj_path)
        # proj_dirname = proj_normpath.split(os.sep)[-1]
        # self.proj_name = re.sub("_classification$","",proj_dirname)
        # self.class_path = os.path.join(proj_path, self.proj_name + "_classification")
        # self.ts_path = os.path.join(proj_path, self.proj_name + "_download_timeseries")
        # self.pt_classes = pd.read_csv(os.path.join(self.class_path,"location_classification.csv"))
        # self.pt_classes = classes[['loc_id', class_colname]].dropna()
        # self.classes = pd.unique(self.pt_classes[class_colname])
        # self.labels = self.pt_classes.assign(val = 1).pivot_table(columns = class_colname, index = 'loc_id', values = 'val', fill_value= 0)

    
    def __getitem__(self, idx):
        # get loc_id
        loc_id = self.y_train[idx,0]
        self.last_loc_id = loc_id
        
        # select location id
        x_loc = self.x_train[self.x_train[:,0]==loc_id]
        x = x_loc[:,1:] # remove loc_id column
        
        # get one-hot encoding for the point as tensor
        y =torch.tensor(y_train[idx,1:])
        
        return x, y
        
    def __len__(self):
        return self.pt_classes.shape[0]

In [284]:
y_train = (loc_train.assign(val = 1) \
  .pivot_table(columns = class_colname, index = 'loc_id', values = 'val', fill_value= 0) \
  .reset_index('loc_id').to_numpy())


# loc_ts_norm[:,0]
x_train = loc_ts_norm[torch.isin(loc_ts_norm[:,0],torch.tensor(y_train[:,0]).to(torch.float64)),:]

s2_dataset = s2Dataset(x_train = x_train, y_train = y_train)
x, y = s2_dataset.__getitem__(2)

print(x.shape)
print(y.shape)
# sys.getsizeof(x)

torch.Size([37, 5])
torch.Size([3])


In [None]:
dataloader = DataLoader(s2_dataset, batch_size = 10, shuffle = True)



In [None]:
len(dataloader)

## Old S2 pytorch dataset

In [None]:
# class s2Dataset(Dataset):
#     """Sentinel 2 dataset"""
    
#     def __init__(self, proj_path, class_colname):
#         """
#         Args:
#             proj_path (string): path to manclassify project
#         """
#         self.proj_path = proj_path
#         proj_normpath = os.path.normpath(proj_path)
#         proj_dirname = proj_normpath.split(os.sep)[-1]
#         self.proj_name = re.sub("_classification$","",proj_dirname)
#         self.class_path = os.path.join(proj_path, self.proj_name + "_classification")
#         self.ts_path = os.path.join(proj_path, self.proj_name + "_download_timeseries")
#         self.pt_classes = pd.read_csv(os.path.join(self.class_path,"location_classification.csv"))
#         self.pt_classes = classes[['loc_id', class_colname]].dropna()
#         # self.pt_classes['loc_id'] = self.pt_classes['loc_id'] + 10.5 # for testing index only
#         self.classes = pd.unique(self.pt_classes[class_colname])
#         self.labels = self.pt_classes.assign(val = 1).pivot_table(columns = class_colname, index = 'loc_id', values = 'val', fill_value= 0)

    
#     def __getitem__(self, idx):
#         loc_id = self.labels.index[idx]
#         self.last_loc_id = loc_id
        
#         # select location id
#         s2_ts_x = s2_ts[['B8','B4','B3','B2','day']]
#         x = torch.tensor(s2_ts_x.to_numpy())
        
#         # get one-hot encoding for the point as tensor
#         y = torch.tensor(self.labels.iloc[idx].to_numpy())
        
#         return x, y
        
#     def __len__(self):
#         return self.pt_classes.shape[0]


# proj_path = "/Users/gopal/Google Drive/_Research/Research projects/ML/manclassify/app_data/Thailand"
# # date_rangeX = pd.to_datetime(['2019-06-01','2020-05-31'])
# s2_dataset = s2Dataset(proj_path = proj_path, class_colname = 'Subclass2019')
# x = s2_dataset.__getitem__(10)
# sys.getsizeof(x)