In [1]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import re
import sys
from datetime import timedelta
# from torch.nn.functional import normalize

In [2]:

proj_paths = ["/Users/gopal/Google Drive/_Research/Research projects/ML/manclassify/app_data/Thailand",
              "/Users/gopalpenny/Library/CloudStorage/GoogleDrive-gopalpenny@gmail.com/My Drive/_Research/Research projects/ML/manclassify/app_data/Thailand"]

proj_path = [path for path in proj_paths if os.path.exists(path)][0]

class_path = os.path.join(proj_path,"Thailand_classification")
ts_path = os.path.join(proj_path,"Thailand_download_timeseries")
# pd.read_csv("
os.listdir(class_path)

loc_id = 0

s2_csv_name = f"pt_ts_loc{loc_id}_s2.csv"
s2_csv_name

class_colname = 'Subclass2019'

proj_normpath = os.path.normpath(proj_path)
proj_dirname = proj_normpath.split(os.sep)[-1]
proj_name = re.sub("_classification$","",proj_dirname)
class_path = os.path.join(proj_path, proj_name + "_classification")
ts_path = os.path.join(proj_path, proj_name + "_download_timeseries")
pt_classes = pd.read_csv(os.path.join(class_path,"location_classification.csv"))
pt_classes = pt_classes[['loc_id', class_colname]].dropna()

pt_classes

Unnamed: 0,loc_id,Subclass2019
0,0,Plantation
1,1,Crop(Single)
2,2,Crop(Single)
3,3,Crop(Single)
4,4,Plantation
...,...,...
496,496,Crop(Single)
497,497,Crop(Single)
498,498,Plantation
499,499,Plantation


## Generate the torch tensor dataset

### Define function to read timeseries

* Read timeseries
* Filter timeseries to date range (+/- 60 days)
* Remove observations with clouds
* Take the mean value for each day (occurs when multiple overpasses happen on the same day)

In [3]:
# prep dataset
date_range = pd.to_datetime(['2019-06-01','2020-05-31'])

def prep_s2_loc(loc_id, date_range, proj_path):
    ts_path = os.path.join(proj_path,"Thailand_download_timeseries")
    s2_csv_name = f"pt_ts_loc{loc_id}_s2.csv"
    s2_csv_path = os.path.join(ts_path, s2_csv_name)
    s2_ts = pd.read_csv(s2_csv_path)

    # extract dates from image ids
    s2_ts['datestr'] = [re.sub("(^[0-9]+)[a-zA-Z].*","\\1",x) for x in s2_ts.image_id]
    s2_ts['date'] = pd.to_datetime(s2_ts.datestr, format = "%Y%m%d")

    # subset to cloud-free days AND within date_range
    s2_ts = s2_ts[(s2_ts.date >= date_range[0] - timedelta(days = 60)) & 
                  (s2_ts.date <= date_range[1] + timedelta(days = 60)) & 
                  (s2_ts.cloudmask == 0)]

    # calculate day from startday
    date_diff = (s2_ts.date - date_range[0])
    s2_ts['day'] = [x.days for x in date_diff]
    s2_ts['loc_id'] = loc_id

    # select only predictor and position columns, return tensor
    s2_ts_x = s2_ts[['loc_id','day','B8','B4','B3','B2']]
    return s2_ts_x

# s2_ts_loc125 = prep_s2_loc(125, date_range, proj_path)
# s2_ts_loc125.groupby(['loc_id','day'],as_index = False).mean()

### Get the torch tensor dataset (prep and save OR read)

In [4]:
# from ipywidgets import IntProgress
# from IPython.display import display

if os.path.exists(os.path.join(proj_path, 's2_ts_prepped.pt')):
    loc_ts_tor = torch.load(os.path.join(proj_path, 's2_ts_prepped.pt'))
    
else:
    # f = IntProgress(min=0, max=pt_classes.shape[0]) # instantiate the bar
    display(f) # display the bar
    
    s2_ts_list = []
    loc_id_list = []
    for i in np.arange(pt_classes.shape[0]):
        # loc_id = 499
        # print(loc_id)
        loc_id = pt_classes.loc_id.iloc[i]
        # loc_id_list.append(loc_id)
        s2_ts_loc = prep_s2_loc(loc_id, date_range, proj_path)
        s2_ts_loc = s2_ts_loc.groupby(['loc_id','day'],as_index = False).mean()
        s2_ts_tor = torch.tensor(s2_ts_loc.to_numpy())
        s2_ts_list.append(s2_ts_tor)
        # f.value += 1
        
    loc_ts_tor = torch.cat(s2_ts_list)

    torch.save(loc_ts_tor, os.path.join(proj_path, 's2_ts_prepped.pt'))

sys.getsizeof(loc_ts_tor)

72

### Prep the dataset tensors

* Subset to training classes (crops & plantations)
* Check max number of rows
* Normalize & center
* Split loc_id into training and test datasets

In [15]:
print('All classes')
print(pt_classes.groupby('Subclass2019').count())

train_classes = ['Crop(Double)','Crop(Single)','Plantation']
pt_classes_ag = pt_classes[pt_classes['Subclass2019'].isin(train_classes)]
print('\nTraining dataset (pt_classes_ag)\n',pt_classes_ag)

All classes
              loc_id
Subclass2019        
Crop(Double)      68
Crop(Single)     278
Forest             3
Golf               1
Mixed             20
Plantation       109
Unsure            17
Urban              1
Water              4

Training dataset (pt_classes_ag)
      loc_id  Subclass2019
0         0    Plantation
1         1  Crop(Single)
2         2  Crop(Single)
3         3  Crop(Single)
4         4    Plantation
..      ...           ...
496     496  Crop(Single)
497     497  Crop(Single)
498     498    Plantation
499     499    Plantation
500     500  Crop(Double)

[455 rows x 2 columns]


In [6]:
loc_ts_tor = loc_ts_tor[(loc_ts_tor[:,1] >= -30) & (loc_ts_tor[:,1] <= 395)]

row_means= loc_ts_tor.mean(dim = 1)#.shape #.unsqueeze(0).repeat(5,1)
loc_ts_tor = loc_ts_tor[~torch.isnan(row_means)]
col_means= loc_ts_tor.mean(dim = 0)#.shape #.unsqueeze(0).repeat(5,1)
col_std= loc_ts_tor.std(dim = 0)#.shape #.unsqueeze(0).repeat(5,1)
col_means[[0,1]] = 0
col_std[[0,1]] = 1

loc_ts_tor_std = col_std.unsqueeze(0).repeat(loc_ts_tor.shape[0],1)
loc_ts_tor_mean = col_means.unsqueeze(0).repeat(loc_ts_tor.shape[0],1)

loc_ts_norm = (loc_ts_tor - loc_ts_tor_mean) / loc_ts_tor_std

# get max of number of observations per location
# idx = np.arange(loc_ts_norm.shape[0])
loc_id = np.unique(loc_ts_norm[:,0])
num_obs = pd.DataFrame({'loc_id' : np.unique(loc_ts_norm[:,0]).astype('int')})
num_obs['num_obs'] = [loc_ts_norm[loc_ts_norm[:,0]==i,:].shape[0] for i in num_obs['loc_id']]
print("Max number of observations for any loc_id")
print(num_obs.iloc[[num_obs['num_obs'].idxmax()]])

Max number of observations for any loc_id
     loc_id  num_obs
481     481       94


In [7]:
loc_train = pt_classes_ag.groupby('Subclass2019', group_keys = False).apply(lambda x: x.sample(frac = 0.8))
loc_train['n'] = loc_train.groupby('Subclass2019')['loc_id'].transform(len)
loc_train['weight'] = loc_train.shape[0] / loc_train['n'] 

loc_test = pt_classes_ag[~pt_classes_ag['loc_id'].isin(loc_train.loc_id)]
print('Training (loc_train summary)\n', loc_train.groupby('Subclass2019').count())
print('\nTesting (loc_test summary)\n', loc_test.groupby('Subclass2019').count())

SyntaxError: invalid syntax (3013741568.py, line 6)

In [8]:
loc_train

Unnamed: 0,loc_id,Subclass2019,n,weight
171,171,Crop(Double),54,6.722222
41,41,Crop(Double),54,6.722222
134,134,Crop(Double),54,6.722222
395,395,Crop(Double),54,6.722222
48,48,Crop(Double),54,6.722222
...,...,...,...,...
482,482,Plantation,87,4.172414
120,120,Plantation,87,4.172414
17,17,Plantation,87,4.172414
380,380,Plantation,87,4.172414


In [17]:
class s2Dataset(Dataset):
    """Sentinel 2 dataset"""
    
    def __init__(self, x_train, y_train):
        """
        Args:
            x_train (tensor): contains loc_id and predictors as columns, s2 observations as rows
            y_train (tensor): contains loc_id as rows, weights and class as 1-hot columns
        """
        self.x_train = x_train
        self.y_train = y_train
        # self.proj_path = proj_path
        # proj_normpath = os.path.normpath(proj_path)
        # proj_dirname = proj_normpath.split(os.sep)[-1]
        # self.proj_name = re.sub("_classification$","",proj_dirname)
        # self.class_path = os.path.join(proj_path, self.proj_name + "_classification")
        # self.ts_path = os.path.join(proj_path, self.proj_name + "_download_timeseries")
        # self.pt_classes = pd.read_csv(os.path.join(self.class_path,"location_classification.csv"))
        # self.pt_classes = classes[['loc_id', class_colname]].dropna()
        # self.classes = pd.unique(self.pt_classes[class_colname])
        # self.labels = self.pt_classes.assign(val = 1).pivot_table(columns = class_colname, index = 'loc_id', values = 'val', fill_value= 0)

    
    def __getitem__(self, idx):
        # get loc_id
        loc_id = self.y_train[idx,0]
        self.last_loc_id = loc_id
        
        # select location id
        x_loc = self.x_train[self.x_train[:,0]==loc_id]
        x = x_loc[:,1:] # remove loc_id column
        
        # get one-hot encoding for the point as tensor
        y = torch.tensor(y_train[idx,2:])
        
        return x, y
        
    def __len__(self):
        return self.y_train.shape[0]

### get training data

* `y_train` directly from `loc_train` & pivot
* `x_train` from `loc_ts_norm`, subset to `y_train[:,0]`

In [None]:
# get y_train values from loc_train
y_train_df = (loc_train.assign(val = 1) \
  .pivot_table(columns = class_colname, index = ['loc_id','weight'], values = 'val', fill_value= 0) \
  .reset_index(['loc_id','weight']))
y_train = y_train_df.to_numpy()
print('y_train:\n',y_train)

y_train = y_train_df.to_numpy()

# get x_train values from loc_ts_norm (based on loc_id)
x_train = loc_ts_norm[torch.isin(loc_ts_norm[:,0],torch.tensor(y_train[:,0]).to(torch.float64)),:]

### build pytorch dataset: `s2_dateset`

In [None]:
s2_dataset = s2Dataset(x_train = x_train, y_train = y_train)

# example item in dataset
idx_test = 1
x, y = s2_dataset.__getitem__(idx_test)

print(f'x example (idx={idx_test}): day, B8, B4, B3, B2\n',x)
print(x.shape)
print(f'\n\ny example (idx={idx_test}): crops(double) crops(single) plantation\n',y)
print(y.shape)
# sys.getsizeof(x)

y_train:
 [[  0.           4.17241379   0.           0.           1.        ]
 [  1.           1.63513514   0.           1.           0.        ]
 [  5.           6.72222222   1.           0.           0.        ]
 ...
 [497.           1.63513514   0.           1.           0.        ]
 [498.           4.17241379   0.           0.           1.        ]
 [499.           4.17241379   0.           0.           1.        ]]
x example (idx=1): day, B8, B4, B3, B2
 tensor([[-1.2000e+01,  7.3413e-01,  2.8951e+00,  3.5055e+00,  3.1997e+00],
        [-7.0000e+00, -9.7997e-01,  7.5053e-01,  3.9384e-01,  4.4937e-01],
        [ 1.0800e+02, -7.0083e-01, -1.1618e+00, -1.1620e+00, -1.1403e+00],
        [ 1.1300e+02, -7.6956e-01, -1.2326e+00, -1.2688e+00, -1.2431e+00],
        [ 1.2800e+02, -6.0825e-01, -1.0004e+00, -8.2507e-01, -8.3469e-01],
        [ 1.3800e+02, -9.3459e-02, -8.5873e-01, -4.8542e-01, -2.1763e-01],
        [ 1.4300e+02, -5.8861e-01, -1.1552e+00, -1.1455e+00, -1.0903e+00],
        [ 1

### generate sampling weights for data loader

In [27]:
loc_train_n = loc_train
loc_train_n['n'] = loc_train_n.groupby('Subclass2019')['loc_id'].transform(len)
loc_train_n['weight'] = loc_train_n.shape[0] / loc_train_n['n'] 
loc_train_n

Unnamed: 0,loc_id,Subclass2019,n,weight
171,171,Crop(Double),54,6.722222
41,41,Crop(Double),54,6.722222
134,134,Crop(Double),54,6.722222
395,395,Crop(Double),54,6.722222
48,48,Crop(Double),54,6.722222
...,...,...,...,...
482,482,Plantation,87,4.172414
120,120,Plantation,87,4.172414
17,17,Plantation,87,4.172414
380,380,Plantation,87,4.172414


In [24]:
dataloader = DataLoader(s2_dataset, batch_size = 4, shuffle = True)

In [26]:
len(dataloader)

91

In [None]:
class TransformerNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        

## Old S2 pytorch dataset

In [None]:
# class s2Dataset(Dataset):
#     """Sentinel 2 dataset"""
    
#     def __init__(self, proj_path, class_colname):
#         """
#         Args:
#             proj_path (string): path to manclassify project
#         """
#         self.proj_path = proj_path
#         proj_normpath = os.path.normpath(proj_path)
#         proj_dirname = proj_normpath.split(os.sep)[-1]
#         self.proj_name = re.sub("_classification$","",proj_dirname)
#         self.class_path = os.path.join(proj_path, self.proj_name + "_classification")
#         self.ts_path = os.path.join(proj_path, self.proj_name + "_download_timeseries")
#         self.pt_classes = pd.read_csv(os.path.join(self.class_path,"location_classification.csv"))
#         self.pt_classes = classes[['loc_id', class_colname]].dropna()
#         # self.pt_classes['loc_id'] = self.pt_classes['loc_id'] + 10.5 # for testing index only
#         self.classes = pd.unique(self.pt_classes[class_colname])
#         self.labels = self.pt_classes.assign(val = 1).pivot_table(columns = class_colname, index = 'loc_id', values = 'val', fill_value= 0)

    
#     def __getitem__(self, idx):
#         loc_id = self.labels.index[idx]
#         self.last_loc_id = loc_id
        
#         # select location id
#         s2_ts_x = s2_ts[['B8','B4','B3','B2','day']]
#         x = torch.tensor(s2_ts_x.to_numpy())
        
#         # get one-hot encoding for the point as tensor
#         y = torch.tensor(self.labels.iloc[idx].to_numpy())
        
#         return x, y
        
#     def __len__(self):
#         return self.pt_classes.shape[0]


# proj_path = "/Users/gopal/Google Drive/_Research/Research projects/ML/manclassify/app_data/Thailand"
# # date_rangeX = pd.to_datetime(['2019-06-01','2020-05-31'])
# s2_dataset = s2Dataset(proj_path = proj_path, class_colname = 'Subclass2019')
# x = s2_dataset.__getitem__(10)
# sys.getsizeof(x)