In [1]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import re
import sys
from datetime import timedelta
# from torch.nn.functional import normalize

In [2]:

proj_paths = ["/Users/gopal/Google Drive/_Research/Research projects/ML/manclassify/app_data/Thailand",
              "/Users/gopalpenny/Library/CloudStorage/GoogleDrive-gopalpenny@gmail.com/My Drive/_Research/Research projects/ML/manclassify/app_data/Thailand"]

proj_path = [path for path in proj_paths if os.path.exists(path)][0]

class_path = os.path.join(proj_path,"Thailand_classification")
ts_path = os.path.join(proj_path,"Thailand_download_timeseries")
# pd.read_csv("
os.listdir(class_path)

loc_id = 0

s2_csv_name = f"pt_ts_loc{loc_id}_s2.csv"
s2_csv_name

class_colname = 'Subclass2019'

proj_normpath = os.path.normpath(proj_path)
proj_dirname = proj_normpath.split(os.sep)[-1]
proj_name = re.sub("_classification$","",proj_dirname)
class_path = os.path.join(proj_path, proj_name + "_classification")
ts_path = os.path.join(proj_path, proj_name + "_download_timeseries")
pt_classes = pd.read_csv(os.path.join(class_path,"location_classification.csv"))
pt_classes = pt_classes[['loc_id', class_colname]].dropna()

pt_classes

Unnamed: 0,loc_id,Subclass2019
0,0,Plantation
1,1,Crop(Single)
2,2,Crop(Single)
3,3,Crop(Single)
4,4,Plantation
...,...,...
496,496,Crop(Single)
497,497,Crop(Single)
498,498,Plantation
499,499,Plantation


## Generate the torch tensor dataset

### Define function to read timeseries

* Read timeseries
* Filter timeseries to date range (+/- 60 days)
* Remove observations with clouds
* Take the mean value for each day (occurs when multiple overpasses happen on the same day)

In [3]:
# prep dataset
date_range = pd.to_datetime(['2019-06-01','2020-05-31'])

def prep_s2_loc(loc_id, date_range, proj_path):
    ts_path = os.path.join(proj_path,"Thailand_download_timeseries")
    s2_csv_name = f"pt_ts_loc{loc_id}_s2.csv"
    s2_csv_path = os.path.join(ts_path, s2_csv_name)
    s2_ts = pd.read_csv(s2_csv_path)

    # extract dates from image ids
    s2_ts['datestr'] = [re.sub("(^[0-9]+)[a-zA-Z].*","\\1",x) for x in s2_ts.image_id]
    s2_ts['date'] = pd.to_datetime(s2_ts.datestr, format = "%Y%m%d")

    # subset to cloud-free days AND within date_range
    s2_ts = s2_ts[(s2_ts.date >= date_range[0] - timedelta(days = 60)) & 
                  (s2_ts.date <= date_range[1] + timedelta(days = 60)) & 
                  (s2_ts.cloudmask == 0)]

    # calculate day from startday
    date_diff = (s2_ts.date - date_range[0])
    s2_ts['day'] = [x.days for x in date_diff]
    s2_ts['loc_id'] = loc_id

    # select only predictor and position columns, return tensor
    s2_ts_x = s2_ts[['loc_id','day','B8','B4','B3','B2']]
    return s2_ts_x

# s2_ts_loc125 = prep_s2_loc(125, date_range, proj_path)
# s2_ts_loc125.groupby(['loc_id','day'],as_index = False).mean()

### Get the torch tensor dataset (prep and save OR read)

In [4]:
# from ipywidgets import IntProgress
# from IPython.display import display

if os.path.exists(os.path.join(proj_path, 's2_ts_prepped.pt')):
    loc_ts_tor = torch.load(os.path.join(proj_path, 's2_ts_prepped.pt'))
    
else:
    # f = IntProgress(min=0, max=pt_classes.shape[0]) # instantiate the bar
    display(f) # display the bar
    
    s2_ts_list = []
    loc_id_list = []
    for i in np.arange(pt_classes.shape[0]):
        # loc_id = 499
        # print(loc_id)
        loc_id = pt_classes.loc_id.iloc[i]
        # loc_id_list.append(loc_id)
        s2_ts_loc = prep_s2_loc(loc_id, date_range, proj_path)
        s2_ts_loc = s2_ts_loc.groupby(['loc_id','day'],as_index = False).mean()
        s2_ts_tor = torch.tensor(s2_ts_loc.to_numpy())
        s2_ts_list.append(s2_ts_tor)
        # f.value += 1
        
    loc_ts_tor = torch.cat(s2_ts_list)

    torch.save(loc_ts_tor, os.path.join(proj_path, 's2_ts_prepped.pt'))

sys.getsizeof(loc_ts_tor)

72

### Prep the dataset tensors

* Subset to training classes (crops & plantations)
* Check max number of rows
* Normalize & center
* Split loc_id into training and test datasets

In [5]:
print('All classes')
print(pt_classes.groupby('Subclass2019').count())

train_classes = ['Crop(Double)','Crop(Single)','Plantation']
pt_classes_ag = pt_classes[pt_classes['Subclass2019'].isin(train_classes)]
print('\nTraining dataset (pt_classes_ag)\n',pt_classes_ag)

All classes
              loc_id
Subclass2019        
Crop(Double)      68
Crop(Single)     278
Forest             3
Golf               1
Mixed             20
Plantation       109
Unsure            17
Urban              1
Water              4

Training dataset (pt_classes_ag)
      loc_id  Subclass2019
0         0    Plantation
1         1  Crop(Single)
2         2  Crop(Single)
3         3  Crop(Single)
4         4    Plantation
..      ...           ...
496     496  Crop(Single)
497     497  Crop(Single)
498     498    Plantation
499     499    Plantation
500     500  Crop(Double)

[455 rows x 2 columns]


In [6]:
loc_ts_tor = loc_ts_tor[(loc_ts_tor[:,1] >= -30) & (loc_ts_tor[:,1] <= 395)]

row_means= loc_ts_tor.mean(dim = 1)#.shape #.unsqueeze(0).repeat(5,1)
loc_ts_tor = loc_ts_tor[~torch.isnan(row_means)]
col_means= loc_ts_tor.mean(dim = 0)#.shape #.unsqueeze(0).repeat(5,1)
col_std= loc_ts_tor.std(dim = 0)#.shape #.unsqueeze(0).repeat(5,1)
col_means[[0,1]] = 0
col_std[[0,1]] = 1

loc_ts_tor_std = col_std.unsqueeze(0).repeat(loc_ts_tor.shape[0],1)
loc_ts_tor_mean = col_means.unsqueeze(0).repeat(loc_ts_tor.shape[0],1)

loc_ts_norm = (loc_ts_tor - loc_ts_tor_mean) / loc_ts_tor_std

# get max of number of observations per location
# idx = np.arange(loc_ts_norm.shape[0])
loc_id = np.unique(loc_ts_norm[:,0])
num_obs = pd.DataFrame({'loc_id' : np.unique(loc_ts_norm[:,0]).astype('int')})
num_obs['num_obs'] = [loc_ts_norm[loc_ts_norm[:,0]==i,:].shape[0] for i in num_obs['loc_id']]
print("Max number of observations for any loc_id")
print(num_obs.iloc[[num_obs['num_obs'].idxmax()]])

Max number of observations for any loc_id
     loc_id  num_obs
481     481       94


In [7]:
loc_train = pt_classes_ag.groupby('Subclass2019', group_keys = False).apply(lambda x: x.sample(frac = 0.8))
loc_train['n'] = loc_train.groupby('Subclass2019')['loc_id'].transform(len)
loc_train['weight'] = loc_train.shape[0] / loc_train['n'] 

loc_test = pt_classes_ag[~pt_classes_ag['loc_id'].isin(loc_train.loc_id)]
print('Training (loc_train summary)\n', loc_train.groupby('Subclass2019').count())
print('\nTesting (loc_test summary)\n', loc_test.groupby('Subclass2019').count())

Training (loc_train summary)
               loc_id    n  weight
Subclass2019                     
Crop(Double)      54   54      54
Crop(Single)     222  222     222
Plantation        87   87      87

Testing (loc_test summary)
               loc_id
Subclass2019        
Crop(Double)      14
Crop(Single)      56
Plantation        22


In [8]:
loc_train

Unnamed: 0,loc_id,Subclass2019,n,weight
166,166,Crop(Double),54,6.722222
279,279,Crop(Double),54,6.722222
172,172,Crop(Double),54,6.722222
65,65,Crop(Double),54,6.722222
445,445,Crop(Double),54,6.722222
...,...,...,...,...
124,124,Plantation,87,4.172414
221,221,Plantation,87,4.172414
78,78,Plantation,87,4.172414
454,454,Plantation,87,4.172414


In [9]:
foo = np.random.rand(4,3)

np.sum(foo[:,0] > 0.4)

3

In [10]:
# loc_ids_all = np.unique(x_train[:,0])
# loc_ids_all

# np.unique(loc_train['n'])
# i = loc_ids_all[0]
# [i for i in loc_ids_all]
# torch.tensor.to_numpy(x_train[:,0] == i)
# np.sum(x_train[:,0] == i)
# [np.sum(x_train[:,0] == i) for i in np.unique(x_train[:,0])]
# for i in loc_ids_all:
    

In [127]:
class s2Dataset(Dataset):
    """Sentinel 2 dataset"""
    
    def __init__(self, x_train, y_train, max_obs):
        """
        Args:
            x_train (tensor): contains loc_id and predictors as columns, s2 observations as rows
            y_train (tensor): contains loc_id as rows, weights and class as 1-hot columns
        """
        self.x_train = x_train
        self.y_train = y_train
        self.max_obs = max_obs
        # self.proj_path = proj_path
        # proj_normpath = os.path.normpath(proj_path)
        # proj_dirname = proj_normpath.split(os.sep)[-1]
        # self.proj_name = re.sub("_classification$","",proj_dirname)
        # self.class_path = os.path.join(proj_path, self.proj_name + "_classification")
        # self.ts_path = os.path.join(proj_path, self.proj_name + "_download_timeseries")
        # self.pt_classes = pd.read_csv(os.path.join(self.class_path,"location_classification.csv"))
        # self.pt_classes = classes[['loc_id', class_colname]].dropna()
        # self.classes = pd.unique(self.pt_classes[class_colname])
        # self.labels = self.pt_classes.assign(val = 1).pivot_table(columns = class_colname, index = 'loc_id', values = 'val', fill_value= 0)

    
    def __getitem__(self, idx):
        # get loc_id
        loc_id = self.y_train[idx,0]
        self.last_loc_id = loc_id
        
        # select location id
        x_loc = self.x_train[self.x_train[:,0]==loc_id]
        x_prep = x_loc[:,1:] # remove loc_id column
        
        # pad zeros to max_obs
        n_pad = self.max_obs - x_prep.shape[0]
        
        x = torch.cat((x_prep, torch.zeros(n_pad, x_prep.shape[1])), dim = 0)
        
        x = x.float()
        
        
        
        # get one-hot encoding for the point as tensor
        y = torch.tensor(y_train[idx,2:])
        
        return x, y
        
    def __len__(self):
        return self.y_train.shape[0]

In [12]:
torch.random.manual_seed(100)
a = torch.rand(2,3)
torch.cat((a,a), 1)

tensor([[0.1117, 0.8158, 0.2626, 0.1117, 0.8158, 0.2626],
        [0.4839, 0.6765, 0.7539, 0.4839, 0.6765, 0.7539]])

### get training data

* `y_train` directly from `loc_train` & pivot
* `x_train` from `loc_ts_norm`, subset to `y_train[:,0]`

In [13]:
# get y_train values from loc_train
y_train_df = (loc_train.assign(val = 1) \
  .pivot_table(columns = class_colname, index = ['loc_id','weight'], values = 'val', fill_value= 0) \
  .reset_index(['loc_id','weight']))
y_train = y_train_df.to_numpy()
print('y_train:\n',y_train)

y_train = y_train_df.to_numpy()

# get x_train values from loc_ts_norm (based on loc_id)
x_train = loc_ts_norm[torch.isin(loc_ts_norm[:,0],torch.tensor(y_train[:,0]).to(torch.float64)),:]

y_train:
 [[  0.           4.17241379   0.           0.           1.        ]
 [  1.           1.63513514   0.           1.           0.        ]
 [  2.           1.63513514   0.           1.           0.        ]
 ...
 [497.           1.63513514   0.           1.           0.        ]
 [498.           4.17241379   0.           0.           1.        ]
 [499.           4.17241379   0.           0.           1.        ]]


### build pytorch dataset: `s2_dateset`

In [128]:
s2_dataset = s2Dataset(x_train = x_train, y_train = y_train, max_obs = 100)

# example item in dataset
idx_test = 2
x, y = s2_dataset.__getitem__(idx_test)

print(f'x example, shape: {x.shape} \n(idx={idx_test}) columns: day, B8, B4, B3, B2\n',x)
# print()
print(f'\n\ny example (idx={idx_test}): crops(double) crops(single) plantation\n',y)
print(y.shape)
# sys.getsizeof(x)

x example, shape: torch.Size([100, 5]) 
(idx=2) columns: day, B8, B4, B3, B2
 tensor([[-1.2000e+01,  1.2615e+00, -2.6081e-01,  3.0071e-01,  5.2696e-02],
        [-7.0000e+00, -3.9504e-01,  3.6840e-01,  6.7323e-01,  7.4909e-01],
        [ 3.0000e+00,  4.5408e-02,  1.8113e+00,  2.3523e+00,  2.2565e+00],
        [ 1.0800e+02, -1.0164e+00, -8.9497e-01, -7.8398e-01, -5.6436e-01],
        [ 1.1800e+02, -1.7902e-01, -1.1190e+00, -9.3737e-01, -9.5810e-01],
        [ 1.2300e+02, -4.3852e-01, -1.1486e+00, -9.7298e-01, -8.8758e-01],
        [ 1.2800e+02,  1.0011e-01, -1.1602e+00, -8.7985e-01, -1.0991e+00],
        [ 1.3300e+02,  4.2694e-01, -9.9709e-01, -5.8403e-01, -3.2929e-01],
        [ 1.4300e+02,  4.5780e-01, -1.3430e+00, -1.3044e+00, -1.2813e+00],
        [ 1.6300e+02,  7.8603e-01, -6.9072e-01, -9.6463e-02, -6.3488e-01],
        [ 1.6800e+02,  5.8545e-01,  8.6584e-01,  1.0786e+00,  7.2852e-01],
        [ 1.7300e+02, -5.0893e-03,  8.2136e-01,  5.0888e-01,  3.4947e-01],
        [ 1.7800e+02, 

### generate sampling weights for data loader

In [15]:
loc_train_n = loc_train
loc_train_n['n'] = loc_train_n.groupby('Subclass2019')['loc_id'].transform(len)
loc_train_n['weight'] = loc_train_n.shape[0] / loc_train_n['n'] 
loc_train_n

Unnamed: 0,loc_id,Subclass2019,n,weight
166,166,Crop(Double),54,6.722222
279,279,Crop(Double),54,6.722222
172,172,Crop(Double),54,6.722222
65,65,Crop(Double),54,6.722222
445,445,Crop(Double),54,6.722222
...,...,...,...,...
124,124,Plantation,87,4.172414
221,221,Plantation,87,4.172414
78,78,Plantation,87,4.172414
454,454,Plantation,87,4.172414


In [129]:
dataloader = DataLoader(s2_dataset, batch_size = 5, shuffle = True)

In [17]:
len(dataloader)

73

In [124]:
train_features, train_labels = next(iter(dataloader))
tf_test = train_features[:,:,:]
# tf_test
# train_labels
# tf_test
tf_test = tf_test.float()
print(tf_test.shape)

print(tf_test[0, 0:3, :])

torch.Size([5, 100, 5])
tensor([[-1.3000e+01,  9.9363e-01, -1.2507e+00, -1.1154e+00, -9.3165e-01],
        [ 2.0000e+00, -5.6989e-02, -5.6439e-04,  6.8418e-01,  8.0492e-01],
        [ 7.0000e+00,  1.6108e+00, -1.2738e+00, -9.8120e-01, -9.2872e-01]])


In [100]:
train_labels.shape

torch.Size([5, 3])

In [19]:
# class PositionalEncoding(nn.Module):

#     def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
#         super().__init__()
#         self.dropout = nn.Dropout(p=dropout)

#         position = torch.arange(max_len).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
#         pe = torch.zeros(max_len, 1, d_model)
#         pe[:, 0, 0::2] = torch.sin(position * div_term)
#         pe[:, 0, 1::2] = torch.cos(position * div_term)
#         self.register_buffer('pe', pe)

#     def forward(self, x: Tensor) -> Tensor:
#         """
#         Args:
#             x: Tensor, shape [seq_len, batch_size, embedding_dim]
#         """
#         x = x + self.pe[:x.size(0)]
#         return self.dropout(x)

In [101]:
import torch.nn as nn


nhead = 6 # number of attention heads
head_dim = 8 # dimension of each word for each attention head
dmodel = nhead * head_dim # embed_dim -- each word (row) is embedded to this dimension then split
# across the nhead attention heads

data_in = tf_test[:, :, 1:] # select only the data
positions = tf_test[:,:,0:1] # split out positional data
data_dim = data_in.shape[-1]

In [21]:
torch.exp(torch.tensor([5.2333e-01]))/torch.sum(torch.exp(torch.tensor([-1.3249e-01, 5.2333e-01, -2.9124e-01])))

tensor([0.5097])

In [189]:
from torch import nn, Tensor
class TransformerClassifier(nn.Module):
    def __init__(self, ntoken: int, dmodel: int, nhead: int, dhid: int, 
                 nlayers: int, data_dim: int, nclasses: int):
        """
        data_dim: dimension of data (i.e., num of columns) including position as first dimension
        """
        super().__init__()
        self.positional_layer = nn.Linear(1, dmodel)
        self.embed_layer = nn.Linear(data_dim - 1, dmodel) # transform data to embed dimension (dmodel)
        
        # dim_feedforward: https://stackoverflow.com/questions/68087780/pytorch-transformer-argument-dim-feedforward
        # shortly: dim_feedforward is a hidden layer between two forward layers at the end of the encoder layer, passed for each word one-by-one
        self.encoderlayer = nn.TransformerEncoderLayer(d_model = dmodel, nhead = nhead, dim_feedforward = dhid)
        self.encoder = nn.TransformerEncoder(self.encoderlayer, nlayers)
        
        self.num_params = ntoken * dmodel
        
        self.class_encoder = nn.Linear(dmodel, nclasses)
    
    def forward(self, src: Tensor) -> Tensor:
        
        positions = src[:, :, 0:1]
        data = src[:, :, 1:]
        pe = self.positional_layer(positions)
        data_embed = self.embed_layer(data)
        data_and_pe = pe + data_embed
        encoder_out = self.encoder(data_and_pe)
        
        maxpool = torch.max(encoder_out,dim = 1)[0]
        
        # softmax ensures output of model is probability of class membership -- which sum to 1
        # BUT this is already done with CrossEntropyLoss so it's not necessary for this loss function
        classes = self.class_encoder(maxpool), dim = 1
        
        # classes = nn.functional.softmax(classes, 1) # don't use softmax with cross entropy loss... or do?
        # don't: https://stackoverflow.com/questions/55675345/should-i-use-softmax-as-output-when-using-cross-entropy-loss-in-pytorch
        # do: Machine Learning with Pytorch and Scikitlearn (p 471: Loss functions for classifiers) -- BUT NOT WITH CROSS ENTROPY LOSS (p478
        
        return classes

        # data_in = tf_test[:, :, 1:] # select only the data
        # positions = tf_test[:,:,0:1] # split out positional data
        # data_dim = data_in.shape[-1]
        
        
tfnetwork = TransformerClassifier(100, dmodel = 36, nhead = 6, dhid = 100, nlayers = 3, data_dim = 5, nclasses = 3)

tfnetwork(tf_test).shape

torch.Size([5, 3])

In [185]:
from torchinfo import summary
print(tuple(tf_test.shape))
summary(tfnetwork, input_size = (5, 100, 5))

(5, 100, 5)


Layer (type:depth-idx)                        Output Shape              Param #
TransformerClassifier                         [5, 3]                    12,808
├─Linear: 1-1                                 [5, 100, 36]              72
├─Linear: 1-2                                 [5, 100, 36]              180
├─TransformerEncoder: 1-3                     [5, 100, 36]              --
│    └─ModuleList: 2-1                        --                        --
│    │    └─TransformerEncoderLayer: 3-1      [5, 100, 36]              12,808
│    │    └─TransformerEncoderLayer: 3-2      [5, 100, 36]              12,808
│    │    └─TransformerEncoderLayer: 3-3      [5, 100, 36]              12,808
├─Linear: 1-4                                 [5, 3]                    111
Total params: 51,595
Trainable params: 51,595
Non-trainable params: 0
Total mult-adds (M): 0.11
Input size (MB): 0.01
Forward/backward pass size (MB): 2.78
Params size (MB): 0.09
Estimated Total Size (MB): 2.89

In [198]:
train_features, train_labels = next(iter(dataloader))

tfnetwork = TransformerClassifier(100, dmodel = 36, nhead = 6, dhid = 100, nlayers = 3, data_dim = 5, nclasses = 3)

train_out = tfnetwork(train_features)



loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(tfnetwork.parameters(), lr = 0.001)

print(train_out)
print(train_labels)
pred = torch.argmax(train_out, dim = 1)
actual = torch.argmax(train_labels, dim = 1)

print(pred == actual)
print(torch.sum(pred == actual))
tfnetwork.train()
loss = loss_fn(train_out, train_labels)
loss.backward()
optimizer.step()
optimizer.zero_grad()
# tf_train
# tf_test.shape

tensor([[0.1145, 0.7564, 0.1291],
        [0.1723, 0.6622, 0.1655],
        [0.1269, 0.6999, 0.1732],
        [0.1711, 0.6840, 0.1449],
        [0.2155, 0.6460, 0.1385]], grad_fn=<SoftmaxBackward0>)
tensor([[1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.]], dtype=torch.float64)
tensor([False, False,  True,  True,  True])
tensor(3)


In [176]:
tfnetwork

TransformerClassifier(
  (positional_layer): Linear(in_features=1, out_features=36, bias=True)
  (embed_layer): Linear(in_features=4, out_features=36, bias=True)
  (encoderlayer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
    )
    (linear1): Linear(in_features=36, out_features=100, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=100, out_features=36, bias=True)
    (norm1): LayerNorm((36,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((36,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=36, out_features=36, bias=True)
        )
       

In [40]:
tfnetwork_out = tfnetwork(tf_test)
torch.max(tfnetwork_out,dim = 1)[0].shape

torch.Size([5, 36])

In [25]:
torch.triu(torch.ones(4, 4) * float('-inf'), diagonal=0)

tensor([[-inf, -inf, -inf, -inf],
        [0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf]])

In [41]:
dmodel

48

## Old S2 pytorch dataset

In [None]:
# class s2Dataset(Dataset):
#     """Sentinel 2 dataset"""
    
#     def __init__(self, proj_path, class_colname):
#         """
#         Args:
#             proj_path (string): path to manclassify project
#         """
#         self.proj_path = proj_path
#         proj_normpath = os.path.normpath(proj_path)
#         proj_dirname = proj_normpath.split(os.sep)[-1]
#         self.proj_name = re.sub("_classification$","",proj_dirname)
#         self.class_path = os.path.join(proj_path, self.proj_name + "_classification")
#         self.ts_path = os.path.join(proj_path, self.proj_name + "_download_timeseries")
#         self.pt_classes = pd.read_csv(os.path.join(self.class_path,"location_classification.csv"))
#         self.pt_classes = classes[['loc_id', class_colname]].dropna()
#         # self.pt_classes['loc_id'] = self.pt_classes['loc_id'] + 10.5 # for testing index only
#         self.classes = pd.unique(self.pt_classes[class_colname])
#         self.labels = self.pt_classes.assign(val = 1).pivot_table(columns = class_colname, index = 'loc_id', values = 'val', fill_value= 0)

    
#     def __getitem__(self, idx):
#         loc_id = self.labels.index[idx]
#         self.last_loc_id = loc_id
        
#         # select location id
#         s2_ts_x = s2_ts[['B8','B4','B3','B2','day']]
#         x = torch.tensor(s2_ts_x.to_numpy())
        
#         # get one-hot encoding for the point as tensor
#         y = torch.tensor(self.labels.iloc[idx].to_numpy())
        
#         return x, y
        
#     def __len__(self):
#         return self.pt_classes.shape[0]


# proj_path = "/Users/gopal/Google Drive/_Research/Research projects/ML/manclassify/app_data/Thailand"
# # date_rangeX = pd.to_datetime(['2019-06-01','2020-05-31'])
# s2_dataset = s2Dataset(proj_path = proj_path, class_colname = 'Subclass2019')
# x = s2_dataset.__getitem__(10)
# sys.getsizeof(x)