<a href="https://colab.research.google.com/github/gangaraju09/map_generalisation_ml/blob/main/map_generalization_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset, WeightedRandomSampler
import pandas as pd
import numpy as np
import csv
torch.manual_seed(40)

<torch._C.Generator at 0x7fd5401ace10>

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
idaho_df = pd.read_csv('/content/drive/MyDrive/Vertices_Labels/Idaho_vertices_labels.csv')
labels_unique, count = np.unique(idaho_df['case'], return_counts=True)

print(f"Number of samples with their counts are: {labels_unique},{count}")

class_weights = [sum(count)/c for c in count]
print(f"Class weights needed for resampling: {class_weights}")
# mapping = {'no': 0, 'yes': 1}

Number of samples with their counts are: ['no' 'yes'],[35135   107]
Class weights needed for resampling: [1.0030453963284474, 329.3644859813084]


In [34]:
# Define a custom dataset class to load the CSV file
class CustomDataset(Dataset):
    def __init__(self, csv_file):
      # We read csv file to data.
      # 'No' is mapped to 0 and 'Yes' is mapped to 1 to send it as input to Binary cross entropy loss
        self.data = pd.read_csv(csv_file)
        self.mapping = {'no': 0, 'yes': 1}
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
      # Get the lat, long and output of a point
        sample = {'lat': self.data.iloc[idx]['Latitude'],
                  'long': self.data.iloc[idx]['Longitude'],
                  'output': self.mapping[self.data.iloc[idx]['case']]}
        return sample

# Define the neural network architecture
# Declared a simple NN with 2 -> 5 -> 10 -> 5 -> 1 with Sigmoid activation function
# TODOs: 
# 1. Experiment with ReLU
# 2. Try Batch Normalization 
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(2, 5)
        self.bn1 = nn.BatchNorm1d(5)
        self.fc2 = nn.Linear(5, 10)
        self.bn2 = nn.BatchNorm1d(10)
        self.fc3 = nn.Linear(10, 20)
        self.bn3 = nn.BatchNorm1d(20)
        self.fc4 = nn.Linear(20, 50)
        self.bn4 = nn.BatchNorm1d(50)
        self.fc5 = nn.Linear(50, 20)
        self.bn5 = nn.BatchNorm1d(20)
        self.fc6 = nn.Linear(20, 10)
        self.bn6 = nn.BatchNorm1d(10)
        self.fc7 = nn.Linear(10, 5)
        self.bn7 = nn.BatchNorm1d(5)
        self.fc8 = nn.Linear(5, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.sigmoid(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.sigmoid(x)
        x = self.fc3(x)
        x = self.bn3(x)
        x = self.sigmoid(x)
        x = self.fc4(x)
        x = self.bn4(x)
        x = self.sigmoid(x)
        x = self.fc5(x)
        x = self.bn5(x)
        x = self.sigmoid(x)
        x = self.fc6(x)
        x = self.bn6(x)
        x = self.sigmoid(x)
        x = self.fc7(x)
        x = self.bn7(x)
        x = self.sigmoid(x)
        x = self.fc8(x)
        x = self.sigmoid(x)
        return x


In [35]:
# Load the dataset and split into training and testing sets
dataset = CustomDataset('/content/drive/MyDrive/Vertices_Labels/Idaho_vertices_labels.csv')
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - (train_size)

# Creating train and test sets - 
# TODO: 1. But make sure that train has the minority classes we need!!
# 2. Can also create validation dataset!!
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

example_weights = [class_weights[e['output']] for e in train_dataset]

print(f"Length of training dataset is {train_size}")
sampler = WeightedRandomSampler(example_weights, train_size, replacement=True)

print(f"First 10 elements in sampler are: {list(sampler)[:10]}")
# Set up the data loaders
train_loader = DataLoader(train_dataset, batch_size=256, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=256)

# num_samples = 32
# mini_train_subset = Subset(train_loader.dataset, range(num_samples))
# train_loader = DataLoader(mini_train_subset, batch_size=num_samples, sampler=sampler)

# mini_test_subset = Subset(test_loader.dataset, range(num_samples))
# test_loader = DataLoader(mini_test_subset, batch_size=num_samples)

# for data in train_loader:
#   print(np.unique(data['output'], return_counts=True))
#   break

Length of training dataset is 28193
First 10 elements in sampler are: [15128, 3896, 18057, 11374, 13890, 23601, 22059, 23890, 19712, 21648]


In [36]:
# Set up the neural network and the optimizer
net = Net()
criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

In [37]:
# Train the neural network
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        data['lat'] = data['lat'].float()
        data['long'] = data['long'].float()
        data['output'] = data['output'].float()
        inputs = torch.stack([data['lat'], data['long']], dim=1)
        labels = torch.tensor(data['output']).unsqueeze(1)
        # print(inputs)
        # print(labels)
        optimizer.zero_grad()
        outputs = net(inputs)
        # print(outputs)
        # break
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print('Epoch [%d], Loss: %.4f' % (epoch+1, running_loss/len(train_loader)))

  labels = torch.tensor(data['output']).unsqueeze(1)


Epoch [1], Loss: 0.6911
Epoch [2], Loss: 0.6898
Epoch [3], Loss: 0.6902
Epoch [4], Loss: 0.6882
Epoch [5], Loss: 0.6875
Epoch [6], Loss: 0.6870
Epoch [7], Loss: 0.6868
Epoch [8], Loss: 0.6872
Epoch [9], Loss: 0.6832
Epoch [10], Loss: 0.6850


In [40]:
# Evaluate the neural network on the test set
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        data['lat'] = data['lat'].float()
        data['long'] = data['long'].float()
        data['output'] = data['output'].float()
        inputs = torch.stack([data['lat'], data['long']], dim=1)
        labels = torch.tensor(data['output']).unsqueeze(1)
        outputs = net(inputs)
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

torch.save(net, 'best-model.pt')

print(f"Total number of test points are: {total} and correct points are {correct}")
print('Accuracy of the network on the test data: %d %%' % (100 * correct / total))

  labels = torch.tensor(data['output']).unsqueeze(1)


Total number of test points are: 7049 and correct points are 2197
Accuracy of the network on the test data: 31 %


In [41]:
idaho_train_model = torch.load('best-model.pt')

In [42]:
dataset = CustomDataset('/content/drive/MyDrive/Vertices_Labels/Maine_vertices_labels.csv')
dataloader = DataLoader(dataset, batch_size=256)

print(len(dataloader))

with open('maine_evaluate.csv', 'a+', newline='') as csvfile:
  writer = csv.writer(csvfile, delimiter=',')
  writer.writerow(['lat', 'long', 'case'])

  with torch.no_grad():
      idaho_train_model.eval()
      for data in dataloader:
          data['lat'] = data['lat'].float()
          data['long'] = data['long'].float()
          inputs = torch.stack([data['lat'], data['long']], dim=1)
          outputs = net(inputs)
          predicted = (outputs > 0.5).float()
          # predicted = predicted.detach().cpu().numpy()[0]
          for i in range(len(data['lat'])):
            # print(data['lat'][i].detach().cpu().numpy())
            # print(data['long'][i])
            # print(predicted[i])
            writer.writerow([data['lat'][i].detach().cpu().numpy(), data['long'][i].detach().cpu().numpy(), 
                             int(predicted[i].detach().cpu().numpy()[0])])
  csvfile.close()

3221


In [1]:
import wget

In [2]:
wget.download('https://github.com/Sidrcs/Geospatial_BigData_Analytics/raw/main/GeographicWeightedRegression/MoransI_GWR_Sid.ipynb')

'MoransI_GWR_Sid.ipynb'