In [189]:
# Optional one-time environment setup; uncomment the lines below to install the dependencies.
#!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
#!pip3 install matplotlib

In [190]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import matplotlib.pyplot as plt

In [None]:
# Training configuration: set `cause_id` to the cause whose data the model is trained on.
#
# Meaning of each cause_id value:
#   508 : Chronic respiratory diseases
#   509 : Chronic obstructive pulmonary disease
#   510 : Pneumoconiosis
#   511 : Silicosis
#   512 : Asbestosis
#   513 : Coal workers pneumoconiosis
#   514 : Other pneumoconiosis
#   515 : Asthma
#   516 : Interstitial lung disease and pulmonary sarcoidosis
#   520 : Other chronic respiratory diseases
#    -1 : All causes
cause_id = -1

In [191]:
# Seed the default generator so runs are repeatable.
torch.manual_seed(42)

# Use the GPU when one is visible (seeding the CUDA generators as well), otherwise the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
  torch.cuda.manual_seed(42)
  torch.cuda.manual_seed_all(42)
else:
  device = torch.device("cpu")

print("Current device is {}".format(device))

Current device is cpu


In [192]:
class LinearClassifier(nn.Module):
  """A single fully connected layer mapping `input_dim` features to `output_dim` outputs.

  No activation is applied, so the forward pass is a plain affine map.
  """

  def __init__(self, input_dim, output_dim):
    """Create the one `nn.Linear` layer (with bias) that makes up the model."""
    super().__init__()
    self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

  def forward(self, x):
    """Apply the linear layer to `x` and return the result."""
    return self.linear(x)

In [193]:
class AirDataSet(data.Dataset):
  """In-memory dataset pairing standardised feature rows with a `rate` label.

  Features are the six `parameter_*` columns plus `cause_id`, read as float32
  and standardised column-wise (zero mean, unit standard deviation). The label
  is the `rate` column, kept in whatever dtype pandas infers for it.

  Attributes:
    data:  (N, 7) float32 tensor of standardised features.
    label: (N, 1) tensor of rates.
    mean:  per-column mean of the raw features (kept so the same transform
           can be applied to new inputs).
    std:   per-column standard deviation of the raw features, as computed by
           `Tensor.std(dim=0)`; may contain 0 (constant column) or NaN
           (single-row file).
  """

  FEATURE_COLUMNS = (
    'parameter_85101',
    'parameter_88101',
    'parameter_44201',
    'parameter_42602',
    'parameter_42401',
    'parameter_42101',
    'cause_id',
  )
  LABEL_COLUMN = 'rate'

  def __init__(self, file_path=None):
    """Load the CSV and build the feature/label tensors.

    Args:
      file_path: CSV file to read. When omitted, the path is derived from the
        notebook-level `cause_id` variable: `training_data/all_data.csv` for
        -1, otherwise `training_data/cause_id_<cause_id>.csv`.
    """
    super().__init__()
    if file_path is None:
      if cause_id == -1:
        file_path = 'training_data/all_data.csv'
      else:
        file_path = 'training_data/cause_id_{}.csv'.format(cause_id)

    training_df = pd.read_csv(
      file_path,
      dtype={column: 'float32' for column in self.FEATURE_COLUMNS},
    )

    # We want to predict the rate based on the parameters
    features = torch.from_numpy(training_df[list(self.FEATURE_COLUMNS)].to_numpy())
    self.label = torch.from_numpy(training_df[[self.LABEL_COLUMN]].to_numpy())

    # Column statistics are computed once and kept for later reuse.
    self.mean = features.mean(dim=0)
    self.std = features.std(dim=0)
    print("Data mean: {}, Data std: {}".format(self.mean, self.std))

    # A constant column (e.g. `cause_id` in a single-cause file) has std 0 and a
    # single-row file has std NaN; dividing by either would fill the features
    # with NaN. `std > 0` is False for both cases, so such columns are divided
    # by 1 and simply end up centred at 0.
    safe_std = torch.where(self.std > 0, self.std, torch.ones_like(self.std))
    self.data = (features - self.mean) / safe_std

  def __len__(self):
    """Return the number of rows in the dataset."""
    return self.data.shape[0]

  def __getitem__(self, idx):
    """Return the `(features, label)` pair stored at position `idx`."""
    return self.data[idx], self.label[idx]

In [194]:
# Seven input features (six parameter_* columns + cause_id) mapped to one predicted rate.
model = LinearClassifier(input_dim=7, output_dim=1)

# Load the training data and serve it in shuffled mini-batches of 32 rows.
dataset = AirDataSet()
train_data_loader = data.DataLoader(dataset=dataset, batch_size=32, shuffle=True)

# Mean-squared-error objective optimised with plain SGD.
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

Data mean: tensor([1.8104e+01, 8.8269e+00, 4.7087e-02, 1.4174e+01, 5.2491e+00, 5.0115e-01,
        5.1280e+02]), Data std: tensor([8.4843e+00, 3.1032e+00, 8.0861e-03, 1.0613e+01, 1.0969e+01, 4.7684e-01,
        3.4293e+00])


In [195]:
# Move the model onto the selected device; the returned module is shown as the cell output.
model.to(device=device)

LinearClassifier(
  (linear): Linear(in_features=7, out_features=1, bias=True)
)

In [None]:
# Notebook-level history of per-epoch training losses; train_model appends to it.
losses = []

def train_model(model, optimizer, data_loader, loss_module, num_epochs=1000):
  """Train `model` on the batches from `data_loader` and log the loss of every epoch.

  Args:
    model: module to train; batches are moved to the device of its parameters.
    optimizer: optimizer already bound to `model`'s parameters.
    data_loader: iterable yielding `(features, labels)` batches, where labels
      have shape (batch, 1).
    loss_module: callable `loss_module(prediction, target)` returning a scalar
      tensor.
    num_epochs: number of full passes over `data_loader`.

  Returns:
    List with one float per epoch: the sample-weighted mean of the batch
    losses. The same values are also appended to the module-level `losses`.

  Raises:
    ValueError: if `data_loader` yields no samples during an epoch.
  """
  # Follow the model's own device instead of depending on a notebook global.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  epoch_losses = []
  model.train()
  for epoch in range(num_epochs):
    running_loss = 0.0
    num_samples = 0
    for data_point, data_label in data_loader:
      data_point = data_point.to(target_device)
      # Labels arrive as (batch, 1); flatten and cast so they match the
      # squeezed float32 model output.
      data_label = data_label.to(target_device).squeeze(dim=1).float()

      output = model(data_point).squeeze(dim=1)
      loss = loss_module(output, data_label)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # Weight each batch by its size so a smaller final batch does not skew
      # the epoch average (exact for mean-reduced losses such as nn.MSELoss()).
      batch_size = data_label.shape[0]
      running_loss += loss.item() * batch_size
      num_samples += batch_size

    if num_samples == 0:
      raise ValueError("data_loader yielded no samples; cannot train")

    epoch_loss = running_loss / num_samples
    epoch_losses.append(epoch_loss)
    losses.append(epoch_loss)
    print("Epoch {} loss: {}".format(epoch, epoch_loss))

  return epoch_losses

train_model(model, optimizer, train_data_loader, loss_func, num_epochs=500)

# Plot the per-epoch loss history collected in `losses`; the title and axis
# labels let the figure be read on its own.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set(title="Training loss", xlabel="Epoch", ylabel="MSE loss")
plt.show()
