**Machine learning model development and deployment for the task of COVID-19 contact tracing**
**My contribution in the paper:**

In [None]:
import statistics
from sklearn.model_selection import train_test_split
import os
import random
import math
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torchvision import models
from torch import optim
import time
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Pick the compute device: "cuda" when PyTorch reports an available GPU, else "cpu".
# NOTE(review): none of the cells visible here move the model or the tensors onto
# `device` — confirm whether GPU execution is actually intended.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'You are using {device}')

In [None]:
# IPython shell capture (not valid in plain Python): runs `tar xzvf` on the training
# archive and stores the command's output lines in `train_dir`.
train_dir=!tar xzvf ../input/datanist/tc4tl_training_data_v1.tgz


In [None]:
# Maximum time span of one interval: load_data closes the current interval once a
# reading's timestamp is more than this far past the interval's start time.
# NOTE(review): presumably seconds — confirm against the time column of the data files.
INTERVAL_LENGTH=4
# Every interval is resampled (random removal or padding) to exactly this many readings.
NUM_READINGS_PER_INTERVAL=150

In [None]:

# Training paths are taken from the captured tar listing: the first output line is
# used as the data directory and the last output line as the key file.
# NOTE(review): this relies on the order of the archive's members — confirm.
trainset_path=train_dir[0]
trainkey_path=train_dir[len(train_dir)-1]

# Test split: metadata file and data directory.
test_key_path="../input/validation/validation/docs/tc4tl_test_metadata.tsv"
test_data_path="../input/validation/validation/data/test"
# Development (validation) split: key file and data directory.
val_key_path="../input/validation/validation/docs/tc4tl_dev_key.tsv"
val_data_path="../input/validation/validation/data/dev"

In [None]:
# Read the tab-separated training key into a DataFrame indexed by its "fileid" column.
df_train_key = pd.read_csv(trainkey_path, sep='\t', index_col="fileid")
# Preview the first rows (rendered as the notebook cell output).
df_train_key.head()

In [None]:
# Read the validation key and the test metadata the same way as the training key:
# tab-separated, indexed by the "fileid" column.
df_val_key=pd.read_csv(val_key_path, sep='\t', index_col="fileid")
df_test_key=pd.read_csv(test_key_path, sep='\t', index_col="fileid")

In [None]:
# Split every key table on its "coarse_grain" flag: rows marked 'N' form the
# fine-grained subset, rows marked 'Y' the coarse-grained subset.
df_train_key_fined=df_train_key.loc[df_train_key["coarse_grain"].eq('N')]
df_train_key_coarse=df_train_key.loc[df_train_key["coarse_grain"].eq('Y')]

df_val_key_fined=df_val_key.loc[df_val_key["coarse_grain"].eq('N')]
df_val_key_coarse=df_val_key.loc[df_val_key["coarse_grain"].eq('Y')]

df_test_key_fined=df_test_key.loc[df_test_key["coarse_grain"].eq('N')]
df_test_key_coarse=df_test_key.loc[df_test_key["coarse_grain"].eq('Y')]

In [None]:
# File ids (the "fileid" index values) of each subset, as plain Python lists.
fined_file_list=df_train_key_fined.index.tolist()
coarse_file_list=df_train_key_coarse.index.tolist()

val_fined_file_list=df_val_key_fined.index.tolist()
val_coarse_file_list=df_val_key_coarse.index.tolist()

test_fined_file_list=df_test_key_fined.index.tolist()
test_coarse_file_list=df_test_key_coarse.index.tolist()

In [None]:
def read_metadata(key_path, data_path,isFineOrCoarse,isCF):
    """Collect the category vocabularies used for one-hot encoding and labelling.

    Two sources are scanned:

    * every file in ``data_path`` whose name is listed in ``isFineOrCoarse``:
      the second comma-separated field of each of its first seven lines feeds
      vocabularies 0-6;
    * every row of the tab-separated key file whose fifth column equals
      ``isCF``: the second column is split on "_" into transmitter and
      receiver position (vocabularies 7 and 8) and, for five-column rows, the
      third column is recorded as a class label.

    Args:
        key_path: Path of the tab-separated key file (first line is a header).
        data_path: Directory holding the per-recording data files.
        isFineOrCoarse: Iterable of file names to include from ``data_path``.
        isCF: Value the fifth key column must have for a row to be used.

    Returns:
        A ``(fixed_variables_possible_values, class_labels_possible_values)``
        tuple. The first item is a list of nine **sorted** lists of strings;
        vocabularies 3-8 always contain ``"unknown"``. The second item is the
        list of distinct label values as floats.
    """
    # A set makes the per-file membership test O(1) instead of scanning a list.
    wanted_files = set(isFineOrCoarse)
    fixed_variables_possible_values = [set() for _ in range(9)]
    class_labels_possible_values = set()

    # First pass: header lines of the selected data files.
    for file_id in os.listdir(data_path):
        if file_id not in wanted_files or file_id.startswith("."):
            # not requested, or one of the stray dot-files in the directory
            continue
        with open(os.path.join(data_path, file_id), 'r', errors="ignore") as data_file:
            for index in range(7):
                value = data_file.readline().strip().split(",")[1]
                fixed_variables_possible_values[index].add(value)

    # Second pass: positions and labels from the key file.
    with open(key_path, 'r', errors="ignore") as key_file:
        key_file.readline()  # skip header
        for line in key_file:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            record = line.split("\t")
            if record[4].strip() != isCF:
                continue
            transmitter_position, receiver_position = record[1].split("_")
            if len(record) == 5:
                # has labels
                class_labels_possible_values.add(float(record[2]))
            fixed_variables_possible_values[7].add(transmitter_position)
            fixed_variables_possible_values[8].add(receiver_position)

    for index in range(3, 9):
        fixed_variables_possible_values[index].add("unknown")

    # Sorting pins the one-hot column order. Plain set iteration order for strings
    # changes between interpreter sessions (hash randomisation), which would make
    # features built in one session incompatible with weights saved in another.
    # The label list is deliberately left in set order: the notebook hard-codes a
    # matching `labels_to_distance` list elsewhere.
    return (
        [sorted(values) for values in fixed_variables_possible_values],
        list(class_labels_possible_values),
    )

In [None]:
def calculate_averages(list_2d):
    """Summarise the numeric cells of a 2-D list with several kinds of mean.

    Only cells whose exact type is ``int``, ``float`` or ``complex`` take part;
    everything else (strings, ``bool``, ``None`` ...) is ignored.

    Returns a dict with:
        'cell_average': mean over all numeric cells,
        'per_row_average': one mean per row that has numeric cells (row order),
        'per_column_average': one mean per column that has numeric cells, in
            the order the columns were first seen,
        'row_average': mean of the per-row means,
        'column_average': mean of the per-column means.

    Raises ZeroDivisionError when there is no numeric cell at all.
    """
    numeric_types = (int, float, complex)
    every_cell = []
    cells_by_row = {}
    cells_by_column = {}

    for r, row in enumerate(list_2d):
        for c, value in enumerate(row):
            if type(value) not in numeric_types:
                continue
            every_cell.append(value)
            cells_by_row.setdefault(r, []).append(value)
            cells_by_column.setdefault(c, []).append(value)

    def _mean(values):
        return sum(values) / len(values)

    per_row_avg = [_mean(values) for values in cells_by_row.values()]
    per_col_avg = [_mean(values) for values in cells_by_column.values()]
    return {
        'cell_average': _mean(every_cell),
        'per_row_average': per_row_avg,
        'per_column_average': per_col_avg,
        'row_average': _mean(per_row_avg),
        'column_average': _mean(per_col_avg),
    }

In [None]:
def compute_distance(TX, RSSI, N):
    """Estimate a distance from an RSSI reading with the log-distance path-loss model.

    The model states ``RSSI = TX - 10 * N * log10(d)``, which solves to
    ``d = 10 ** ((TX - RSSI) / (10 * N))``. The previous implementation divided
    by ``10 - N`` instead of ``10 * N``, which inflated every estimate.

    Args:
        TX: Reference signal strength at unit distance (same unit as ``RSSI``).
        RSSI: Measured signal strength.
        N: Path-loss exponent; must be non-zero.

    Returns:
        The estimated distance as a float, in the unit at which ``TX`` was
        calibrated. ``RSSI == TX`` yields exactly 1.0.
    """
    return 10 ** ((TX - RSSI) / (10 * N))

In [None]:
def _fresh_sensor_state():
    """Return the zeroed "latest value per sensor" table used at the start of an interval."""
    return {
        "Bluetooth": (0,),
        "Accelerometer": (0,0,0),
        "Gyroscope": (0,0,0),
        "path_loss":(0,),
        "distance":(0,)
    }


def _resample_interval(interval_data, pad_with_average):
    """Resize ``interval_data`` in place to exactly NUM_READINGS_PER_INTERVAL rows.

    Too many rows: rows are removed at random positions. Too few rows: rows are
    inserted at random positions, either the per-column average row
    (``pad_with_average=True``) or a copy of the row currently at the insertion
    position. ``interval_data`` must not be empty.
    """
    surplus = len(interval_data) - NUM_READINGS_PER_INTERVAL
    if surplus > 0:
        for _ in range(surplus):
            interval_data.pop(math.floor(random.random() * len(interval_data)))
        return

    filler = None
    if surplus < 0 and pad_with_average:
        # Inserting the column mean leaves the column mean unchanged, so it is
        # computed once instead of once per inserted row.
        filler = calculate_averages(interval_data)["per_column_average"]
    for _ in range(-surplus):
        position = math.floor(random.random() * len(interval_data))
        interval_data.insert(position, interval_data[position] if filler is None else filler)


def load_data(key_path, data_path,fixed_variables_possible_values,class_labels_possible_values,isFineOrCoarse,isTest):
    """Turn the recordings listed in a key file into fixed-size interval samples.

    For every key row whose file id is in ``isFineOrCoarse`` the data file is
    read: its first seven lines give the fixed variables (one-hot encoded
    against ``fixed_variables_possible_values`` together with the two positions
    from the key row), the remaining lines are sensor readings. Readings are
    grouped into intervals of at most INTERVAL_LENGTH time units and each
    interval is resampled to NUM_READINGS_PER_INTERVAL rows. Intervals closed in
    the middle of a file are padded with the column-average row, the last
    interval of a file is padded by duplicating existing rows; the two modes are
    kept distinct on purpose so existing results stay comparable. Intervals
    without any usable reading are skipped.

    Args:
        key_path: Tab-separated key/metadata file (first line is a header).
        data_path: Directory containing the data files named in the key file.
        fixed_variables_possible_values: Nine vocabularies for one-hot encoding.
        class_labels_possible_values: Label vocabulary; a label's position in
            this list is its class index.
        isFineOrCoarse: Iterable of file ids to load.
        isTest: True when the coarse-grain flag is in key column 3 (test
            metadata layout) rather than column 4.

    Returns:
        ``(X, y, class_labels_possible_values, interval_to_file)`` where ``X``
        is a list of float tensors (one per interval), ``y`` the matching
        one-hot label tensors (only filled for five-column key rows) and
        ``interval_to_file`` the source file id of every interval.
    """
    X = []  # one list of rows per interval
    y = []  # one-hot label per interval (labelled key files only)
    interval_to_file = []  # file id each interval came from, used when writing predictions
    wanted_files = set(isFineOrCoarse)
    coarse_column = 3 if isTest else 4
    ignored_sensors = {"Pedometer", "Activity","Heading", "Altitude", "Attitude","Gravity","Magnetic-field"}

    with open(key_path, 'r') as key_file:
        key_file.readline()  # skip header
        for line in key_file:
            record = line.split("\t")
            file_id = record[0]
            if file_id not in wanted_files:
                continue

            # Calibration constants handed to compute_distance; coarse-grained
            # recordings use a different pair.
            TX, N = -54, 2.1
            if record[coarse_column].strip() == "Y":
                TX, N = -52, 2.6

            num_intervals = 0
            with open(os.path.join(data_path, file_id)) as data_file:
                # fixed variables tx_device, tx_power, rx_device, tx_carry, rx_carry, rx_pose, tx_pose,
                #                              transmitter_position, receiver_position
                txpower = 7
                fixed_variables = []
                for i in range(7):
                    val = data_file.readline().strip().split(",")[1]
                    if i == 1:
                        if val in ("Unknown", "unknown"):
                            # NOTE(review): the integer 7 never equals a string
                            # vocabulary entry, so an unknown tx_power one-hot
                            # encodes to all zeros — confirm this is intended.
                            val = 7
                        txpower = int(val)
                    fixed_variables.append(val)
                fixed_variables.extend(record[1].split("_"))

                # one-hot encode each fixed variable against its vocabulary
                fixed_part = []
                for variable, choices in zip(fixed_variables, fixed_variables_possible_values):
                    fixed_part.extend(int(choice == variable) for choice in choices)

                interval_start_time = 0
                interval_data = []
                previous_value = _fresh_sensor_state()
                for data_line in data_file:
                    reading = data_line.strip().split(",")
                    curr_time = float(reading[0])
                    if (curr_time - interval_start_time) > INTERVAL_LENGTH:
                        # close the current interval and start a new one
                        if interval_data:
                            _resample_interval(interval_data, pad_with_average=True)
                            X.append(interval_data)
                            num_intervals += 1
                        interval_start_time = curr_time
                        interval_data = []
                        previous_value = _fresh_sensor_state()

                    sensor = reading[1]
                    if sensor in ignored_sensors:
                        continue
                    if sensor == "Bluetooth":
                        rssi = float(reading[2])
                        previous_value[sensor] = (rssi, )
                        previous_value["path_loss"] = (txpower-41-rssi,)
                        previous_value["distance"] = (compute_distance(TX, rssi, N),)
                    else:
                        previous_value[sensor] = (float(reading[2]), float(reading[3]), float(reading[4]))

                    # one row = latest value of every sensor, flattened, plus the one-hot part
                    sensor_part = [v for values in previous_value.values() for v in values]
                    interval_data.append(sensor_part + fixed_part)

                # the last interval is never closed by a later timestamp
                if interval_data:
                    _resample_interval(interval_data, pad_with_average=False)
                    X.append(interval_data)
                    num_intervals += 1

            if len(record) == 5:
                # this means this file has labels
                distance = float(record[2])
                # sized by the label vocabulary, not by an unrelated constant
                label = torch.zeros(len(class_labels_possible_values))
                label[class_labels_possible_values.index(distance)] = 1
                y.extend([label] * num_intervals)

            interval_to_file.extend([file_id] * num_intervals)

    return [torch.Tensor(interval) for interval in X], y, class_labels_possible_values, interval_to_file

In [None]:
# Build the one-hot vocabularies and the label vocabulary from the coarse-grained
# ("Y") part of the training set.
fixed_variables_possible_values,class_labels_possible_values=read_metadata( trainkey_path,trainset_path,coarse_file_list,"Y")

In [None]:
# Show the label vocabulary (rendered as the notebook cell output); a label's
# position in this list is the class index used by load_data.
class_labels_possible_values

In [None]:
# Load the coarse-grained training intervals (isTest=False: coarse flag read from key column 4).
train_X, train_y, train_labels_to_distance, train_intervals_to_file = load_data( trainkey_path,trainset_path,fixed_variables_possible_values,class_labels_possible_values,coarse_file_list,False)

In [None]:

# Load the coarse-grained test intervals with the training vocabularies
# (isTest=True: coarse flag read from key column 3). The returned label list is
# discarded into `_`.
test_X, _, test_labels_to_distance, test_intervals_to_file = load_data(test_key_path, test_data_path,fixed_variables_possible_values,class_labels_possible_values,test_coarse_file_list,True)



In [None]:
# Load the coarse-grained validation (dev) intervals with the training vocabularies.
val_X, val_y, labels_to_distance, dev_interval_to_file = load_data(val_key_path, val_data_path,fixed_variables_possible_values,class_labels_possible_values,val_coarse_file_list,False)


In [None]:

# Batched loaders over (features, one-hot label) pairs; the last incomplete batch
# is dropped for training and validation.
# NOTE(review): the training loader does not shuffle, so batches follow file order —
# consider shuffle=True.
train_data_loader = torch.utils.data.DataLoader(list(zip(train_X, train_y)), batch_size=128, drop_last=True)
val_data_loader = torch.utils.data.DataLoader(list(zip(val_X, val_y)), batch_size=128,drop_last=True)
# The test loader used to zip test_X with `_`, a throwaway name (and IPython's
# last-output variable), so it never paired samples with anything meaningful.
# It now wraps the features alone as 1-tuples, so `batch[0]` works as for the
# other loaders, and keeps the last partial batch so no test sample is dropped.
test_data_loader= torch.utils.data.DataLoader([(features,) for features in test_X], batch_size=128)

In [None]:
def CSELoss(predictions, targets, epsilon=1e-12):
    """Cross-entropy between predicted probabilities and (one-hot) targets.

    ``predictions`` is clamped to ``[epsilon, 1 - epsilon]`` and a further 1e-9
    is added before the logarithm, so zero probabilities give a finite loss.
    The summed loss is divided by the size of the first dimension of
    ``predictions``.
    """
    batch_size = predictions.shape[0]
    safe_probabilities = predictions.clamp(min=epsilon, max=1. - epsilon)
    log_likelihood = (targets * (safe_probabilities + 1e-9).log()).sum()
    return -log_likelihood / batch_size

In [None]:
class CNN_ForecastNet_pooled(nn.Module):
    """1-D CNN classifier: three (conv -> ReLU -> max-pool/2 -> dropout) stages and two linear layers.

    Args:
        input_size: Number of features per reading (input channels).
        len_timestamp: Number of readings per sample (sequence length).
        hidden_size: Channels of the first convolution; the next two use half.
        output_size: Number of classes.
        kernel_size: Convolution kernel width; padding is ``kernel_size // 2``.

    ``forward`` takes a ``(batch, len_timestamp, input_size)`` tensor and returns
    ``(batch, output_size)`` class probabilities.
    """

    def __init__(self,input_size, len_timestamp, hidden_size, output_size, kernel_size):
        super().__init__()

        self.input_size= input_size
        self.len_timestamp= len_timestamp
        self.hidden_size= hidden_size
        self.kernel_size= kernel_size
        self.output_size= output_size
        self.padding = kernel_size // 2

        # The kernel width now follows `kernel_size` (it was fixed to 3 while only
        # the padding followed the argument); identical for kernel_size=3.
        self.conv1 = nn.Conv1d(self.input_size, self.hidden_size, kernel_size=self.kernel_size, padding=self.padding)
        self.conv2 = nn.Conv1d(self.hidden_size, self.hidden_size // 2, kernel_size=self.kernel_size, padding=self.padding)
        self.conv3 = nn.Conv1d(self.hidden_size // 2, self.hidden_size // 2, kernel_size=self.kernel_size, padding=self.padding)

        # Sequence length left after the three conv + pool(2) stages. The former
        # `hidden_size * (len_timestamp // 16)` only matched the real flattened
        # size when `len_timestamp // 8` was even (true for 150: both give 576).
        remaining = self.len_timestamp
        for _ in range(3):
            remaining = (remaining + 2 * self.padding - self.kernel_size + 1) // 2
        self.conv_outdim = remaining
        self.linear_input_size = (self.hidden_size // 2) * self.conv_outdim

        self.fc1 = nn.Linear(self.linear_input_size, 128)
        self.fc2 = nn.Linear(128, self.output_size)
        # explicit class dimension; the implicit-dim form is deprecated
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()

    def forward(self,x):
        # Conv1d expects (batch, channels, length)
        x = x.permute(0,2,1)
        out = nn.functional.max_pool1d(self.relu(self.conv1(x)), 2)
        out = self.dropout(out)
        out = nn.functional.max_pool1d(self.relu(self.conv2(out)), 2)
        out = self.dropout(out)
        out = nn.functional.max_pool1d(self.relu(self.conv3(out)), 2)
        out = self.dropout(out)
        out = out.view(out.size(0), -1)
        out = self.relu(self.fc1(out))
        out = self.fc2(out)
        y_pred = self.softmax(out)

        return y_pred


In [None]:
def eval_model(model, val_data_loader, best_acc, save=True):
    """Evaluate ``model`` on a labelled loader and optionally checkpoint it.

    Uses the module-level ``loss_fn`` for the loss and, when the accuracy beats
    ``best_acc`` and ``save`` is true, writes the model's state dict to the
    module-level ``MODEL_PATH``.

    Args:
        model: Network returning class probabilities for a batch of features.
        val_data_loader: Iterable of ``(features, one_hot_labels)`` batches.
        best_acc: Best accuracy seen so far (percent).
        save: Whether a new best model may be written to disk.

    Returns:
        ``(mean_loss, mean_accuracy, mean_tc_accuracy, best_acc)``; every mean
        is weighted by batch size. The "tc" accuracy is binary: class indices 0
        and 3 count as one group, all other indices as the other.

    Raises:
        statistics.StatisticsError: if the loader yields no batch.
    """
    model.eval()
    with torch.no_grad():
        test_total_loss = []
        test_total_acc = []
        test_total_tc_acc = []
        for batch in val_data_loader:
            features, one_hot_labels = batch[0], batch[1]
            label = torch.max(one_hot_labels, dim=1)[1]
            probabilities = model(features)
            loss = loss_fn(probabilities, one_hot_labels)
            prediction = torch.max(probabilities, 1)[1].view(label.size())
            batch_size = len(label)

            acc = 100.0 * (prediction == label).float().sum() / batch_size

            # collapse to the binary question "is the class index 0 or 3?"
            predicted_group = (prediction == 0) | (prediction == 3)
            label_group = (label == 0) | (label == 3)
            tc_acc = 100.0 * (predicted_group == label_group).float().sum() / batch_size

            # repeat per sample so the means below are sample-weighted
            test_total_loss.extend([loss.item()] * batch_size)
            test_total_acc.extend([acc.item()] * batch_size)
            test_total_tc_acc.extend([tc_acc.item()] * batch_size)

        mean_loss = statistics.mean(test_total_loss)
        curr_acc = statistics.mean(test_total_acc)
        mean_tc_acc = statistics.mean(test_total_tc_acc)
        if best_acc < curr_acc and save:
            torch.save(model.state_dict(), MODEL_PATH)
            best_acc = curr_acc
        return mean_loss, curr_acc, mean_tc_acc, best_acc
import json

def output_predictions(model, X, labels_to_distance, intervals_to_file, output_path):
    """Write one predicted distance per file to a tab-separated file.

    Each interval tensor in ``X`` is classified on its own; the prediction for a
    file is the most frequent prediction among its intervals. Ties go to the
    value predicted first for that file, so the output is reproducible.

    Args:
        model: Network mapping a ``(1, rows, cols)`` tensor to class probabilities.
        X: Sequence of 2-D interval tensors.
        labels_to_distance: Distance value for each class index.
        intervals_to_file: File id for every entry of ``X`` (same order).
        output_path: Destination path; the file starts with a
            "fileid<TAB>distance" header, rows are sorted, and there is no
            trailing newline.
    """
    model.eval()
    file_to_interval_pred = dict()
    with torch.no_grad():
        for tensor, file_id in zip(X, intervals_to_file):
            batch_of_one = tensor.view(1, tensor.shape[0], tensor.shape[1])
            class_index = torch.max(model(batch_of_one), 1)[1].item()
            file_to_interval_pred.setdefault(file_id, []).append(str(labels_to_distance[class_index]))

    # dict.fromkeys keeps first-seen order and max() returns the first maximum,
    # so a tie is resolved deterministically.
    file_to_pred = [
        file_id + "\t" + max(dict.fromkeys(preds), key=preds.count)
        for file_id, preds in file_to_interval_pred.items()
    ]
    file_to_pred.sort()
    with open(output_path, "w") as f:
        f.write("fileid\tdistance\n")
        f.write("\n".join(file_to_pred))
    return

In [None]:

EXP_NAME = "CNN_ForecastNet_pooled_coarse"
MODEL_PATH = "./model_epochs_100_{}.pth".format(EXP_NAME)
# Class index -> distance. NOTE(review): hard-coded; it has to match the order of
# the label vocabulary used when the data was loaded — confirm after any data change.
labels_to_distance = [1.2, 3.0, 4.5,1.8]
# NOTE(review): hard-coded feature width; it has to equal the row width of the loaded tensors.
num_features= 55

model =CNN_ForecastNet_pooled(input_size=num_features,len_timestamp =NUM_READINGS_PER_INTERVAL, hidden_size=64, output_size=len(labels_to_distance), kernel_size=3)
print(model)
best_acc = 0
total_loss = []  # for plotting
total_test_loss = []
loss_fn = CSELoss
# Built once, before the loop: re-creating Adam every epoch threw away its running
# moment estimates, and the old variable name `optim` shadowed the imported module.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5,weight_decay=1e-4)
train_start_time = time.time()
epoch_num=100

for epoch in range(epoch_num):
    total_epoch_loss = 0
    total_epoch_acc = 0
    total_epoch_tc_acc = 0
    steps = 0
    model.train()
    for idx, batch in enumerate(train_data_loader):
        batch_inputs = batch[0]
        label = torch.max(batch[1], dim=1)[1]
        optimizer.zero_grad()
        prediction = model(batch_inputs)
        loss = loss_fn(prediction, batch[1])
        prediction = torch.max(prediction, 1)[1].view(label.size())

        num_corrects = (prediction == label).float().sum()
        acc = 100.0 * num_corrects / len(label)
        # binary "class index is 0 or 3" accuracy, as in eval_model
        prediction = ((prediction == 0) | (prediction == 3))
        label = ((label == 0) | (label == 3))
        num_tc_corrects = (label == prediction).float().sum()
        tc_acc = 100.0 * num_tc_corrects / len(label)
        loss.backward()
        optimizer.step()
        steps += 1
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()
        total_epoch_tc_acc += tc_acc.item()
    # eval (also checkpoints the model whenever validation accuracy improves)
    print("before eval")
    loss, acc, tc_acc, best_acc = eval_model(model, val_data_loader, best_acc)
    print("Testing Loss: {}, AVG: {}, TC4TL or Not AVG: {}".format(loss, acc, tc_acc))
    total_loss.append((epoch, total_epoch_loss/steps))
    total_test_loss.append((epoch, loss))
    print (f'Epoch: {epoch+1}, Training Loss: {total_epoch_loss/steps:.4f}, Training Accuracy: {total_epoch_acc/steps: .2f}% TC4TL or Not ACC: {total_epoch_tc_acc/steps: .2f}%')
print("finished training, took {} seconds".format(time.time() - train_start_time))
# per-epoch training loss
plt.scatter(*zip(*total_loss))
plt.xlabel("Epochs")
plt.ylabel("loss")
plt.show()
# per-epoch validation loss
plt.scatter(*zip(*total_test_loss))
plt.xlabel("Epochs")
plt.ylabel("test loss")
plt.show()

# eval
loss, acc, tc_acc, best_acc = eval_model(model, val_data_loader, best_acc)
print("Testing Loss: {}, AVG: {}, TC4TL or Not AVG: {}".format(loss, acc, tc_acc))
print(best_acc)

In [None]:
# Re-draw the loss curves; this repeats the plotting done at the end of the
# training cell. `total_loss` / `total_test_loss` hold (epoch, loss) pairs, so
# zip(*...) splits them into the x and y sequences for scatter().
plt.scatter(*zip(*total_loss))
plt.xlabel("Epochs")
plt.ylabel("loss")
plt.show()
plt.scatter(*zip(*total_test_loss))
plt.xlabel("Epochs")
plt.ylabel("test loss")
plt.show()

# eval
# Evaluate the in-memory model on the validation loader again; with the default
# save=True a better accuracy than `best_acc` would overwrite the checkpoint.
loss, acc, tc_acc, best_acc = eval_model(model, val_data_loader, best_acc)
print("Testing Loss: {}, AVG: {}, TC4TL or Not AVG: {}".format(loss, acc, tc_acc))
print(best_acc)

In [None]:
# Run the saved checkpoint on the validation (dev) split and write its per-file predictions.
EXP_NAME = "CNN_ForecastNet_pooled_coarse"
MODEL_PATH = "./model_epochs_100_{}.pth".format(EXP_NAME)
# Class index -> distance; must match the mapping used during training.
labels_to_distance = [1.2, 3.0, 4.5,1.8]
num_features = 55
best_acc=0

# NOTE(review): this loader is rebuilt but not used below — output_predictions
# receives val_X directly.
val_data_loader = torch.utils.data.DataLoader(list(zip(val_X, val_y)), batch_size=128, drop_last=True)
# Fresh network with the training architecture, then load the checkpointed weights.
model_test = CNN_ForecastNet_pooled(input_size=num_features,len_timestamp =NUM_READINGS_PER_INTERVAL, hidden_size=64, output_size=len(labels_to_distance), kernel_size=3)
model_test.load_state_dict(torch.load(MODEL_PATH))
#print(sum(p.numel() for p in model_test.parameters()))
output_predictions(model_test, val_X, labels_to_distance, dev_interval_to_file, "./NIST_{}_layers_2_coarse.tsv".format(EXP_NAME))

<a href="./NIST_CNN_ForecastNet_pooled_coarse_layers_2_final_coarse_output.tsv"> Download test-set predictions (TSV) </a>
<a href="./model_epochs_100_CNN_ForecastNet_pooled_coarse.pth"> Download model checkpoint (PTH) </a>
<a href="./NIST_CNN_ForecastNet_pooled_coarse_layers_2_coarse.tsv"> Download validation-set predictions (TSV) </a>


In [None]:
# Run the saved checkpoint on the test split and write the final per-file predictions.
# EXP_NAME is not reassigned in this cell (the line below is commented out), so the
# value set in an earlier cell is reused for both the checkpoint and the output name.
### EXP_NAME = "CNN_ForecastNet_pooled"
MODEL_PATH = "./model_epochs_100_{}.pth".format(EXP_NAME)
# Class index -> distance; must match the mapping used during training.
labels_to_distance = [1.2, 3.0, 4.5,1.8]
num_features = 55
best_acc=0

#val_data_loader = DataLoader(list(zip(val_X, val_y)), batch_size=50, drop_last=True)
# Fresh network with the training architecture, then load the checkpointed weights.
model_test = CNN_ForecastNet_pooled(input_size=num_features,len_timestamp =NUM_READINGS_PER_INTERVAL, hidden_size=64, output_size=len(labels_to_distance), kernel_size=3)
model_test.load_state_dict(torch.load(MODEL_PATH))
#print(len(test_X))
#print(model_test)
#print(sum(p.numel() for p in model_test.parameters()))
output_predictions(model_test, test_X, labels_to_distance, test_intervals_to_file , "./NIST_{}_layers_2_final_coarse_output.tsv".format(EXP_NAME))


