# Prepare

In [None]:

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import rgb2hex
import seaborn as sns
import os
import sys


from scipy.signal import find_peaks
from scipy.signal import savgol_filter


In [None]:
%%capture 

def download_data():
  # Fetch two CSV files from Google Drive into the current working directory by
  # shelling out to wget (IPython `!` syntax, so this only runs inside IPython/Jupyter).
  #
  # Each command is a two-step download:
  #   1. the inner `wget` requests the file id, stores the session cookies in
  #      /tmp/cookies.txt and writes the response to stdout, where `sed` extracts
  #      the value of the `confirm=` token from it;
  #   2. the outer `wget` repeats the request with those cookies and that token
  #      and writes the result to the file named after `-O`.
  # The cookie file is deleted after each download (`rm -rf` only runs if wget succeeded).
  # Note: the inner request is made with --no-check-certificate.
  #
  # First file: id 1u78Z2-... -> corona_tested_individuals_ver_0083.english.csv
  !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies \
    --no-check-certificate 'https://docs.google.com/uc?export=download&id=1u78Z2-WhhyHzoHf6OtBnxWETnjBw7pg8' \
    -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1u78Z2-WhhyHzoHf6OtBnxWETnjBw7pg8" -O corona_tested_individuals_ver_0083.english.csv && rm -rf /tmp/cookies.txt
  # Second file: id 1u8R1viz... -> corona_tested_individuals_ver_006.english.csv
  # NOTE(review): this file is not read by the cells visible here — confirm it is still needed.
  !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies \
    --no-check-certificate 'https://docs.google.com/uc?export=download&id=1u8R1viz0ty6zR2CuVghRPQrv5XwDNOpk' \
    -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1u8R1viz0ty6zR2CuVghRPQrv5XwDNOpk" -O corona_tested_individuals_ver_006.english.csv && rm -rf /tmp/cookies.txt


# Run the downloads now; the cell's %%capture magic hides wget's console output.
download_data()

In [None]:

def encode_tested_individuals(raw):
    """Convert the raw test records into an all-numeric float32 frame.

    The input frame is not modified. In the returned frame:

    * ``test_indication`` is replaced by ``contact_with_confirmed``
      (1.0 when the indication is exactly 'Contact with confirmed', else 0.0);
    * ``test_date`` is replaced by its calendar ``month`` and ``day``;
    * ``corona_result`` is 1.0 for 'positive', 0.0 for every other value;
    * ``age_60_and_above`` is 1.0 for 'Yes', 0.0 for every other value;
    * ``gender`` is 1.0 for 'male', 0.0 for every other value.

    Existing columns keep their position; the three new columns are appended
    in the order contact_with_confirmed, month, day. Every remaining column is
    cast to float32, so any column not listed above must already be numeric.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame with at least the columns test_date, test_indication,
        corona_result, age_60_and_above and gender.

    Returns
    -------
    pandas.DataFrame
        New frame with dtype float32 throughout.
    """
    # Parse once and reuse for both derived columns. The deprecated
    # `infer_datetime_format` flag is not passed: format inference is the
    # default behaviour in current pandas.
    test_dates = pd.to_datetime(raw['test_date'])

    # Vectorised comparisons replace per-row Python loops; the result is the
    # same 0/1 encoding, with anything that is not the named value mapped to 0.
    encoded = raw.assign(
        contact_with_confirmed=raw['test_indication'] == 'Contact with confirmed',
        month=test_dates.dt.month,
        day=test_dates.dt.day,
        corona_result=(raw['corona_result'] == 'positive').astype(np.int32),
        age_60_and_above=(raw['age_60_and_above'] == 'Yes').astype(np.int32),
        gender=(raw['gender'] == 'male').astype(np.int32),
    ).drop(columns=['test_indication', 'test_date'])

    return encoded.astype(np.float32)


df = encode_tested_individuals(pd.read_csv('corona_tested_individuals_ver_0083.english.csv'))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Build the null mask once and reduce it along each axis.
null_mask = df.isnull()
missing_cols = df.columns[null_mask.any()]        # columns containing at least one null
missing_rows = df.index[null_mask.any(axis=1)]    # row labels containing at least one null

# Report the counts when anything is missing, otherwise say so explicitly.
if len(missing_rows) > 0 or len(missing_cols) > 0:
    print('Number of rows that have missing values', len(missing_rows))
    print('Number of columns that have missing values', len(missing_cols))
else:
    print('No missing values')

# Show the distinct values of every column, one column at a time.
for col, values in df.items():
    print(col)
    print(values.unique())

No missing values
cough
[0. 1.]
fever
[0. 1.]
sore_throat
[0. 1.]
shortness_of_breath
[0. 1.]
head_ache
[0. 1.]
corona_result
[0. 1.]
age_60_and_above
[0. 1.]
gender
[1. 0.]
contact_with_confirmed
[0. 1.]
month
[11. 10.  9.  8.  7.  6.  5.  4.  3.]
day
[12. 11. 10.  9.  8.  7.  6.  5.  4.  3.  2.  1. 31. 30. 29. 28. 27. 26.
 25. 24. 23. 22. 21. 20. 19. 18. 17. 16. 15. 14. 13.]


In [None]:
# Target column, and the predictors: every other column, in frame order.
y_features = 'corona_result'
# The filter already leaves the target out, so no further removal is needed
# (calling list.remove() for an item that is not in the list raises ValueError).
x_features = [col for col in df.columns if col != y_features]
# Plain NumPy arrays: x is (n_rows, n_predictors), y is (n_rows,).
x, y = df[x_features].values, df[y_features].values
x.shape, y.shape

((2742596, 10), (2742596,))

# Cross validation

In [None]:
# Stratified k-fold cross-validation of a LightGBM binary classifier on (x, y).
# NOTE(review): `lgb` and `StratifiedKFold` are not imported in the visible
# import cell — confirm they are imported before this cell on a fresh kernel.
n_folds = 5
num_boost_round = 2500        # upper bound on boosting rounds per fold
early_stopping_rounds = 100   # stop once the validation metrics stall this long
params = {
    'task':'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list, not a set, so the metric order is the same on every run.
    'metric': ['auc', 'binary_logloss'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}
skfold = StratifiedKFold(n_splits = n_folds)
scores = []
# Folds are numbered from 1 so the progress line reads "Fold 1 / 5" ... "Fold 5 / 5".
for fold_idx, (train_idc, val_idc) in enumerate(skfold.split(x, y), start = 1) :
  print(" Fold {} / {}".format(fold_idx, n_folds))
  train_ds = lgb.Dataset(x[train_idc, :], label = y[train_idc])
  # Tie the validation set to the training set so both share the same binning.
  val_ds = lgb.Dataset(x[val_idc, :], label = y[val_idc], reference = train_ds)
  res = {}  # per-round metric history of this fold, filled in by lgb.train
  # Early stopping keeps a fold from running all rounds after the validation
  # metrics have stopped improving. The keyword-style API is kept to match the
  # `evals_result` / `verbose_eval` arguments this cell already relies on.
  gbm = lgb.train(params, train_ds, num_boost_round = num_boost_round,
                valid_sets = [val_ds], valid_names = ['valid'],
                early_stopping_rounds = early_stopping_rounds,
                evals_result = res, verbose_eval = 100)
  # Score the fold at the iteration LightGBM selected (the last round when
  # early stopping did not trigger) rather than always at the final round.
  scores.append(gbm.best_score['valid']['auc'])
print("Average validation score: ", np.mean(scores))

 Fold 0 / 5
[100]	valid's auc: 0.813982	valid's binary_logloss: 0.402241
[200]	valid's auc: 0.813589	valid's binary_logloss: 0.647809
[300]	valid's auc: 0.812547	valid's binary_logloss: 0.698081
[400]	valid's auc: 0.813648	valid's binary_logloss: 0.712458
[500]	valid's auc: 0.814263	valid's binary_logloss: 0.723128
[600]	valid's auc: 0.813308	valid's binary_logloss: 0.737626
[700]	valid's auc: 0.813526	valid's binary_logloss: 0.74109
[800]	valid's auc: 0.813587	valid's binary_logloss: 0.741223
[900]	valid's auc: 0.813849	valid's binary_logloss: 0.740939
[1000]	valid's auc: 0.813522	valid's binary_logloss: 0.748149
[1100]	valid's auc: 0.813576	valid's binary_logloss: 0.747807
[1200]	valid's auc: 0.813453	valid's binary_logloss: 0.751368
[1300]	valid's auc: 0.813549	valid's binary_logloss: 0.751304
[1400]	valid's auc: 0.813532	valid's binary_logloss: 0.754826
[1500]	valid's auc: 0.813569	valid's binary_logloss: 0.754823
[1600]	valid's auc: 0.813676	valid's binary_logloss: 0.754691
[1700]

In [None]:
# List which metric histories were recorded under the 'valid' name; `res` is
# reassigned per fold, so this reflects the fold that ran most recently.
res['valid'].keys()

dict_keys(['auc', 'binary_logloss'])