In [48]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
import os
from pathlib import Path

from pandas.core.interchange.from_dataframe import datetime_column_to_ndarray

filename = Path('..') / 'data' / 'patient.csv'

if not filename.exists():
    raise FileNotFoundError(f'File is not found: {filename}')

df = pd.read_csv(filename, sep='\t')

df['address'] = df['address'].str.replace('\n', ', ').str.replace('\r', '')

df['registration'] = (
    df['registration']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.strip()
)

df['registration'] = pd.to_datetime(
    df['registration'],
    errors='coerce',
    format='mixed'  
)
df.head(500)



Unnamed: 0,blood_group,user_id,company,registration,address,job,ssn,username,residence,name,current_location,mail,station_ID
0,B-,1384,Wulf Heinz AG,2024-06-13,"Zänkerweg 6-2, 75317 Pößneck",,079-86-6480,ilias32,,Herr Pirmin Stadelmann B.A.,"(Decimal('-9.5081185'), Decimal('-108.465353'))",bruno18@gmx.de,289
1,O+,1398,Borges Moreira Ltda.,2024-10-11,"Fazenda Brenda Vieira, 51, Piratininga, 99724-...",,84926073196,pedro-miguelvargas,,Sr. Matheus Cirino,"(Decimal('38.5020005'), Decimal('151.185055'))",ana-beatrizpacheco@hotmail.com,326
2,B-,163,高橋建設有限会社,2022-10-19,島根県豊島区上高野24丁目16番6号,,127-45-0018,skobayashi,,中村 真綾,"(Decimal('87.3250985'), Decimal('-83.533367'))",vmaeda@yahoo.com,594
3,B+,112,Yang-Gray,2019-01-19,"93041 Wright Turnpike, Lake Loritown, RI 96307",,765-77-3956,kochmario,,Collin Wright,"(Decimal('-45.2256685'), Decimal('147.973684'))",reaton@yahoo.com,738
4,A+,92,Johnson Ltd,2019-11-10,"0089 William Run, West Adam, TX 90462",,457-20-6978,paul10,,Kristina Murray,"(Decimal('7.750758'), Decimal('71.027557'))",cameron61@hotmail.com,628
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,AB-,390,Ribeiro - EI,2023-04-02,"Distrito de Novais, 80, Custodinha, 14164-737 ...",Técnico em rede,74305216817,martinsrodrigo,,Stella Albuquerque,"(Decimal('36.779429'), Decimal('151.447586'))",castromaria-isis@uol.com.br,72
496,O-,1147,"Atkins, Mullen and Johnson",2020-11-29,"800 Garcia Haven, West Nicholasbury, CT 60777",,717-17-8430,jenniferdean,,Alyssa Morgan,"(Decimal('-21.4125165'), Decimal('-102.501147'))",susanmeyer@yahoo.com,261
497,AB-,938,Barth GmbH & Co. OHG,2022-03-03,"Kargestraße 89/31, 01514 Stadtroda",Medizininformatiker,289-77-4716,lhering,,Fredo Stey,"(Decimal('-60.9537825'), Decimal('-19.480944'))",freudenbergerheinz-josef@web.de,39
498,A-,2081,Ullrich,2025-01-23,,,743-45-6170,bdrubin,,Ing. Mustafa Weimer B.Sc.,"(Decimal('29.936179'), Decimal('9.355469'))",cetinkuhl@hotmail.de,593


In [49]:
df = df.drop(columns=['residence'])

Unnamed: 0,blood_group,user_id,company,registration,address,job,ssn,username,name,current_location,mail,station_ID
0,B-,1384,Wulf Heinz AG,2024-06-13,"Zänkerweg 6-2, 75317 Pößneck",,079-86-6480,ilias32,Herr Pirmin Stadelmann B.A.,"(Decimal('-9.5081185'), Decimal('-108.465353'))",bruno18@gmx.de,289
1,O+,1398,Borges Moreira Ltda.,2024-10-11,"Fazenda Brenda Vieira, 51, Piratininga, 99724-...",,84926073196,pedro-miguelvargas,Sr. Matheus Cirino,"(Decimal('38.5020005'), Decimal('151.185055'))",ana-beatrizpacheco@hotmail.com,326
2,B-,163,高橋建設有限会社,2022-10-19,島根県豊島区上高野24丁目16番6号,,127-45-0018,skobayashi,中村 真綾,"(Decimal('87.3250985'), Decimal('-83.533367'))",vmaeda@yahoo.com,594
3,B+,112,Yang-Gray,2019-01-19,"93041 Wright Turnpike, Lake Loritown, RI 96307",,765-77-3956,kochmario,Collin Wright,"(Decimal('-45.2256685'), Decimal('147.973684'))",reaton@yahoo.com,738
4,A+,92,Johnson Ltd,2019-11-10,"0089 William Run, West Adam, TX 90462",,457-20-6978,paul10,Kristina Murray,"(Decimal('7.750758'), Decimal('71.027557'))",cameron61@hotmail.com,628
...,...,...,...,...,...,...,...,...,...,...,...,...
2149,AB-,985,"Rismondo, Lovato e Camicione Group",2024-02-11,"Piazza Fantozzi, 818 Appartamento 90, 98066, M...","Journalist, broadcasting",BBTGRN07A47H265Z,yguarato,Giancarlo Verdone,"(Decimal('44.8976455'), Decimal('-136.440245'))",trossellini@live.com,547
2150,A+,983,Carter-Taylor,2019-09-18,"870 Daniel Lodge Suite 656, Anthonyfort, GA 25104",,812-33-5813,toddjohnston,Marilyn Walsh,"(Decimal('-88.4267365'), Decimal('88.945793'))",bakerveronica@hotmail.com,179
2151,AB+,690,Flaiano Group,2023-01-13,"Strada Casini, 17 Appartamento 11, 55064, Pasc...",,FLGDLN89S45D685Z,sandro83,Simonetta Foletti-Deledda,"(Decimal('26.9203445'), Decimal('-31.621188'))",sabatiniedoardo@gmail.com,724
2152,AB+,2005,"Thompson, Fritz and Ochoa",2025-08-11,"80943 Jerome Rapids Apt. 096, Taylormouth, MA ...",,254-38-2393,yesenia83,Kurt Crane,"(Decimal('-52.702617'), Decimal('97.370153'))",lpalmer@hotmail.com,348


In [50]:
filename = Path('..') / 'data' / 'observation.csv'

if not filename.exists():
    raise FileNotFoundError(f'File is not found: {filename}')

df = pd.read_csv(filename, sep='\t')

df.head(500)



Unnamed: 0,SpO₂,HR,PI,RR,EtCO₂,FiO₂,PRV,BP,Skin Temperature,Motion/Activity index,...,CO,Blood Flow Index,PPG waveform features,Signal Quality Index,Respiratory effort,O₂ extraction ratio,SNR,oximetry,latitude,longitude
0,96.491498,75.410035,11.966950,14.781380,38.814201,60.447244,130.452955,94.990567,34.465008,8.254118,...,4.000203,80.401412,36.370500,51.040438,27.830965,0.286386,26.305034,1.0,6.84019,79.87116
1,97.657620,90.314511,9.500074,16.118786,42.528828,60.183638,106.651627,107.891610,37.058675,9.514584,...,4.073080,63.098636,26.194633,50.549199,51.169605,0.257752,28.282578,1.0,-29.29750,-51.50361
2,96.528162,79.127147,11.268005,15.921648,41.262696,77.600291,131.520657,109.497981,36.934570,10.435298,...,4.000805,47.467045,47.915005,44.390213,45.069050,0.255257,34.529833,1.0,9.33370,122.86370
3,98.195660,83.015909,14.140633,14.348950,40.283494,59.027420,144.702141,101.915115,34.855279,8.896198,...,4.003649,36.139538,51.788271,55.517932,29.478669,0.255097,36.136759,1.0,33.54428,-84.23381
4,97.573527,83.388999,9.058458,17.513347,39.844401,49.965222,111.977370,98.441554,36.833986,7.009457,...,4.004251,47.409300,86.559604,40.041646,53.128176,0.286358,23.648813,1.0,50.80019,7.20769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,97.185706,81.884912,11.452810,14.923200,43.307395,71.667122,128.184133,103.286854,34.877968,7.707493,...,4.002273,30.782345,70.018453,50.038510,60.552935,0.296721,37.194601,1.0,50.80019,7.20769
496,96.832180,92.850063,11.369622,14.380791,42.186797,71.033281,153.652800,101.755101,36.737478,9.074675,...,4.208342,48.091136,43.905227,68.690248,37.165035,0.284141,34.176709,1.0,36.75965,137.36215
497,98.257659,87.291331,6.730592,15.096904,40.509463,48.998996,159.828204,100.426069,36.135235,9.035881,...,4.021062,54.390328,48.319716,39.251134,34.658063,0.259793,20.269991,1.0,54.90083,38.07083
498,96.751528,75.451300,9.288238,15.389489,38.964244,60.038810,146.414287,95.594919,36.121366,9.755444,...,4.000315,39.634866,78.360121,67.020541,61.941621,0.255948,29.825476,1.0,51.47805,6.86250


In [51]:
filename = Path('..') / 'data' / 'station.csv'

if not filename.exists():
    raise FileNotFoundError(f'File is not found: {filename}')

df = pd.read_csv(filename, sep='\t')


df.head(500)


Unnamed: 0,station,longitude,location,QoS,latitude,revision
0,Kenda,86.51499,Asia/Kolkata,good,23.19590,17 Jun 2018
1,Canton,-83.48216,America/Detroit,good,42.30865,2022/10/10
2,Zaysan,84.87144,Asia/Almaty,good,47.46657,2016/11/06
3,Shushary,30.38167,Europe/Moscow,good,59.80917,2022/09/10
4,Cheraga,2.95924,Africa/Algiers,good,36.76775,07 Jan 2024
...,...,...,...,...,...,...
495,Arba Minch,37.55000,Africa/Addis_Ababa,good,6.03333,2022/06/06
496,Ishim,69.49015,Asia/Yekaterinburg,acceptable,56.11281,"02/27/2016, 00:00:00"
497,Lesnoy,59.80222,Asia/Yekaterinburg,good,58.63667,2022-09-15
498,Karlovy Vary,12.87117,Europe/Prague,good,50.23271,06 Sep 2021


In [52]:
df[['continent', 'city']] = df['location'].str.rsplit('/', n=1, expand=True)
df = df.drop(columns=['location'])
df.head(1100)

Unnamed: 0,station,longitude,QoS,latitude,revision,continent,city
0,Kenda,86.51499,good,23.19590,17 Jun 2018,Asia,Kolkata
1,Canton,-83.48216,good,42.30865,2022/10/10,America,Detroit
2,Zaysan,84.87144,good,47.46657,2016/11/06,Asia,Almaty
3,Shushary,30.38167,good,59.80917,2022/09/10,Europe,Moscow
4,Cheraga,2.95924,good,36.76775,07 Jan 2024,Africa,Algiers
...,...,...,...,...,...,...,...
741,Ferrol,-8.21940,good,43.48961,03 Apr 2016,Europe,Madrid
742,Arba Minch,37.55000,maintenance,6.03333,"03/06/2020, 00:00:00",Africa,Addis_Ababa
743,Gainsborough,-0.76667,good,53.38333,2018/08/12,Europe,London
744,Odessa,-102.36764,good,31.84568,2020-12-09,America,Chicago


In [53]:
df['revision'] = (
    df['revision']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.strip()
)

df['revision'] = pd.to_datetime(
    df['revision'],
    errors='coerce',
    format='mixed'  
)
df.head(1100)


Unnamed: 0,station,longitude,QoS,latitude,revision,continent,city
0,Kenda,86.51499,good,23.19590,2018-06-17,Asia,Kolkata
1,Canton,-83.48216,good,42.30865,2022-10-10,America,Detroit
2,Zaysan,84.87144,good,47.46657,2016-11-06,Asia,Almaty
3,Shushary,30.38167,good,59.80917,2022-09-10,Europe,Moscow
4,Cheraga,2.95924,good,36.76775,2024-01-07,Africa,Algiers
...,...,...,...,...,...,...,...
741,Ferrol,-8.21940,good,43.48961,2016-04-03,Europe,Madrid
742,Arba Minch,37.55000,maintenance,6.03333,2020-03-06,Africa,Addis_Ababa
743,Gainsborough,-0.76667,good,53.38333,2018-08-12,Europe,London
744,Odessa,-102.36764,good,31.84568,2020-12-09,America,Chicago
