## Section 1 - Imports

In [2]:
# Notebook-wide imports. Everything the notebook depends on is pulled in here
# so a fresh kernel can run top to bottom.
import pandas as pd  # Data manipulation of numerical tables and time series
import numpy as np  # Mathematical operations on arrays
import sklearn
import warnings
# `plt` must alias the pyplot submodule: the top-level `matplotlib` package
# does not expose plot()/subplots()/figure(), so `import matplotlib as plt`
# would make every later `plt.<plotting call>` raise AttributeError.
import matplotlib.pyplot as plt
from IPython.display import display  # Rich (HTML) rendering of objects in the notebook
# train_test_split: split arrays into random train and test subsets.
# GroupKFold: K-fold iterator variant with non-overlapping groups.
# KFold: plain K-fold cross-validation iterator.
from sklearn.model_selection import train_test_split, GroupKFold, KFold
from sklearn.metrics import mean_absolute_error  # Mean absolute error regression loss
import tensorflow as tf
from tensorflow import keras
# normalize: scale input vectors individually to unit norm (vector length).
# PowerTransformer: feature-wise power transform to make data more Gaussian-like.
from sklearn.preprocessing import normalize, PowerTransformer
# Potential add-ons
import lightgbm as lgb  # Distributed gradient boosting framework (XGBoost is a possible alternative)

## Section 2 - Data loading

In [3]:
# Read the three comma-separated input files, in order, into DataFrames:
#   train      <- 'train.csv'             (training rows, includes `pressure`)
#   test       <- 'test.csv'              (rows to predict, no `pressure`)
#   submission <- 'sample_submission.csv' (id plus a placeholder `pressure` column)
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
# Render the three frames one after another. With pandas' default display
# settings, large frames are shown truncated (leading and trailing rows only).
display(train, test, submission)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.000000,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.355850,0,12.234987
...,...,...,...,...,...,...,...,...
6035995,6035996,125749,50,10,2.504603,1.489714,1,3.869032
6035996,6035997,125749,50,10,2.537961,1.488497,1,3.869032
6035997,6035998,125749,50,10,2.571408,1.558978,1,3.798729
6035998,6035999,125749,50,10,2.604744,1.272663,1,4.079938


Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
0,1,0,5,20,0.000000,0.000000,0
1,2,0,5,20,0.031904,7.515046,0
2,3,0,5,20,0.063827,14.651675,0
3,4,0,5,20,0.095751,21.230610,0
4,5,0,5,20,0.127644,26.320956,0
...,...,...,...,...,...,...,...
4023995,4023996,125748,20,10,2.530117,4.971245,1
4023996,4023997,125748,20,10,2.563853,4.975709,1
4023997,4023998,125748,20,10,2.597475,4.979468,1
4023998,4023999,125748,20,10,2.631134,4.982648,1


Unnamed: 0,id,pressure
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
4023995,4023996,0
4023996,4023997,0
4023997,4023998,0
4023998,4023999,0


## Train Data

In [5]:
# Print a structural summary of the training frame: index range, the dtype of
# each column, and the total memory footprint.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6036000 entries, 0 to 6035999
Data columns (total 8 columns):
 #   Column     Dtype  
---  ------     -----  
 0   id         int64  
 1   breath_id  int64  
 2   R          int64  
 3   C          int64  
 4   time_step  float64
 5   u_in       float64
 6   u_out      int64  
 7   pressure   float64
dtypes: float64(3), int64(5)
memory usage: 368.4 MB


In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each
# column of the training frame; the result is the cell's displayed output.
train.describe()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
count,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0
mean,3018000.0,62838.86,27.03618,26.08072,1.307225,7.321615,0.6204493,11.22041
std,1742443.0,36335.26,19.59549,17.15231,0.7659778,13.4347,0.4852752,8.109703
min,1.0,1.0,5.0,10.0,0.0,0.0,0.0,-1.895744
25%,1509001.0,31377.0,5.0,10.0,0.6428995,0.3936623,0.0,6.329607
50%,3018000.0,62765.5,20.0,20.0,1.308123,4.386146,1.0,7.032628
75%,4527000.0,94301.0,50.0,50.0,1.965502,4.983895,1.0,13.64103
max,6036000.0,125749.0,50.0,50.0,2.937238,100.0,1.0,64.82099


In [11]:
# Count the distinct values in every training column; to_frame() turns the
# resulting Series into a one-column DataFrame so it renders as a table.
train.nunique().to_frame()

Unnamed: 0,0
id,6036000
breath_id,75450
R,3
C,3
time_step,3767571
u_in,4020300
u_out,2
pressure,950


In [13]:
# Number of training rows for each distinct value of column `R`, most frequent
# first, rendered as a one-column table.
train['R'].value_counts().to_frame()

Unnamed: 0,R
50,2410080
5,1988800
20,1637120


## Test Data

In [7]:
# Print a structural summary of the test frame: index range, the dtype of each
# column, and the total memory footprint.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024000 entries, 0 to 4023999
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   id         int64  
 1   breath_id  int64  
 2   R          int64  
 3   C          int64  
 4   time_step  float64
 5   u_in       float64
 6   u_out      int64  
dtypes: float64(2), int64(5)
memory usage: 214.9 MB


In [8]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each
# column of the test frame; the result is the cell's displayed output.
test.describe()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
count,4024000.0,4024000.0,4024000.0,4024000.0,4024000.0,4024000.0,4024000.0
mean,2012000.0,62927.96,27.10785,26.07038,1.307083,7.338098,0.6203864
std,1161629.0,36249.24,19.54281,17.17103,0.7658902,13.50955,0.4852908
min,1.0,0.0,5.0,10.0,0.0,0.0,0.0
25%,1006001.0,31530.5,5.0,10.0,0.6428454,0.4096735,0.0
50%,2012000.0,63057.5,20.0,20.0,1.308083,4.377512,1.0
75%,3018000.0,94333.25,50.0,50.0,1.96524,4.983472,1.0
max,4024000.0,125748.0,50.0,50.0,2.935203,100.0,1.0


In [12]:
# Count the distinct values in every test column; to_frame() turns the
# resulting Series into a one-column DataFrame so it renders as a table.
test.nunique().to_frame()

Unnamed: 0,0
id,4024000
breath_id,50300
R,3
C,3
time_step,2855528
u_in,2787822
u_out,2


In [14]:
# Number of test rows for each distinct value of column `C`, most frequent
# first, rendered as a one-column table.
test['C'].value_counts().to_frame()

Unnamed: 0,C
10,1504800
50,1315840
20,1203360


## Submission Data

In [9]:
# Print a structural summary of the submission template: index range, the
# dtype of each column, and the total memory footprint.
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024000 entries, 0 to 4023999
Data columns (total 2 columns):
 #   Column    Dtype
---  ------    -----
 0   id        int64
 1   pressure  int64
dtypes: int64(2)
memory usage: 61.4 MB


In [10]:
# Descriptive statistics for the submission template; the result is the cell's
# displayed output.
submission.describe()

Unnamed: 0,id,pressure
count,4024000.0,4024000.0
mean,2012000.0,0.0
std,1161629.0,0.0
min,1.0,0.0
25%,1006001.0,0.0
50%,2012000.0,0.0
75%,3018000.0,0.0
max,4024000.0,0.0
