In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, then print one
# full path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv
/kaggle/input/tabular-playground-series-mar-2022/train.csv
/kaggle/input/tabular-playground-series-mar-2022/test.csv


In [2]:
%%time
import datatable as dt
# Read both competition CSVs with datatable and convert them to pandas frames.
train_csv = "/kaggle/input/tabular-playground-series-mar-2022/train.csv"
test_csv = "/kaggle/input/tabular-playground-series-mar-2022/test.csv"
train, test = (dt.fread(csv_path).to_pandas() for csv_path in (train_csv, test_csv))

# Report (rows, columns) for train first, then test.
for frame in (train, test):
    print(frame.shape)

(848835, 6)
(2340, 5)
CPU times: user 791 ms, sys: 366 ms, total: 1.16 s
Wall time: 1.48 s


In [3]:
# Preview the first rows of the raw training frame (columns and sample values).
train.head()

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01,0,0,EB,70
1,1,1991-04-01,0,0,NB,49
2,2,1991-04-01,0,0,SB,24
3,3,1991-04-01,0,1,EB,18
4,4,1991-04-01,0,1,NB,60


In [4]:
# Summary statistics (count, mean, std, quantiles) for the numeric training columns.
train.describe()

Unnamed: 0,row_id,x,y,congestion
count,848835.0,848835.0,848835.0,848835.0
mean,424417.0,1.138462,1.630769,47.815305
std,245037.70221,0.801478,1.089379,16.799392
min,0.0,0.0,0.0,0.0
25%,212208.5,0.0,1.0,35.0
50%,424417.0,1.0,2.0,47.0
75%,636625.5,2.0,3.0,60.0
max,848834.0,2.0,3.0,100.0


In [5]:
# Same summary for the test frame, to compare its numeric columns against train.
test.describe()

Unnamed: 0,row_id,x,y
count,2340.0,2340.0,2340.0
mean,850004.5,1.138462,1.630769
std,675.644137,0.801649,1.089611
min,848835.0,0.0,0.0
25%,849419.75,0.0,1.0
50%,850004.5,1.0,2.0
75%,850589.25,2.0,3.0
max,851174.0,2.0,3.0


In [6]:
%%time
# Integer code assigned to each label that may appear in the `direction` column.
dir_mapper = {'EB': 0,
              'NE': 1,
              'NB': 2,
              'NW': 3,
              'WB': 4,
              'SW': 5,
              'SB': 6,
              'SE': 7}


def feature_engineering(data):
    """Derive calendar features from `time` and integer-encode `direction`.

    The input frame is not modified: all new columns are written to the frame
    returned by ``drop``, so the function can be called repeatedly on the same
    raw data.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a `time` column accepted by ``pd.to_datetime`` and a
        `direction` column whose values are keys of ``dir_mapper``.

    Returns
    -------
    pandas.DataFrame
        A new frame without `time`, with `direction` replaced by its int8 code
        and with the columns `month`, `weekday`, `hour`, `minute`,
        `is_month_start`, `is_month_end`, `minute_in_a_day`, `is_weekend` and
        `is_afternoon` appended in that order.

    Raises
    ------
    KeyError
        If `time` or `direction` is missing, or if `direction` contains a
        value that is not a key of ``dir_mapper``.
    """
    tt = pd.to_datetime(data['time'])

    # Vectorised dict lookup; labels missing from the mapper come back as NaN,
    # which is turned into an explicit KeyError instead of a cast failure.
    direction_codes = data['direction'].map(dir_mapper)
    unmapped = direction_codes.isna()
    if unmapped.any():
        unknown = data.loc[unmapped, 'direction'].unique().tolist()
        raise KeyError(f"unknown direction value(s): {unknown}")

    # `drop` returns a new frame, so the assignments below never touch `data`.
    features = data.drop(['time'], axis=1)
    features['month'] = tt.dt.month.astype(np.int8)
    features['weekday'] = tt.dt.weekday.astype(np.int8)
    features['hour'] = tt.dt.hour.astype(np.int8)
    features['minute'] = tt.dt.minute.astype(np.int8)
    features['is_month_start'] = tt.dt.is_month_start.astype(np.int8)
    features['is_month_end'] = tt.dt.is_month_end.astype(np.int8)
    # Minutes since midnight (0..1439) needs int16; int8 would overflow.
    features['minute_in_a_day'] = (tt.dt.hour * 60 + tt.dt.minute).astype(np.int16)
    # weekday 5 and 6 are Saturday and Sunday.
    features['is_weekend'] = (tt.dt.dayofweek > 4).astype(np.int8)
    # Strictly after the 12 o'clock hour: 12:00-12:59 is NOT flagged.
    features['is_afternoon'] = (tt.dt.hour > 12).astype(np.int8)
    features['direction'] = direction_codes.astype(np.int8)
    return features
# Replace the raw training frame with its engineered-feature version.
train = feature_engineering(train)
# Rescale the target by 1/100 (the raw column spans 0..100) so it lies in the
# range of the model's sigmoid output. `np.float` was only an alias for the
# builtin float, is deprecated since NumPy 1.20 and removed in 1.24, so the
# explicit 64-bit dtype is used instead.
train['congestion'] = (train['congestion'] / 100).astype(np.float64)

CPU times: user 1.37 s, sys: 40.9 ms, total: 1.41 s
Wall time: 1.41 s


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [7]:
# Inspect the last rows after feature engineering and target rescaling.
train.tail()

Unnamed: 0,row_id,x,y,direction,congestion,month,weekday,hour,minute,is_month_start,is_month_end,minute_in_a_day,is_weekend,is_afternoon
848830,848830,2,3,2,0.54,9,0,11,40,0,1,700,0,0
848831,848831,2,3,1,0.28,9,0,11,40,0,1,700,0,0
848832,848832,2,3,6,0.68,9,0,11,40,0,1,700,0,0
848833,848833,2,3,5,0.17,9,0,11,40,0,1,700,0,0
848834,848834,2,3,4,0.24,9,0,11,40,0,1,700,0,0


In [8]:
%%time
from sklearn.model_selection import train_test_split
# Features are every column except the target and the row identifier.
X = train.drop(columns=['congestion', 'row_id'])
y = train['congestion']

# Only a subsample is used: 14% of the rows for fitting and 6% for validation;
# the remaining 80% of the rows are left out of both sets.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.14,
    test_size=0.06,
    random_state=0,
)

CPU times: user 692 ms, sys: 353 ms, total: 1.05 s
Wall time: 1.22 s


In [9]:
import warnings
import tensorflow as tf
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
warnings.filterwarnings('ignore')

# One hidden layer followed by dropout and batch normalisation; the single
# sigmoid unit keeps the prediction inside (0, 1).
n_features = len(X_train.columns)
model_tf = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(n_features,)),
    tf.keras.layers.Dense(units=128, activation='relu', use_bias=True),  # 64, 128
    tf.keras.layers.Dropout(0.25),  # 0.1, 0.25
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(units=1, activation='sigmoid', use_bias=True),
])
model_tf.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1664      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 128)               512       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 2,305
Trainable params: 2,049
Non-trainable params: 256
_________________________________________________________________


2022-03-13 07:49:54.229507: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [10]:
%%time
# Train with MSE loss. The target is continuous (congestion / 100), so
# classification accuracy carries no information here; mean absolute error is
# tracked instead, expressed in the same 0..1 units as the scaled target.
model_tf.compile(
    loss="mse",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), #0.1, 0.01
    metrics=['mae'])
model_tf.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=20, #3, 20
    validation_data=(X_valid, y_valid))

2022-03-13 07:49:54.486356: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 31.9 s, sys: 4.99 s, total: 36.9 s
Wall time: 41.9 s


<keras.callbacks.History at 0x7f6af5cb8e90>

In [11]:
%%time
# Apply the same feature pipeline to the test rows and drop the identifier.
test = feature_engineering(test)
x_test = test.drop(['row_id'], axis=1)
# The network sees columns by position, so the test features must line up
# exactly with the columns the model was fitted on.
assert list(x_test.columns) == list(X_train.columns), "test/train feature columns differ"
pred_test = model_tf.predict(x_test)
submission = pd.DataFrame(data={"row_id" : test.row_id, "congestion" : pred_test.reshape(-1)})
# Back to the original 0..100 scale. Round to the nearest integer before the
# cast: a bare astype() truncates toward zero, which would push every
# prediction downward. The sigmoid output keeps the value within int8 range.
submission['congestion'] = (submission['congestion']*100).round().astype(np.int8)
submission.to_csv('submission.csv', index=False)
print(submission.shape)
submission.head()

(2340, 2)
CPU times: user 287 ms, sys: 16.7 ms, total: 304 ms
Wall time: 277 ms


Unnamed: 0,row_id,congestion
0,848835,51
1,848836,51
2,848837,51
3,848838,47
4,848839,51
