## Get raw data

In [13]:
# Create a Kaggle API client and authenticate it. The resulting `api` object
# is reused by the following cells to list competitions and download data.
# NOTE(review): authenticate() presumably reads locally configured Kaggle
# credentials — confirm they are set up before running this notebook.
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

In [17]:
# Search competitions for 'playground' to find the exact competition name
# that is passed to the download call in a later cell.
api.competitions_list(search='playground')

[tabular-playground-series-nov-2022,
 tabular-playground-series-sep-2021,
 tabular-playground-series-aug-2022,
 tabular-playground-series-aug-2021,
 tabular-playground-series-jan-2021,
 tabular-playground-series-jan-2022,
 tabular-playground-series-mar-2021,
 tabular-playground-series-feb-2021,
 tabular-playground-series-sep-2022,
 tabular-playground-series-nov-2021,
 tabular-playground-series-jul-2021,
 tabular-playground-series-feb-2022,
 tabular-playground-series-jul-2022,
 tabular-playground-series-apr-2021,
 tabular-playground-series-dec-2021,
 tabular-playground-series-jun-2021,
 tabular-playground-series-may-2022,
 tabular-playground-series-may-2021,
 tabular-playground-series-oct-2021,
 tabular-playground-series-mar-2022]

In [14]:
# Download the March 2021 playground competition files into data/raw/.
# The following cells expect the result at
# data/raw/tabular-playground-series-mar-2021.zip.
api.competition_download_files('tabular-playground-series-mar-2021', path='data/raw/')

In [15]:
from zipfile import ZipFile

# Unpack every member of the downloaded competition archive into data/raw.
# The archive is opened read-only (ZipFile's default mode) and the context
# manager closes it as soon as extraction has finished.
with ZipFile("data/raw/tabular-playground-series-mar-2021.zip") as archive:
    archive.extractall("data/raw")

In [16]:
import os

# Delete the downloaded archive; its contents were already extracted into
# data/raw by the previous cell, so only the unpacked files are kept.
os.remove("data/raw/tabular-playground-series-mar-2021.zip")

## Data Processing

In [18]:
import pandas as pd

In [19]:
# Load the train and test splits that were extracted into data/raw.
# Both files are read with the same default pd.read_csv settings.
train_raw, test_raw = map(pd.read_csv, ('data/raw/train.csv', 'data/raw/test.csv'))

In [20]:
# Preview the first rows of the raw training data.
train_raw.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [21]:
# Pull the label column out of the training frame; `y` is written to
# data/processed/y_train.csv in the final cell.
y = train_raw['target']
y

0         0
1         0
2         0
3         0
4         1
         ..
299995    0
299996    0
299997    1
299998    0
299999    0
Name: target, Length: 300000, dtype: int64

In [22]:
# List the column names; the 'cat' / 'cont' name prefixes are used in the
# next cell to separate categorical from continuous features.
train_raw.columns

Index(['id', 'cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
       'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15',
       'cat16', 'cat17', 'cat18', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4',
       'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'target'],
      dtype='object')

In [23]:
# Partition the feature names by prefix: 'cont*' columns are the continuous
# features and 'cat*' columns are the categorical ones. 'id' and 'target'
# match neither prefix and are therefore excluded from both lists.
cont_cols = [name for name in train_raw.columns if name.startswith('cont')]
cat_cols = [name for name in train_raw.columns if name.startswith('cat')]

In [24]:
# Show the category frequencies of every categorical feature, with a dashed
# separator line after each one so the individual tables are easy to tell apart.
for col in cat_cols:
    print(train_raw[col].value_counts(), '-'*10, sep='\n')

A    223525
B     76475
Name: cat0, dtype: int64
----------
I    90809
F    43818
K    41870
L    31891
H    17257
N    13231
M    11354
G    11248
A    10547
J    10036
O     8740
B     6847
C     1703
D      414
E      235
Name: cat1, dtype: int64
----------
A    168694
C     38875
D     22720
G     18225
Q     10901
F      9877
J      9102
M      8068
I      5287
L      3997
O      2749
N       340
H       219
B       218
S       197
U       166
R       129
K       126
E       110
Name: cat2, dtype: int64
----------
A    187251
B     79951
C     15957
D      8676
E      3318
F      2489
K       846
G       372
L       292
J       286
H       274
I       177
N       111
Name: cat3, dtype: int64
----------
E    129385
F     76678
G     30754
D     27919
H     23388
J      4307
I      3241
K      1481
M       547
C       506
O       330
B       301
S       285
T       215
L       214
Q       117
P       100
A        92
N        81
R        59
Name: cat4, dtype: int64
----------
BI    2

In [25]:
# One-hot encode the categorical features of the training split.
one_hot_train = pd.get_dummies(train_raw[cat_cols])

# pd.get_dummies only creates indicator columns for categories that occur in
# the frame it is given, so encoding the test split on its own can produce a
# different set (and order) of columns than the training split. Align the
# test encoding to the training layout:
#   * categories seen only in train -> added to test as all-zero columns
#   * categories seen only in test  -> dropped (no training column exists)
# The final astype keeps the added columns in the same dtype as the rest.
one_hot_test = (
    pd.get_dummies(test_raw[cat_cols])
    .reindex(columns=one_hot_train.columns, fill_value=0)
    .astype(one_hot_train.dtypes.to_dict())
)

In [26]:
# Inspect the one-hot encoded training features (one indicator column per
# categorical feature/category pair, named '<feature>_<category>').
one_hot_train

Unnamed: 0,cat0_A,cat0_B,cat1_A,cat1_B,cat1_C,cat1_D,cat1_E,cat1_F,cat1_G,cat1_H,...,cat16_C,cat16_D,cat17_A,cat17_B,cat17_C,cat17_D,cat18_A,cat18_B,cat18_C,cat18_D
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
299996,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,0,0
299997,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
299998,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0


In [27]:
from sklearn.preprocessing import StandardScaler



In [28]:
# Standardize the continuous features. The scaler is fitted on the training
# rows only, and those same fitted statistics are then applied to both the
# training and the test rows.
scaler = StandardScaler().fit(train_raw[cont_cols])

# transform() returns a plain array, so wrap each result back into a
# DataFrame that carries the original continuous column names.
cont_X, cont_X_test = (
    pd.DataFrame(scaler.transform(frame[cont_cols]), columns=cont_cols)
    for frame in (train_raw, test_raw)
)

# Summary statistics of the scaled training features.
cont_X.describe()

Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
count,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0
mean,5.141191e-16,3.838633e-17,-9.912663000000001e-17,-1.895365e-16,-1.096267e-15,3.03733e-16,4.186362e-16,-6.821462e-16,-2.602733e-17,-5.630966e-16,3.030613e-17
std,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002
min,-2.680583,-1.922645,-1.964797,-2.39827,-1.480511,-2.232757,-2.28562,-2.018899,-2.591122,-1.309047,-2.017972
25%,-0.7725975,-0.829485,-0.887054,-0.8400537,-0.9915198,-0.9360491,-0.7775117,-0.7306538,-0.721369,-0.8203066,-0.6879171
50%,-0.121373,-0.145062,-0.2112365,-0.4000789,-0.1098594,0.2176689,0.1412923,-0.3259759,-0.4309818,-0.3188401,-0.3031249
75%,0.6427304,0.9384265,0.8749976,0.7880291,0.9712397,0.9656985,0.5692802,0.689371,0.6961856,0.6030187,0.3624859
max,2.417027,2.421583,2.327051,2.206251,1.555038,1.453989,2.263355,2.624525,3.171274,2.756362,2.473545


In [29]:
# Build the final design matrices by placing the one-hot encoded categorical
# columns and the scaled continuous columns side by side for each split.
X = pd.concat((one_hot_train, cont_X), axis='columns')
X_test = pd.concat((one_hot_test, cont_X_test), axis='columns')
X

Unnamed: 0,cat0_A,cat0_B,cat1_A,cat1_B,cat1_C,cat1_D,cat1_E,cat1_F,cat1_G,cat1_H,...,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
0,1,0,0,0,0,0,0,0,0,0,...,1.695842,1.130121,1.483204,0.778418,0.494997,0.491902,1.425987,1.827327,2.547402,0.775276
1,1,0,0,0,0,0,0,0,0,0,...,-0.775191,-0.606220,0.309882,-0.509358,-0.598895,0.529090,-0.457197,-0.495519,2.354099,-0.071293
2,1,0,0,0,0,0,0,0,0,0,...,-0.804203,-0.806964,0.656018,1.269770,0.209828,-0.644076,-0.554821,-0.423189,-0.908927,0.202676
3,1,0,0,0,0,0,0,0,0,0,...,1.002598,1.463881,1.539786,1.297666,0.484532,-1.260751,1.945391,0.813161,1.495107,2.094532
4,1,0,0,0,0,0,0,0,0,0,...,-1.030621,-0.827615,-0.909025,0.464803,-1.550166,0.428400,-1.093741,-0.764999,-0.577910,-0.881397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,1,0,0,0,0,0,0,0,0,0,...,0.031246,0.678596,0.912555,-0.502395,-1.477420,-1.066849,0.061475,0.174621,0.760553,0.883894
299996,1,0,0,0,0,0,0,0,0,0,...,1.392208,1.419707,0.674505,-0.527345,0.968411,0.281554,1.105255,-0.097734,-0.996080,0.642645
299997,1,0,0,0,0,0,0,0,1,0,...,0.132720,-0.510098,-1.116154,1.440320,1.278403,0.514398,-0.948020,-0.637133,0.249480,-0.275749
299998,0,1,0,0,0,0,0,0,0,1,...,1.496541,1.356354,0.722287,-0.694366,0.965374,0.356113,0.531205,1.074326,-0.776509,-0.847657


In [30]:
import os

# Persist the processed features and labels. Nothing earlier in the notebook
# creates data/processed, and to_csv does not create missing parent
# directories, so make sure the folder exists before writing (a no-op when
# it is already there).
os.makedirs('data/processed', exist_ok=True)

# index=False: the row index is not written to the CSV files.
X.to_csv('data/processed/X_train.csv', index=False)
X_test.to_csv('data/processed/X_test.csv', index=False)
y.to_csv('data/processed/y_train.csv', index=False)