In [15]:
import pandas as pd

In [16]:
# Read the training CSV into a DataFrame and show its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="./data/cat_train.csv")
df.shape

(600000, 25)

In [17]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [21]:
# Frequency of each ord_2 value, keeping missing values as their own row.
df["ord_2"].value_counts(dropna=False)

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
NaN     18075
Name: ord_2, dtype: int64

### Label encoding using pandas

In [19]:
# Integer code assigned to each ord_2 label.
# NOTE(review): the codes follow this dict, not temperature order
# ("Warm" is coded below "Cold", "Boiling Hot" below "Hot") — confirm intended.
mapping = {
    "Freezing": 0,
    "Warm": 1,
    "Cold": 2,
    "Boiling Hot": 3,
    "Hot": 4,
    "Lava Hot": 5,
}
# Values that are not keys of `mapping` (including NaN) come back as NaN.
# Assign with df[...] instead of df.loc[:, ...]: pandas 2.x writes through
# .loc in place and keeps the column's existing object dtype, while replacing
# the column lets it take the numeric dtype of the mapped values.
df["ord_2"] = df["ord_2"].map(mapping)

In [20]:
# Frequency of each ord_2 value, keeping missing values as their own row.
df["ord_2"].value_counts(dropna=False)

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
NaN     18075
Name: ord_2, dtype: int64

### Label encoding using sklearn

In [7]:
import pandas as pd
from sklearn import preprocessing

In [8]:
# Reload the training CSV so the sklearn example starts from the raw labels.
df = pd.read_csv(filepath_or_buffer="./data/cat_train.csv")
df.shape

(600000, 25)

In [9]:
# Fill NaN values in ord_2 so every row carries a string label ("NONE" for
# missing) before encoding.
# Assign with df[...] instead of df.loc[:, ...]: pandas 2.x writes through
# .loc in place and keeps the column's existing object dtype, while replacing
# the column lets it take the dtype of the new values.
df["ord_2"] = df["ord_2"].fillna("NONE")
# Label encode: fit on the column's values and replace them with integer codes.
lbl_enc = preprocessing.LabelEncoder()
df["ord_2"] = lbl_enc.fit_transform(df["ord_2"].values)

In [12]:
# Frequency of each encoded ord_2 value.
df["ord_2"].value_counts()

2    142726
6    124239
1     97822
0     84790
3     67508
4     64840
5     18075
Name: ord_2, dtype: int64

### Binary encoding

In [22]:
# Small dense 0/1 matrix used to introduce the sparse format
# (store only what is important).
import numpy as np

rows = [
    [0, 0, 1],
    [1, 0, 0],
    [1, 0, 1],
]
arr = np.array(rows)
arr

array([[0, 0, 1],
       [1, 0, 0],
       [1, 0, 1]])

In [23]:
# Bytes consumed by the dense array's elements.
dense_bytes = arr.nbytes
print(dense_bytes)

36


In [26]:
import numpy as np
from scipy import sparse

# Same 3x3 0/1 matrix as before, converted to CSR form.
rows = [
    [0, 0, 1],
    [1, 0, 0],
    [1, 0, 1],
]
arr = np.array(rows)
sparse_arr = sparse.csr_matrix(arr)
# `.data` holds only the explicitly stored entries.
sparse_arr.data

array([1, 1, 1, 1], dtype=int32)

In [28]:
# Bytes used by each of the three arrays backing the CSR matrix.
# One print per statement: joining the calls with commas builds a tuple of
# their None return values, which the notebook echoes as (None, None, None).
print(sparse_arr.data.nbytes)
print(sparse_arr.indptr.nbytes)
print(sparse_arr.indices.nbytes)

16
16
16


(None, None, None)

#### Sparse arrays need less memory

In [37]:
n_rows = 10_000
n_cols = 10_000
shape = (n_rows, n_cols)

# Random 0/1 matrix: each entry is drawn from a binomial with n=1, p=0.1.
arr = np.random.binomial(n=1, p=0.1, size=shape)
dense_bytes = arr.nbytes
print(f"Size of dense array: {dense_bytes}")

# Only the `.data` array of the CSR matrix is measured here.
sparse_arr = sparse.csr_matrix(arr)
data_bytes = sparse_arr.data.nbytes
ratio = np.round(dense_bytes / data_bytes, 2)
print(f"Size of sparse array:{data_bytes}, less than {ratio} times.")

Size of dense array: 400000000
Size of sparse array:39994184, less than 10.0 times.


In [38]:
# Total footprint of the CSR matrix: values plus both index arrays.
csr_parts = (sparse_arr.data, sparse_arr.indptr, sparse_arr.indices)
sparse_total_size = sum(part.nbytes for part in csr_parts)
ratio = np.round(arr.nbytes / sparse_total_size, 2)
print(f"Total size of sparse array:{sparse_total_size}, less than {ratio} times.")

Total size of sparse array:80028372, less than 5.0 times.


### One hot encoding

In [41]:
import numpy as np
from scipy import sparse

# Three one-hot rows over six columns: exactly one 1 per row.
one_hot_rows = [
    [0, 0, 0, 0, 1, 0],
    [0, 1, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 0],
]
arr = np.array(one_hot_rows)
dense_bytes = arr.nbytes
print(f"Size of dense array: {dense_bytes}")

Size of dense array: 72


In [43]:
# Convert the one-hot matrix to CSR and report the bytes of its `.data` array.
sparse_arr = sparse.csr_matrix(arr)
data_bytes = sparse_arr.data.nbytes
print(f"Size of sparse array: {data_bytes}")

Size of sparse array: 12


### One hot encoding with sklearn

In [47]:
import numpy as np
from sklearn import preprocessing

# One million random integers in [0, 1000), shaped as a single feature column.
arr = np.random.randint(1000, size=1_000_000)
column = arr.reshape(-1, 1)

# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# — confirm against the installed version before upgrading.
# Dense one-hot output.
oh_encoder = preprocessing.OneHotEncoder(sparse=False)
arr_dense = oh_encoder.fit_transform(column)
dense_bytes = arr_dense.nbytes
print(f"Size of dense array: {dense_bytes}")

# Sparse one-hot output; only the `.data` array is measured.
oh_encoder = preprocessing.OneHotEncoder(sparse=True)
arr_sparse = oh_encoder.fit_transform(column)
data_bytes = arr_sparse.data.nbytes
ratio = np.round(dense_bytes / data_bytes, 2)
print(f"Size of sparse array: {data_bytes}, less than {ratio} times")


Size of dense array: 8000000000
Size of sparse array: 8000000, less than 1000.0 times


In [57]:
# Reload the training CSV to get the original string categories back.
df = pd.read_csv(filepath_or_buffer="./data/cat_train.csv")
df.shape

(600000, 25)

In [58]:
# Shape of the subset of rows whose ord_2 equals "Boiling Hot".
is_boiling_hot = df["ord_2"] == "Boiling Hot"
df[is_boiling_hot].shape

(84790, 25)

In [59]:
%time df.groupby(['ord_2'])['id'].count()

Wall time: 36 ms


ord_2
Boiling Hot     84790
Cold            97822
Freezing       142726
Hot             67508
Lava Hot        64840
Warm           124239
Name: id, dtype: int64

In [60]:
%time df.ord_2.value_counts()

Wall time: 47 ms


Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

In [61]:
# For every row, the count of ids within that row's ord_2 group
# (the result is aligned to the original rows rather than collapsed per group).
ids_by_ord_2 = df.groupby(by=["ord_2"])["id"]
ids_by_ord_2.transform("count")

0          67508.0
1         124239.0
2         142726.0
3          64840.0
4          97822.0
            ...   
599995    142726.0
599996     84790.0
599997    142726.0
599998    124239.0
599999     84790.0
Name: id, Length: 600000, dtype: float64

In [66]:
# Count ids for every (ord_1, ord_2) combination, then flatten the grouped
# result into a regular table with a "count" column.
pair_counts = df.groupby(by=["ord_1", "ord_2"])["id"].count()
pair_counts.reset_index(name="count")

Unnamed: 0,ord_1,ord_2,count
0,Contributor,Boiling Hot,15634
1,Contributor,Cold,17734
2,Contributor,Freezing,26082
3,Contributor,Hot,12428
4,Contributor,Lava Hot,11919
5,Contributor,Warm,22774
6,Expert,Boiling Hot,19477
7,Expert,Cold,22956
8,Expert,Freezing,33249
9,Expert,Hot,15792


In [67]:
# Create a new feature: the ord_1 and ord_2 values joined with "_".
# astype(str) turns missing values into the literal text "nan".
ord_1_text = df["ord_1"].astype(str)
ord_2_text = df["ord_2"].astype(str)
df["new_feature"] = ord_1_text + "_" + ord_2_text
df["new_feature"].value_counts()

Novice_Freezing            38233
Novice_Warm                33263
Expert_Freezing            33249
Expert_Warm                28900
Novice_Cold                26271
Contributor_Freezing       26082
Expert_Cold                22956
Grandmaster_Freezing       22818
Contributor_Warm           22774
Novice_Boiling Hot         22718
Grandmaster_Warm           19899
Expert_Boiling Hot         19477
Master_Freezing            18035
Novice_Hot                 17850
Contributor_Cold           17734
Novice_Lava Hot            17373
Expert_Hot                 15792
Master_Warm                15734
Contributor_Boiling Hot    15634
Grandmaster_Cold           15464
Expert_Lava Hot            15078
Grandmaster_Boiling Hot    13623
Contributor_Hot            12428
Master_Cold                12364
Contributor_Lava Hot       11919
Grandmaster_Hot            10805
Master_Boiling Hot         10800
Grandmaster_Lava Hot       10363
Master_Hot                  8594
Master_Lava Hot             8209
Novice_nan

In [68]:
# Inspect ord_4 category frequencies (missing values counted as "NONE")
# to look for rarely occurring categories.
ord_4_filled = df["ord_4"].fillna("NONE")
ord_4_filled.value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
G        3404
V        3107
J        1950
L        1657
Name: ord_4, dtype: int64

In [69]:
# Persist the "NONE" placeholder for missing ord_4 values on the frame.
df["ord_4"] = df["ord_4"].fillna("NONE")

In [78]:
# For every row, look up how often that row's ord_4 category occurs.
ord_4_counts = df["ord_4"].value_counts()
ord_4_counts[df["ord_4"]]

U    32897
X    32347
P    37890
C    32112
C    32112
     ...  
R    33045
N    39978
H    31189
X    32347
O    25610
Name: ord_4, Length: 600000, dtype: int64

In [79]:
# Categories occurring fewer than THRES_HOLD times are relabelled "RARE".
THRES_HOLD = 2000
ord_4_counts = df["ord_4"].value_counts()
# Per-row frequency of the row's own category, as a plain array so the
# boolean mask is positional.
row_counts = ord_4_counts[df["ord_4"]].values
is_rare = row_counts < THRES_HOLD
df.loc[is_rare, "ord_4"] = "RARE"

In [80]:
# Category frequencies of ord_4 after the rare ones were merged into "RARE".
df["ord_4"].value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
RARE     3607
G        3404
V        3107
Name: ord_4, dtype: int64