In [2]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn import preprocessing

In [3]:
# Load the training data and turn the `ord_2` column into integer labels.
df = pd.read_csv("../input/cat_train.csv")
# Replace missing values with an explicit "NONE" category before encoding.
df["ord_2"] = df["ord_2"].fillna("NONE")
lbl_enc = preprocessing.LabelEncoder()
# Assign with df[col] = ... rather than df.loc[:, col] = ...: on pandas >= 2.0
# the .loc form writes into the existing object column in place, so the
# encoded values would stay in an `object` column instead of an integer one.
df["ord_2"] = lbl_enc.fit_transform(df["ord_2"].values)

In [4]:
# Compare the memory footprint of a dense 0/1 matrix with its CSR form.
# dimensions of the demo matrix
n_rows = 1000
n_cols = 10000
# draw a random 0/1 matrix; each entry is 1 with probability 0.05
example = np.random.binomial(n=1, p=0.05, size=(n_rows, n_cols))
# bytes used by the dense representation
print(f"Size of dense array: {example.nbytes}")
# the same matrix in compressed sparse row (CSR) form
sparse_example = sparse.csr_matrix(example)
# bytes used by the stored non-zero values only
print(f"Size of sparse array: {sparse_example.data.nbytes}")
# a CSR matrix also keeps a row-pointer array and a column-index array,
# so the real footprint is the sum of all three buffers
full_size = sum(
    part.nbytes
    for part in (
        sparse_example.data,
        sparse_example.indptr,
        sparse_example.indices,
    )
)
print(f"Full size of sparse array: {full_size}")

Size of dense array: 40000000
Size of sparse array: 2003800
Full size of sparse array: 4011604


In [5]:
# Compare dense and sparse one-hot encodings of a high-cardinality column.
# one million random integer labels drawn from [0, 1000), i.e. at most
# 1000 distinct categories
example = np.random.randint(0, 1000, size=1_000_000)
# dense encoder: sparse=False makes fit_transform return a dense array
# NOTE(review): newer scikit-learn releases rename this keyword to
# `sparse_output` - confirm against the installed version.
ohe = preprocessing.OneHotEncoder(sparse=False)
# the encoder is fed a single-column 2-D array
ohe_example = ohe.fit_transform(example[:, np.newaxis])
# bytes used by the dense one-hot matrix
print(f"Size of dense array: {ohe_example.nbytes}")
# sparse encoder: sparse=True makes fit_transform return a sparse matrix
ohe = preprocessing.OneHotEncoder(sparse=True)
ohe_example = ohe.fit_transform(example[:, np.newaxis])
# bytes used by the stored non-zero values only
print(f"Size of sparse array: {ohe_example.data.nbytes}")
# total footprint: stored values plus the row-pointer and column-index arrays
full_size = sum(
    part.nbytes
    for part in (ohe_example.data, ohe_example.indptr, ohe_example.indices)
)
print(f"Full size of sparse array: {full_size}")

Size of dense array: 8000000000
Size of sparse array: 8000000
Full size of sparse array: 16000004


In [6]:
# Reload the raw training data (string categories, nothing encoded).
df = pd.read_csv("../input/cat_train.csv")
# give missing ord_2 values their own explicit "NONE" category
df["ord_2"] = df["ord_2"].fillna("NONE")

In [7]:
# number of non-null ids in each ord_2 category
(
    df
    .groupby("ord_2")["id"]
    .count()
)

ord_2
Boiling Hot     84790
Cold            97822
Freezing       142726
Hot             67508
Lava Hot        64840
NONE            18075
Warm           124239
Name: id, dtype: int64

In [8]:
# number of non-null ids for every (ord_1, ord_2) pair, returned as a flat
# frame whose count column is named "count"
(
    df
    .groupby(["ord_1", "ord_2"])["id"]
    .count()
    .reset_index(name="count")
)

Unnamed: 0,ord_1,ord_2,count
0,Contributor,Boiling Hot,15634
1,Contributor,Cold,17734
2,Contributor,Freezing,26082
3,Contributor,Hot,12428
4,Contributor,Lava Hot,11919
5,Contributor,NONE,3250
6,Contributor,Warm,22774
7,Expert,Boiling Hot,19477
8,Expert,Cold,22956
9,Expert,Freezing,33249


In [9]:
# build a combined categorical feature "<ord_1>_<ord_2>" ...
df["new_feature"] = df["ord_1"].str.cat(df["ord_2"], sep="_")
# ... then remove it again in place, leaving df with its original columns
del df["new_feature"]

In [10]:
# frequency of each ord_2 category, most common first
df["ord_2"].value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NONE            18075
Name: ord_2, dtype: int64

In [17]:
# Label-encode every feature column, fitting each encoder on train and test
# together so both splits share one category -> integer mapping.
train = pd.read_csv("../input/cat_train.csv")
test = pd.read_csv("../input/cat_test.csv")

# Mark every test row with a sentinel target of -1 so the two splits can be
# separated again after concatenation.
# NOTE(review): assumes -1 never occurs as a real target value in train - confirm.
test.loc[:, "target"] = -1
data = pd.concat([train, test], axis=0).reset_index(drop=True)
# every column except the row id and the target is treated as a feature
features = [x for x in train.columns if x not in ["id", "target"]]
for feature in features:
    # a fresh encoder per column
    lbl_enc = preprocessing.LabelEncoder()
    # missing values become the category "NONE"; everything is cast to str so
    # the encoder sees one uniform type
    temp_col = data[feature].fillna("NONE").astype(str).values
    # Assign with data[feature] = ... rather than data.loc[:, feature] = ...:
    # on pandas >= 2.0 the .loc form writes into the existing column in place
    # and keeps its old dtype (object/float) instead of the integer labels.
    data[feature] = lbl_enc.fit_transform(temp_col)
# split back into the original train / test rows using the sentinel
train = data[data["target"] != -1].reset_index(drop=True)
test = data[data["target"] == -1].reset_index(drop=True)