## Label Encoding

In [11]:
import pandas as pd

# Hand-written label encoding: each ord_2 category name -> integer code.
mapping = {
    "Freezing": 0,
    "Warm": 1,
    "Cold": 2,
    "Boiling Hot": 3,
    "Hot": 4,
    "Lava Hot": 5,
}

# Both directories keep their trailing separator because file names are
# appended with plain string concatenation (data_path + "<file>.csv").
data_path =  r'C:\GN\Projects\Datasets\AAAMLP_datasets/'
output_path = r'C:\GN\Projects\Datasets\AAAMLP_outputs/'

df = pd.read_csv(data_path + "cat_train.csv")

# Replace the column outright rather than writing through df.loc[:, "ord_2"]:
# on recent pandas the .loc form sets values in place and keeps the existing
# object dtype, so the numeric codes would not become a numeric column.
# Values with no entry in `mapping` (including NaN) map to NaN, which is why
# the resulting dtype is float64 rather than an integer type.
df["ord_2"] = df["ord_2"].map(mapping)
df["ord_2"].dtypes

dtype('float64')

In [None]:
# Frequency of each distinct value currently stored in ord_2
df["ord_2"].value_counts()

In [12]:
## The same encoding can also be done with scikit-learn's LabelEncoder

import pandas as pd
from sklearn import preprocessing

# Re-read the data so ord_2 holds its original category strings again.
df = pd.read_csv(data_path + "cat_train.csv")

# LabelEncoder from scikit-learn does not handle NaN values, and the ord_2
# column has NaN values in it, so fill them with a placeholder category first.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df["ord_2"] is a chained in-place operation, which recent pandas warns
# about and which leaves df unchanged when Copy-on-Write is enabled.
df["ord_2"] = df["ord_2"].fillna("NONE")

# Initialize the LabelEncoder
lbl_enc = preprocessing.LabelEncoder()

# Fit the encoder and transform the ord_2 values in a single call.
# P.S: do not use this directly. fit first, then transform
df["ord_2"] = lbl_enc.fit_transform(df["ord_2"].values)

df["ord_2"].dtypes

dtype('int32')

In [None]:
# Frequency of each distinct value currently stored in ord_2
df["ord_2"].value_counts()

## Sparse representation

In [None]:
import numpy as np
from scipy import sparse

# Small dense feature matrix used as the running example
example = np.array(
    [
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 1],
    ]
)
# Bytes occupied by the dense array's elements
print(example.nbytes)

In [None]:
# Build the CSR (compressed sparse row) representation of the dense example
sparse_example = sparse.csr_matrix(example)
# Bytes taken by the stored values alone (index arrays not included)
print(sparse_example.data.nbytes)

In [None]:
# Total size of the sparse CSR matrix: stored values plus both index arrays
print(
    sum(
        arr.nbytes
        for arr in (sparse_example.data, sparse_example.indptr, sparse_example.indices)
    )
)

In [None]:
# Dimensions of the large demonstration matrix (rows, then columns)
n_rows = 10000
n_cols = 100000

# Random binary matrix in which each entry is 1 with probability 0.05
example = np.random.binomial(n=1, p=0.05, size=(n_rows, n_cols))
# Memory used by the dense array, in bytes
print(f"Size of dense array: {example.nbytes}")

# Same matrix in CSR (compressed sparse row) form
sparse_example = sparse.csr_matrix(example)
# Memory used by the stored values only
print(f"Size of sparse array: {sparse_example.data.nbytes}")

# Stored values plus the two CSR index arrays
full_size = sum(
    arr.nbytes
    for arr in (sparse_example.data, sparse_example.indptr, sparse_example.indices)
)
print(f"Full size of sparse array: {full_size}")

## One Hot Encoding

In [1]:
import numpy as np
from scipy import sparse
# One-hot style binary matrix: exactly one 1 in each row
example = np.array(
    [
        [0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
    ]
)
# Memory used by the dense array, in bytes
print(f"Size of dense array: {example.nbytes}")

# Same matrix in CSR (compressed sparse row) form
sparse_example = sparse.csr_matrix(example)
# Memory used by the stored values only
print(f"Size of sparse array: {sparse_example.data.nbytes}")

# Stored values plus the two CSR index arrays
full_size = sum(
    arr.nbytes
    for arr in (sparse_example.data, sparse_example.indptr, sparse_example.indices)
)
print(f"Full size of sparse array: {full_size}")

Size of dense array: 72
Size of sparse array: 12
Full size of sparse array: 40


In [25]:
import numpy as np
from sklearn import preprocessing
# create random 1-d array with up to 1000 different categories (ints 0..999);
# np.random.randint(1000, ...) never returns 1000 itself
example = np.random.randint(1000, size=1000000)


def _make_one_hot_encoder(sparse_output):
    """Return a OneHotEncoder that emits sparse or dense output.

    Newer scikit-learn releases name the switch ``sparse_output``; older ones
    only accept ``sparse``. Try the newer keyword first and fall back to the
    older one when the constructor rejects it with ``TypeError``.

    Parameters
    ----------
    sparse_output : bool
        True for a sparse matrix result, False for a dense array.
    """
    try:
        return preprocessing.OneHotEncoder(sparse_output=sparse_output)
    except TypeError:
        # Older scikit-learn: only the original `sparse` keyword exists.
        return preprocessing.OneHotEncoder(sparse=sparse_output)


# initialize OneHotEncoder from scikit-learn
# ask for dense output to get a dense array
# NOTE: the recorded run of this cell reports 8000000000 bytes for the dense
# result, so make sure that much memory is available before running it.
ohe = _make_one_hot_encoder(sparse_output=False)
# fit and transform data with dense one hot encoder
ohe_example_1 = ohe.fit_transform(example.reshape(-1, 1))
# print size in bytes for dense array
print(f"Size of dense array: {ohe_example_1.nbytes}")
# initialize OneHotEncoder from scikit-learn
# ask for sparse output to get a sparse array
ohe = _make_one_hot_encoder(sparse_output=True)
# fit and transform data with sparse one-hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))
# print size of this sparse matrix
print(f"Size of sparse array: {ohe_example.data.nbytes}")
full_size = (ohe_example.data.nbytes + ohe_example.indptr.nbytes + ohe_example.indices.nbytes)
# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")

Size of dense array: 8000000000
Size of sparse array: 8000000
Full size of sparse array: 16000004


In [31]:
## Create new features from categorical variables

# Start again from the raw training data
df = pd.read_csv(data_path + "cat_train.csv")
# Join ord_1 and ord_2 as strings with an underscore between them;
# missing values show up as the text "nan" in the combined value
df["new_feature"] = df["ord_1"].astype(str) + "_" + df["ord_2"].astype(str)
df["new_feature"]

0                 Contributor_Hot
1                Grandmaster_Warm
2                    nan_Freezing
3                 Novice_Lava Hot
4                Grandmaster_Cold
                   ...           
599995            Novice_Freezing
599996         Novice_Boiling Hot
599997       Contributor_Freezing
599998                Master_Warm
599999    Contributor_Boiling Hot
Name: new_feature, Length: 600000, dtype: object

## Steps in Handling Categorical Variables


In [33]:
# Whenever you get categorical variables, follow these simple steps:
# • fill the NaN values (this is very important!)
# • convert them to integers by applying label encoding using LabelEncoder of scikit-learn or by using a mapping dictionary. If you didn’t fill up NaN values with something, you might have to take care of them in this step
# • create one-hot encoding. Yes, you can skip binarization!
# • go for modelling! I mean the machine learning one.
