In [1]:
mapping = {
 "Freezing": 0,
 "Warm": 1,
 "Cold": 2,
 "Boiling Hot": 3,
 "Hot": 4,
 "Lava Hot": 5
}

In [2]:
import pandas as pd
df = pd.read_csv("train.csv")
df.loc[:, "ord_2"] = df.ord_2.map(mapping)

In [3]:
df.ord_2.value_counts()

ord_2
0    99816
5    63908
3    60627
2    33768
4    22227
1    19654
Name: count, dtype: int64

In [4]:
import pandas as pd
from sklearn import preprocessing
# read the data
df = pd.read_csv("train.csv")
# fill NaN values in ord_2 column
df.loc[:, "ord_2"] = df.ord_2.fillna("NONE")
# initialize LabelEncoder
lbl_enc = preprocessing.LabelEncoder()
# fit label encoder and transform values on ord_2 column
# P.S: do not use this directly. fit first, then transform
df.loc[:, "ord_2"] = lbl_enc.fit_transform(df.ord_2.values)

In [5]:
import numpy as np
# create our example feature matrix
example = np.array(
 [
 [0, 0, 1],
 [1, 0, 0],
 [1, 0, 1]
 ]
)
# print size in bytes
print(example.nbytes)

36


In [6]:
import numpy as np
from scipy import sparse
# create our example feature matrix
example = np.array(
 [
 [0, 0, 1],
 [1, 0, 0],
 [1, 0, 1]
 ]
)
# convert numpy array to sparse CSR matrix
sparse_example = sparse.csr_matrix(example)
# print size of this sparse matrix
print(sparse_example.data.nbytes)

16


In [7]:
print(
 sparse_example.data.nbytes +
 sparse_example.indptr.nbytes +
 sparse_example.indices.nbytes
)

48


In [8]:
import numpy as np
from scipy import sparse
# number of rows
n_rows = 10000
# number of columns
n_cols = 100000
# create random binary matrix with only 5% values as 1s
example = np.random.binomial(1, p=0.05, size=(n_rows, n_cols))
# print size in bytes
print(f"Size of dense array: {example.nbytes}")
# convert numpy array to sparse CSR matrix
sparse_example = sparse.csr_matrix(example)
# print size of this sparse matrix
print(f"Size of sparse array: {sparse_example.data.nbytes}")
full_size = (
 sparse_example.data.nbytes +
 sparse_example.indptr.nbytes +
 sparse_example.indices.nbytes
)
# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")

Size of dense array: 4000000000
Size of sparse array: 200005340
Full size of sparse array: 400050684


In [9]:
import numpy as np
from scipy import sparse
# create binary matrix
example = np.array(
 [
 [0, 0, 0, 0, 1, 0],
 [0, 1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0]
 ]
)
# print size in bytes
print(f"Size of dense array: {example.nbytes}")
# convert numpy array to sparse CSR matrix
sparse_example = sparse.csr_matrix(example)
# print size of this sparse matrix
print(f"Size of sparse array: {sparse_example.data.nbytes}")
full_size = (
 sparse_example.data.nbytes +
 sparse_example.indptr.nbytes +
 sparse_example.indices.nbytes
)
# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")

Size of dense array: 72
Size of sparse array: 12
Full size of sparse array: 40


In [11]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Create random 1-d array with 1001 different categories (int)
example = np.random.randint(1000, size=1000000)

# Initialize OneHotEncoder from scikit-learn
# Set sparse_output=False to get a dense array
ohe = OneHotEncoder(sparse_output=False)
# Fit and transform data with dense one-hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))
# Print size in bytes for dense array
print(f"Size of dense array: {ohe_example.nbytes}")

# Initialize OneHotEncoder from scikit-learn
# Set sparse_output=True to get a sparse array
ohe = OneHotEncoder(sparse_output=True)
# Fit and transform data with sparse one-hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))
# Print size of this sparse matrix
print(f"Size of sparse array (data only): {ohe_example.data.nbytes}")

# Calculate and print the full size of the sparse matrix
full_size = (
    ohe_example.data.nbytes +
    ohe_example.indptr.nbytes +
    ohe_example.indices.nbytes
)
print(f"Full size of sparse array: {full_size}")


Size of dense array: 8000000000
Size of sparse array (data only): 8000000
Full size of sparse array: 16000004


In [12]:
df[df.ord_2 == "Boiling Hot"].shape

(0, 25)

In [13]:
 df.groupby(["ord_2"])["id"].count()

ord_2
0    60627
1    33768
2    99816
3    22227
4    63908
5    19654
Name: id, dtype: int64

In [14]:
df.groupby(["ord_2"])["id"].transform("count")


0         33768
1         22227
2         63908
3         60627
4         99816
          ...  
299995    99816
299996    99816
299997    60627
299998    60627
299999    99816
Name: id, Length: 300000, dtype: int64

In [15]:
df.groupby(
[
"ord_1",
"ord_2"
]
)["id"].count().reset_index(name="count")

Unnamed: 0,ord_1,ord_2,count
0,Contributor,0,8692
1,Contributor,1,4842
2,Contributor,2,14284
3,Contributor,3,3122
4,Contributor,4,9074
5,Contributor,5,2857
6,Expert,0,4980
7,Expert,1,2850
8,Expert,2,8432
9,Expert,3,1887


In [16]:
df["new_feature"] = (
df.ord_1.astype(str)
+ "_"
+ df.ord_2.astype(str)
)
df.new_feature

0         Grandmaster_1
1         Grandmaster_3
2              Expert_4
3         Grandmaster_0
4         Grandmaster_2
              ...      
299995    Contributor_2
299996         Novice_2
299997         Novice_0
299998         Master_0
299999    Contributor_2
Name: new_feature, Length: 300000, dtype: object

In [17]:
df["new_feature"] = (
df.ord_1.astype(str)
+ "_"
+ df.ord_2.astype(str)
+ "_"
+ df.ord_3.astype(str)
)
df.new_feature

0         Grandmaster_1_h
1         Grandmaster_3_a
2              Expert_4_h
3         Grandmaster_0_i
4         Grandmaster_2_a
               ...       
299995    Contributor_2_k
299996         Novice_2_h
299997         Novice_0_o
299998         Master_0_h
299999    Contributor_2_i
Name: new_feature, Length: 300000, dtype: object

In [18]:
df.ord_2.value_counts()

ord_2
2    99816
4    63908
0    60627
1    33768
3    22227
5    19654
Name: count, dtype: int64

In [19]:
df.ord_2.fillna("NONE").value_counts()

  df.ord_2.fillna("NONE").value_counts()


ord_2
2    99816
4    63908
0    60627
1    33768
3    22227
5    19654
Name: count, dtype: int64

In [20]:
import pandas as pd
from sklearn import preprocessing
# read training data
train = pd.read_csv("train.csv")
#read test data
test = pd.read_csv("test.csv")
# create a fake target column for test data
# since this column doesn't exist
test.loc[:, "target"] = -1
# concatenate both training and test data
data = pd.concat([train, test]).reset_index(drop=True)
# make a list of features we are interested in
# id and target is something we should not encode
features = [x for x in train.columns if x not in ["id", "target"]]
# loop over the features list
for feat in features:
 # create a new instance of LabelEncoder for each feature
 lbl_enc = preprocessing.LabelEncoder()

 # note the trick here
 # since its categorical data, we fillna with a string
 # and we convert all the data to string type
 # so, no matter its int or float, its converted to string
 # int/float but categorical!!!
 temp_col = data[feat].fillna("NONE").astype(str).values
 # we can use fit_transform here as we do not
 # have any extra test data that we need to
 # transform on separately
 data.loc[:, feat] = lbl_enc.fit_transform(temp_col)

# split the training and test data again
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

In [21]:
df.ord_2.fillna("NONE").value_counts()

  df.ord_2.fillna("NONE").value_counts()


ord_2
2    99816
4    63908
0    60627
1    33768
3    22227
5    19654
Name: count, dtype: int64

In [22]:
df.ord_4.fillna("NONE").value_counts()

ord_4
L    19066
G    18899
S    18875
A    18258
R    16927
Q    15925
K    14698
I    14644
Z    14232
T    14220
V    14143
J    12878
P    12839
U    12775
H    12743
F    11717
E    11303
W     9197
Y     8490
X     6292
B     6169
O     5836
D     3974
C     3575
N     2166
M      159
Name: count, dtype: int64

In [23]:
df.ord_4 = df.ord_4.fillna("NONE")

In [24]:
df.loc[
df["ord_4"].value_counts()[df["ord_4"]].values < 2000,
"ord_4"
] = "RARE"
df.ord_4.value_counts()

ord_4
L       19066
G       18899
S       18875
A       18258
R       16927
Q       15925
K       14698
I       14644
Z       14232
T       14220
V       14143
J       12878
P       12839
U       12775
H       12743
F       11717
E       11303
W        9197
Y        8490
X        6292
B        6169
O        5836
D        3974
C        3575
N        2166
RARE      159
Name: count, dtype: int64

In [27]:
# create_folds.py
# import pandas and model_selection module of scikit-learn
import pandas as pd
from sklearn import model_selection
if __name__ == "__main__":
 # Read training data
 df = pd.read_csv("train.csv")
 # we create a new column called kfold and fill it with -1
 df["kfold"] = -1

 # the next step is to randomize the rows of the data
 df = df.sample(frac=1).reset_index(drop=True)

 # fetch labels
 y = df.target.values

 # initiate the kfold class from model_selection module
 kf = model_selection.StratifiedKFold(n_splits=5)

 # fill the new kfold column
 for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
     df.loc[v_, 'kfold'] = f

 # save the new csv with kfold column
 df.to_csv("train_folds.csv", index=False)

In [29]:
import pandas as pd
df = pd.read_csv("train_folds.csv")
df.kfold.value_counts()

kfold
0    60000
1    60000
2    60000
3    60000
4    60000
Name: count, dtype: int64

In [30]:
 df[df.kfold==0].target.value_counts()

target
0    41648
1    18352
Name: count, dtype: int64

In [31]:
df[df.kfold==1].target.value_counts()

target
0    41647
1    18353
Name: count, dtype: int64

In [32]:
df[df.kfold==2].target.value_counts()

target
0    41647
1    18353
Name: count, dtype: int64

In [33]:
df[df.kfold==3].target.value_counts()

target
0    41647
1    18353
Name: count, dtype: int64

In [34]:
df[df.kfold==4].target.value_counts()

target
0    41647
1    18353
Name: count, dtype: int64

In [39]:
# ohe_logres.py
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
def run(fold):
 # load the full training data with folds
 df = pd.read_csv("train_folds.csv")
 # all columns are features except id, target and kfold columns
 features = [
 f for f in df.columns if f not in ("id", "target", "kfold")
 ]
 # fill all NaN values with NONE
 # note that I am converting all columns to "strings"
 # it doesn’t matter because all are categories
 for col in features:
     df.loc[:, col] = df[col].astype(str).fillna("NONE")

 # get training data using folds
 df_train = df[df.kfold != fold].reset_index(drop=True)
 # get validation data using folds
 df_valid = df[df.kfold == fold].reset_index(drop=True)
 # initialize OneHotEncoder from scikit-learn
 ohe = preprocessing.OneHotEncoder()
 # fit ohe on training + validation features
 full_data = pd.concat(
 [df_train[features], df_valid[features]],
 axis=0
 )
 ohe.fit(full_data[features])
 # transform training data
 x_train = ohe.transform(df_train[features])
 # transform validation data
 x_valid = ohe.transform(df_valid[features])
 # initialize Logistic Regression model
 model = linear_model.LogisticRegression()
 # fit model on training data (ohe)
 model.fit(x_train, df_train.target.values)
 # predict on validation data
 # we need the probability values as we are calculating AUC
 # we will use the probability of 1s
 valid_preds = model.predict_proba(x_valid)[:, 1]
 # get roc auc score
 auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
 # print auc
 print(f"Fold = {fold}, AUC = {auc}")
if __name__ == "__main__":
 # run function for fold = 0
 # we can just replace this number and
 # run this for any fold
 for fold_ in range(5):
    run(fold_)

  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold = 0, AUC = 0.798341867269876


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold = 1, AUC = 0.7971018978725082


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold = 2, AUC = 0.7993535847628739


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold = 3, AUC = 0.7980307176844933


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 4, AUC = 0.7990480208232961


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
import pandas as pd
from sklearn import ensemble
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

def run(fold):
    # Load the full training data with folds
    df = pd.read_csv("train_folds.csv")
    
    # All columns are features except id, target, and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]
    
    # Fill all NaN values with NONE and convert all columns to "strings"
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
    
    # Label encode the features
    for col in features:
        lbl = LabelEncoder()
        # Fit label encoder on all data
        df.loc[:, col] = lbl.fit_transform(df[col])
        # Explicitly cast to int to avoid future warnings
        df.loc[:, col] = df.loc[:, col].astype(int)
    
    # Get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # Get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # Extract feature arrays
    x_train = df_train[features].values
    x_valid = df_valid[features].values
    
    # Initialize random forest model
    model = ensemble.RandomForestClassifier(n_jobs=-1)
    
    # Fit model on training data
    model.fit(x_train, df_train.target.values)
    
    # Predict on validation data, we need the probability values as we are calculating AUC
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # Calculate and print AUC score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 0, AUC = 0.7375227007104589


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 1, AUC = 0.7347527218811426


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 2, AUC = 0.7342536026266099


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 3, AUC = 0.7377363684885005


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 4, AUC = 0.7330398443134086


In [48]:
# ohe_svd_rf.py
import pandas as pd
from scipy import sparse
from sklearn import decomposition
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # Load the full training data with folds
    df = pd.read_csv("train_folds.csv")
    
    # All columns are features except id, target, and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]
    
    # Fill all NaN values with NONE and convert all columns to "strings"
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
    
    # Get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # Get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # Initialize OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()
    
    # Fit OHE on training + validation features
    full_data = pd.concat([df_train[features], df_valid[features]], axis=0)
    ohe.fit(full_data[features])
    
    # Transform training data
    x_train = ohe.transform(df_train[features])
    # Transform validation data
    x_valid = ohe.transform(df_valid[features])
    
    # Initialize Truncated SVD, reducing the data to 120 components
    svd = decomposition.TruncatedSVD(n_components=120)
    
    # Fit SVD on full sparse training data
    full_sparse = sparse.vstack((x_train, x_valid))
    svd.fit(full_sparse)
    
    # Transform sparse training data
    x_train = svd.transform(x_train)
    # Transform sparse validation data
    x_valid = svd.transform(x_valid)
    
    # Initialize random forest model
    model = ensemble.RandomForestClassifier(n_jobs=-1)
    
    # Fit model on training data
    model.fit(x_train, df_train.target.values)
    
    # Predict on validation data, we need the probability values as we are calculating AUC
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # Get ROC AUC score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    # Print AUC
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 0, AUC = 0.7175925996188925


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 1, AUC = 0.7140717295913424


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 2, AUC = 0.7129453713278914


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 3, AUC = 0.7161600228449004


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 4, AUC = 0.7126893588101487


In [52]:
# lbl_xgb.py
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("train_folds.csv")

    # all columns are features except id, target and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]

    # fill all NaN values with NONE
    # note that I am converting all columns to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")

    # now it’s time to label encode the features
    for col in features:
        # initialize LabelEncoder for each feature column
        lbl = preprocessing.LabelEncoder()
        # fit label encoder on all data
        lbl.fit(df[col])
        # transform all the data
        df.loc[:, col] = lbl.transform(df[col])

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # get training data
    x_train = df_train[features].values
    # get validation data
    x_valid = df_valid[features].values

    # initialize xgboost model
    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7,
        n_estimators=200
    )
    
    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)

    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 0, AUC = 0.7635390419511254


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 1, AUC = 0.7627472303362647


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 2, AUC = 0.7604133537233464


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 3, AUC = 0.7623579681977355


  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")
  df.loc[:, col] = df[col].astype(str).fillna("NONE")


Fold = 4, AUC = 0.7588441359643497


In [54]:
# entity_emebddings.py
import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

def create_model(data, catcols):
    """
    This function returns a compiled tf.keras model
    for entity embeddings
    :param data: this is a pandas dataframe
    :param catcols: list of categorical column names
    :return: compiled tf.keras model
    """
    # init list of inputs for embeddings
    inputs = []
    # init list of outputs for embeddings
    outputs = []
    # loop over all categorical columns
    for c in catcols:
        # find the number of unique values in the column
        num_unique_values = int(data[c].nunique())
        # simple dimension of embedding calculator
        # min size is half of the number of unique values
        # max size is 50. max size depends on the number of unique
        # categories too. 50 is quite sufficient most of the times
        # but if you have millions of unique values, you might need
        # a larger dimension
        embed_dim = int(min(np.ceil((num_unique_values) / 2), 50))
        # simple keras input layer with size 1
        inp = layers.Input(shape=(1,))
        # add embedding layer to raw input
        # embedding size is always 1 more than unique values in input
        out = layers.Embedding(
            num_unique_values + 1, embed_dim, name=c
        )(inp)

        # 1-d spatial dropout is the standard for emebedding layers
        # you can use it in NLP tasks too
        out = layers.SpatialDropout1D(0.3)(out)
        # reshape the input to the dimension of embedding
        # this becomes our output layer for current feature
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        # add input to input list
        inputs.append(inp)
        # add output to output list
        outputs.append(out)

    # concatenate all output layers
    x = layers.Concatenate()(outputs)
    # add a batchnorm layer.
    # from here, everything is up to you
    # you can try different architectures
    # this is the architecture I like to use
    # if you have numerical features, you should add
    # them here or in concatenate layer
    x = layers.BatchNormalization()(x)

    # a bunch of dense layers with dropout.
    # start with 1 or two layers only
    x = layers.Dense(300, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)

    x = layers.Dense(300, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)

    # using softmax and treating it as a two class problem
    # you can also use sigmoid, then you need to use only one
    # output class
    y = layers.Dense(2, activation="softmax")(x)
    # create final model
    model = Model(inputs=inputs, outputs=y)
    # compile the model
    # we use adam and binary cross entropy.
    # feel free to use something else and see how model behaves
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

def run(fold):
    # load the full training data with folds
    df = pd.read_csv("train_folds.csv")
    # all columns are features except id, target and kfold columns
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]
    # fill all NaN values with NONE
    # note that I am converting all columns to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
    # encode all features with label encoder individually
    # in a live setting you need to save all label encoders
    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        df.loc[:, feat] = lbl_enc.fit_transform(df[feat].values)
    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    # create tf.keras model
    model = create_model(df, features)
    # our features are lists of lists
    xtrain = [df_train[features].values[:, k] for k in range(len(features))]
    xvalid = [df_valid[features].values[:, k] for k in range(len(features))]
    # fetch target columns
    ytrain = df_train.target.values
    yvalid = df_valid.target.values
    # convert target columns to categories
    # this is just binarization
    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.to_categorical(yvalid)

    # fit the model
    model.fit(xtrain,
              ytrain_cat,
              validation_data=(xvalid, yvalid_cat),
              verbose=1,
              batch_size=1024,
              epochs=3
              )
    # generate validation predictions
    valid_preds = model.predict(xvalid)[:, 1]
    # print roc auc score
    print(metrics.roc_auc_score(yvalid, valid_preds))
    # clear session to free up some GPU memory
    K.clear_session()

if __name__ == "__main__":
    run(0)
    run(1)
    run(2)
    run(3)
    run(4))


ModuleNotFoundError: No module named 'tensorflow'

In [55]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp312-cp312-win_amd64.whl.metadata (3.5 kB)
Collecting tensorflow-intel==2.16.1 (from tensorflow)
  Downloading tensorflow_intel-2.16.1-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading gast-0.5.4-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow-