## Mapping

In [None]:
mapping = { 
    "Freezing": 0, 
    "Warm": 1, 
    "Cold": 2, 
    "Boiling Hot": 3, 
    "Hot": 4, 
    "Lava Hot": 5   
}

In [7]:
import pandas as pd 
 
df = pd.read_csv("train.csv") 
df.ord_2.value_counts()


ord_2
Freezing       99816
Lava Hot       63908
Boiling Hot    60627
Cold           33768
Hot            22227
Warm           19654
Name: count, dtype: int64

In [8]:
df.loc[:, "ord_2"] = df.ord_2.map(mapping)

In [9]:
df.ord_2.value_counts()

ord_2
0    99816
5    63908
3    60627
2    33768
4    22227
1    19654
Name: count, dtype: int64

## LabelEncoder

In [10]:
import pandas as pd 
from sklearn import preprocessing 
 
# read the data 
df = pd.read_csv("train.csv") 
 
# fill NaN values in ord_2 column 
df.loc[:, "ord_2"] = df.ord_2.fillna("NONE") 
 
# initialize LabelEncoder 
lbl_enc = preprocessing.LabelEncoder() 
 
# fit label encoder and transform values on ord_2 column 
# P.S: do not use this directly. fit first, then transform 
df.loc[:, "ord_2"] = lbl_enc.fit_transform(df.ord_2.values)

In [12]:
 df.groupby(["ord_2"])["id"].count()

ord_2
0    60627
1    33768
2    99816
3    22227
4    63908
5    19654
Name: id, dtype: int64

## StratifiedKFold

In [14]:
# create_folds.py 
# import pandas and model_selection module of scikit-learn 
import pandas as pd 
from sklearn import model_selection 
 
if __name__ == "__main__": 
 
    # Read training data 
    df = pd.read_csv("train.csv") 
 
    # we create a new column called kfold and fill it with -1 
    df["kfold"] = -1 
     
    # the next step is to randomize the rows of the data 
    df = df.sample(frac=1).reset_index(drop=True) 
     
    # fetch labels 
    y = df.target.values 
     
    # initiate the kfold class from model_selection module 
    kf = model_selection.StratifiedKFold(n_splits=5) 
     
    # fill the new kfold column 
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)): 
        df.loc[v_, 'kfold'] = f 
     
    # save the new csv with kfold column 
    df.to_csv("cat_train_folds.csv", index=False)

In [15]:
import pandas as pd
df = pd.read_csv("cat_train_folds.csv")

df.kfold.value_counts()

kfold
0    60000
1    60000
2    60000
3    60000
4    60000
Name: count, dtype: int64

In [16]:
df[df.kfold==0].target.value_counts()

target
0    41648
1    18352
Name: count, dtype: int64

##  logistic regression. 

In [None]:
import pandas as pd 
 
from sklearn import linear_model 
from sklearn import metrics 
from sklearn import preprocessing 
 
def run(fold): 
    # load the full training data with folds 
    df = pd.read_csv("cat_train_folds.csv")
 
    # all columns are features except id, target and kfold columns 
    features = [ 
        f for f in df.columns if f not in ("id", "target", "kfold") 
    ] 
 
    # fill all NaN values with NONE 
    # note that I am converting all columns to "strings" 
    # it doesn’t matter because all are categories 
    for col in features: 
        df.loc[:, col] = df[col].astype(str).fillna("NONE") 
     
    # get training data using folds 
    df_train = df[df.kfold != fold].reset_index(drop=True) 
 
    # get validation data using folds 
    df_valid = df[df.kfold == fold].reset_index(drop=True) 
 
    # initialize OneHotEncoder from scikit-learn 
    ohe = preprocessing.OneHotEncoder() 
 
    # fit ohe on training + validation features 
    full_data = pd.concat( 
        [df_train[features], df_valid[features]], 
        axis=0 
    ) 
    ohe.fit(full_data[features]) 
 
    # transform training data 
    x_train = ohe.transform(df_train[features]) 
 
    # transform validation data 
    x_valid = ohe.transform(df_valid[features]) 
 
    # initialize Logistic Regression model 
    model = linear_model.LogisticRegression() 
    # fit model on training data (ohe) 
    model.fit(x_train, df_train.target.values) 
 
    # predict on validation data 
    # we need the probability values as we are calculating AUC 
    # we will use the probability of 1s 
    valid_preds = model.predict_proba(x_valid)[:, 1] 
 
    # get roc auc score 
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds) 
 
    # print auc 
    print(f"Fold = {fold}, AUC = {auc}")  
 
 
if __name__ == "__main__": 
    for fold_ in range(5): 
        run(fold_)

## randomforest

In [18]:
# lbl_rf.py 
import pandas as pd 
 
from sklearn import ensemble 
from sklearn import metrics 
from sklearn import preprocessing 
 
 
def run(fold): 
 
    # load the full training data with folds 
    df = pd.read_csv("cat_train_folds.csv")
 
    # all columns are features except id, target and kfold columns 
    features = [ 
        f for f in df.columns if f not in ("id", "target", "kfold") 
    ] 
 
    # fill all NaN values with NONE 
    # note that I am converting all columns to "strings" 
    # it doesnt matter because all are categories 
    for col in features: 
        df.loc[:, col] = df[col].astype(str).fillna("NONE") 
     
    # now its time to label encode the features 
    for col in features: 
         
        # initialize LabelEncoder for each feature column 
        lbl = preprocessing.LabelEncoder() 
         
        # fit label encoder on all data 
        lbl.fit(df[col]) 
 
        # transform all the data 
        df.loc[:, col] = lbl.transform(df[col]) 
 
    # get training data using folds 
    df_train = df[df.kfold != fold].reset_index(drop=True) 
 
    # get validation data using folds 
    df_valid = df[df.kfold == fold].reset_index(drop=True) 
 
    # get training data 
    x_train = df_train[features].values 
 
    # get validation data 
    x_valid = df_valid[features].values 
 
    # initialize random forest model 
    model = ensemble.RandomForestClassifier(n_jobs=-1) 
 
    # fit model on training data (ohe) 
    model.fit(x_train, df_train.target.values) 
 
    # predict on validation data 
    # we need the probability values as we are calculating AUC 
    # we will use the probability of 1s 
    valid_preds = model.predict_proba(x_valid)[:, 1] 
 
    # get roc auc score 
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds) 
 
    # print auc 
    print(f"Fold = {fold}, AUC = {auc}") 
 
 
if __name__ == "__main__": 
    for fold_ in range(5): 
        run(fold_)


Fold = 0, AUC = 0.7352373115553327
Fold = 1, AUC = 0.7369760420886947
Fold = 2, AUC = 0.7333835348958494
Fold = 3, AUC = 0.7350728302544831
Fold = 4, AUC = 0.7353623177343978


## xgboost

In [2]:
import pandas as pd
import xgboost as xgb 
 
from sklearn import metrics 
from sklearn import preprocessing 
 
 
def run(fold): 
    # load the full training data with folds 
    df = pd.read_csv("cat_train_folds.csv")
 
    # all columns are features except id, target and kfold columns 
    features = [ 
        f for f in df.columns if f not in ("id", "target", "kfold") 
    ] 
 
    # fill all NaN values with NONE 
    # note that I am converting all columns to "strings" 
    # it doesnt matter because all are categories 
    for col in features: 
        df.loc[:, col] = df[col].astype(str).fillna("NONE") 
     
    # now it’s time to label encode the features 
    for col in features: 
         
        # initialize LabelEncoder for each feature column 
        lbl = preprocessing.LabelEncoder() 
         
        # fit label encoder on all data 
        lbl.fit(df[col]) 
 
        # transform all the data 
        df.loc[:, col] = lbl.transform(df[col]) 
 
    # get training data using folds 
    df_train = df[df.kfold != fold].reset_index(drop=True) 
 
    # get validation data using folds 
    df_valid = df[df.kfold == fold].reset_index(drop=True) 
 
    # get training data 
    x_train = df_train[features].values 
 
    # get validation data 
    x_valid = df_valid[features].values 
 
    # initialize xgboost model 
    model = xgb.XGBClassifier( 
        n_jobs=-1,  
        max_depth=7, 
        n_estimators=200 
    ) 
 
    # fit model on training data (ohe) 
    model.fit(x_train, df_train.target.values) 
 
    # predict on validation data 
    # we need the probability values as we are calculating AUC 
    # we will use the probability of 1s 
    valid_preds = model.predict_proba(x_valid)[:, 1] 
 
    # get roc auc score 
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds) 
 
    # print auc 
    print(f"Fold = {fold}, AUC = {auc}") 
 
 
if __name__ == "__main__": 
    for fold_ in range(5): 
        run(fold_)

Fold = 0, AUC = 0.7612318825808679
Fold = 1, AUC = 0.7625039973479808
Fold = 2, AUC = 0.7594118209006878
Fold = 3, AUC = 0.7613782461383451
Fold = 4, AUC = 0.760830631918779


##  US adult census data

In [27]:
df = pd.read_csv('adult.data',  header=None)
df.columns=['age','workclass','fnlwgt','education' ,'education.num','marital.status ','occupation' ,
'relationship' ,
'race' ,
'sex' ,
'capital.gain',
'capital.loss',
'hours.per.week',
'native.country ',
'income']
df.to_csv("adult.csv", index=False)


In [5]:
df = pd.read_csv('adult.csv')
df.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
education.num      0
marital.status     0
occupation         0
relationship       0
race               0
sex                0
capital.gain       0
capital.loss       0
hours.per.week     0
native.country     0
income             0
dtype: int64

In [34]:
# create_folds.py 
# import pandas and model_selection module of scikit-learn 
import pandas as pd 
from sklearn import model_selection 
 
if __name__ == "__main__": 
 
    # Read training data 
    df = pd.read_csv('adult.csv')
 
    # we create a new column called kfold and fill it with -1 
    df["kfold"] = -1 
     
    # the next step is to randomize the rows of the data 
    df = df.sample(frac=1).reset_index(drop=True) 
     
    # fetch labels 
    y = df.income.values 
     
    # initiate the kfold class from model_selection module 
    kf = model_selection.StratifiedKFold(n_splits=5) 
     
    # fill the new kfold column 
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)): 
        df.loc[v_, 'kfold'] = f 
     
    # save the new csv with kfold column 
    df.to_csv("adult_folds.csv", index=False)

In [56]:
df = pd.read_csv('adult_folds.csv')

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,kfold
0,29,Private,180758,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
1,34,Self-emp-not-inc,177639,Assoc-acdm,12,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
2,47,State-gov,103406,HS-grad,9,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,<=50K,0
3,47,Self-emp-not-inc,235646,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K,0
4,45,Federal-gov,56904,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,5013,0,45,United-States,<=50K,0


In [57]:
df.kfold.value_counts()

kfold
0    6513
1    6512
2    6512
3    6512
4    6512
Name: count, dtype: int64

In [73]:
df[df.kfold==0]

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,kfold
0,29,Private,180758,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
1,34,Self-emp-not-inc,177639,Assoc-acdm,12,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
2,47,State-gov,103406,HS-grad,9,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,<=50K,0
3,47,Self-emp-not-inc,235646,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K,0
4,45,Federal-gov,56904,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,5013,0,45,United-States,<=50K,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6513,58,Private,49893,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,<=50K,0
6514,32,Private,80058,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,<=50K,0
6515,58,Private,197642,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,39,United-States,<=50K,0
6516,71,?,193863,7th-8th,4,Widowed,?,Other-relative,White,Female,0,0,16,Poland,<=50K,0


##  logistic regression

In [74]:
import pandas as pd 
 
from sklearn import linear_model 
from sklearn import metrics 
from sklearn import preprocessing

def run(fold): 
    # load the full training data with folds 
    df = pd.read_csv('adult_folds.csv')
 
    # list of numerical columns 
    num_cols = [ 
        "fnlwgt", 
        "age", 
        "capital.gain", 
        "capital.loss", 
        "hours.per.week" ,
        "workclass",
        "occupation"

    ] 
 
    # drop numerical columns 
    df = df.drop(num_cols, axis=1) 
 
    # map targets to 0s and 1s 
    target_mapping = { 
        "<=50K": 0, 
        ">50K": 1 
    } 
    df.loc[:, "income"] = df.income.map(target_mapping) 
 
    # all columns are features except income and kfold columns 
    features = [ 
        f for f in df.columns if f not in ("kfold", "income") 
    ] 
 
    # fill all NaN values with NONE 
    # note that I am converting all columns to "strings" 
    # it doesnt matter because all are categories 
    for col in features: 
        df.loc[:, col] = df[col].astype(str).fillna("NONE") 
    # get training data using folds 
    df_train = df[df.kfold != fold].reset_index(drop=True) 
 
    # get validation data using folds 
    df_valid = df[df.kfold == fold].reset_index(drop=True) 
 
    # initialize OneHotEncoder from scikit-learn 
    ohe = preprocessing.OneHotEncoder() 
 
    # fit ohe on training + validation features 
    full_data = pd.concat( 
        [df_train[features], df_valid[features]], 
        axis=0)
    ohe.fit(full_data[features]) 
 
    # transform training data 
    x_train = ohe.transform(df_train[features]) 
 
    # transform validation data 
    x_valid = ohe.transform(df_valid[features]) 
 
    # initialize Logistic Regression model 
    model = linear_model.LogisticRegression() 
 
    # fit model on training data (ohe) 
    model.fit(x_train, df_train.income.values) 
 
    # predict on validation data 
    # we need the probability values as we are calculating AUC 
    # we will use the probability of 1s 
    valid_preds = model.predict_proba(x_valid)[:, 1] 
 
    # get roc auc score 
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds) 
 
    # print auc 
    print(f"Fold = {fold}, AUC = {auc}")

if __name__ == "__main__": 
    for fold_ in range(5): 
        run(fold_)


ValueError: Input contains NaN

0

## xgboost

In [75]:
# target_encoding.py 
import copy 
import pandas as pd 
 
from sklearn import metrics 
from sklearn import preprocessing 
import xgboost as xgb

In [76]:
def mean_target_encoding(data): 
 
    # make a copy of dataframe 
    df = copy.deepcopy(data) 
 
    # list of numerical columns 
    num_cols = [ 
        "fnlwgt", 
        "age", 
        "capital.gain", 
        "capital.loss", 
        "hours.per.week" 
    ] 
 
    # map targets to 0s and 1s 
    target_mapping = { 
        "<=50K": 0, 
        ">50K": 1 
    } 
 
    df.loc[:, "income"] = df.income.map(target_mapping) 
     
    # all columns are features except income and kfold columns 
    features = [ 
        f for f in df.columns if f not in ("kfold", "income") 
        and f not in num_cols 
    ] 
 
    # all columns are features except kfold & income columns 
    features = [ 
        f for f in df.columns if f not in ("kfold", "income") 
    ] 
 
    # fill all NaN values with NONE
    # note that I am converting all columns to "strings" 
    # it doesnt matter because all are categories 
    for col in features: 
        # do not encode the numerical columns 
        if col not in num_cols: 
            df.loc[:, col] = df[col].astype(str).fillna("NONE") 
     
    # now its time to label encode the features 
    for col in features: 
        if col not in num_cols:         
            # initialize LabelEncoder for each feature column 
            lbl = preprocessing.LabelEncoder() 
             
            # fit label encoder on all data 
            lbl.fit(df[col]) 
 
            # transform all the data 
            df.loc[:, col] = lbl.transform(df[col]) 
 
    # a list to store 5 validation dataframes 
    encoded_dfs = [] 
 
    # go over all folds 
    for fold in range(5): 
        # fetch training and validation data 
        df_train = df[df.kfold != fold].reset_index(drop=True) 
        df_valid = df[df.kfold == fold].reset_index(drop=True) 
        # for all feature columns, i.e. categorical columns 
        for column in features: 
            # create dict of category:mean target 
            mapping_dict = dict( 
                df_train.groupby(column)["income"].mean() 
            ) 
            # column_enc is the new column we have with mean encoding 
            df_valid.loc[ 
                :, column + "_enc" 
            ] = df_valid[column].map(mapping_dict) 
        # append to our list of encoded validation dataframes 
        encoded_dfs.append(df_valid) 
    # create full data frame again and return 
    encoded_df = pd.concat(encoded_dfs, axis=0) 
    return encoded_df

def run(df, fold): 
    # note that folds are same as before 
    # get training data using folds 
    df_train = df[df.kfold != fold].reset_index(drop=True) 
 
    # get validation data using folds 
    df_valid = df[df.kfold == fold].reset_index(drop=True) 
 
    # all columns are features except income and kfold columns 
    features = [ 
        f for f in df.columns if f not in ("kfold", "income") 
    ] 
 
    # scale training data 
    x_train = df_train[features].values 
 
    # scale validation data 
    x_valid = df_valid[features].values 
 
    # initialize xgboost model 
    model = xgb.XGBClassifier( 
        n_jobs=-1, 
        max_depth=7 
    ) 
 
    # fit model on training data (ohe) 
    model.fit(x_train, df_train.income.values) 
 
    # predict on validation data 
    # we need the probability values as we are calculating AUC 
    # we will use the probability of 1s 
    valid_preds = model.predict_proba(x_valid)[:, 1] 
 
    # get roc auc score 
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds) 
 
    # print auc 
    print(f"Fold = {fold}, AUC = {auc}") 
 
 
if __name__ == "__main__": 
    # read data 
    df = pd.read_csv("adult_folds.csv") 
     
    # create mean target encoded categories and 
    # munge data 
    df = mean_target_encoding(df) 
    for fold_ in range(5): 
        run(df, fold_)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [    0     1     2 ... 26045 26046 26047], got [nan nan nan ... nan nan nan]