In [111]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder

In [112]:
# Load the raw calcification dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/calcifications.csv')

In [113]:
# Collapse smoker_status to a binary flag: only the exact string "no" is kept,
# every other value becomes "yes".
# NOTE(review): missing values are therefore also recorded as "yes" — confirm this is intended.
df["smoker_status"] = np.where(df["smoker_status"] == "no", "no", "yes")

# Remove spaces from the calcification type labels. The vectorised string
# accessor passes missing values through as NaN, whereas calling
# str.replace on each element raised AttributeError on a float NaN.
df["calcification_type_left"] = df["calcification_type_left"].str.replace(" ", "", regex=False)
df["calcification_type_right"] = df["calcification_type_right"].str.replace(" ", "", regex=False)

In [114]:
# Random seed passed to train_test_split so the train/test partition is reproducible.
SEED = 1303

# Preprocessing

In [115]:
# Column roles used by the preprocessing cells of this section.

# Columns removed from the frame before the train/test split.
TO_DROP = [
    "site",
    "patient_ID",
    "infarct_side",
    "stroke",
    "tabulator",
]

# Numeric columns passed through StandardScaler.
TO_SCALE = [
    "age",
    "stenosis_left",
    "stenosis_right",
]

# Categorical columns converted to integer codes with OrdinalEncoder.
TO_LABEL_ENCODE = [
    "TIA",
    "hypertension",
    "cad",
    "gender",
    "diabetes",
    "hyperlipidemia",
    "smoker_status",
]

# Categorical columns expanded with pd.get_dummies, and the prefix used for
# each of them (same order as TO_OHE).
TO_OHE = [
    "calcification_type_left",
    "calcification_type_right",
]
PREFIXES = [
    "calcification_left",
    "calcification_right",
]

# Name of the label column.
TARGET = "symptoms"

In [116]:
# Discard the columns listed in TO_DROP; they are not used as features or target.
df = df.drop(columns=TO_DROP)

In [117]:
# Separate features from the label. The label is addressed through TARGET
# (instead of a hard-coded attribute) so the constant stays the single
# source of truth for the target column name.
X = df.drop(columns=TARGET)
y = df[TARGET]

# Stratified 80/20 split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

# Later cells assign columns on these frames; taking explicit copies guards
# against pandas treating them as slices of X (SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

## Training set

In [8]:
# Encode the target labels as integers; the encoder is fitted on the training labels only
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)

In [9]:
# Convert the categorical columns in TO_LABEL_ENCODE to integer codes,
# learning the category -> code mapping from the training rows
oe = OrdinalEncoder().fit(X_train[TO_LABEL_ENCODE])
X_train[TO_LABEL_ENCODE] = oe.transform(X_train[TO_LABEL_ENCODE]).astype(np.uint8)

In [66]:
# Expand the categorical columns listed in TO_OHE into indicator columns,
# naming the new columns with the matching entry of PREFIXES
X_train = pd.get_dummies(
    data=X_train,
    prefix=PREFIXES,
    columns=TO_OHE,
)

In [69]:
# Standardise the numeric columns; the scaler's statistics come from the training rows
scaler = StandardScaler().fit(X_train[TO_SCALE])
X_train[TO_SCALE] = scaler.transform(X_train[TO_SCALE])

## Test set

In [70]:
# Apply the encoders fitted on the training set to the held-out rows.
y_test = le.transform(y_test)
X_test[TO_LABEL_ENCODE] = oe.transform(X_test[TO_LABEL_ENCODE]).astype(np.uint8)
X_test = pd.get_dummies(X_test, columns=TO_OHE, prefix=PREFIXES)

# get_dummies only creates indicator columns for the levels present in the
# frame it is given, so a level that occurs in just one of the two splits
# would leave the test matrix with a different set of columns than the
# training matrix. Reindex to the training layout: indicators missing from
# the test split are added as all-zero columns, indicators seen only in the
# test split are dropped, and the column order matches X_train.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Scale with the statistics learned from the training rows.
X_test[TO_SCALE] = scaler.transform(X_test[TO_SCALE])

## Add targets to design matrices

In [76]:
# Store the encoded labels next to the features under the TARGET column name
X_train = X_train.assign(**{TARGET: y_train})
X_test = X_test.assign(**{TARGET: y_test})

In [79]:
# Persist the processed train/test frames (features plus the target column);
# index=False keeps the row index out of the files.
X_train.to_csv('../input/train.csv', index=False)
X_test.to_csv('../input/test.csv', index=False)

# No one-hot encoding

In [118]:
TO_DROP = ["site", "patient_ID", "infarct_side", "stroke", "tabulator"]
TO_SCALE = ["age", "stenosis_left", "stenosis_right"]
TO_LABEL_ENCODE = ["TIA", "hypertension", "cad", "gender", 
                   "diabetes", "hyperlipidemia", "smoker_status",
                  "calcification_type_left", "calcification_type_right"]
PREFIXES = ["calcification_left", "calcification_right"]
TARGET = "symptoms"

In [119]:
# Start this variant from a fresh split. X and y still hold the raw,
# unencoded data, whereas X_train / X_test / y_train / y_test may already
# have been encoded, one-hot expanded, scaled and given a target column by
# the one-hot section above; reusing them makes the encoders below fail on
# the missing calcification_type_* columns when the notebook is run top to
# bottom. Same SEED and stratification, so the partition is identical.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)
# Explicit copies, since the cells below assign columns on these frames.
X_train, X_test = X_train.copy(), X_test.copy()

# Label encoding for the target
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [120]:
# Ordinal encoding for every categorical feature in TO_LABEL_ENCODE
# (in this variant that includes the calcification type columns);
# the category -> code mapping is learned from the training rows
oe = OrdinalEncoder().fit(X_train[TO_LABEL_ENCODE])
X_train[TO_LABEL_ENCODE] = oe.transform(X_train[TO_LABEL_ENCODE]).astype(np.uint8)

In [41]:
# Standardise the numeric columns of both splits using statistics
# estimated from the training rows only
scaler = StandardScaler().fit(X_train[TO_SCALE])
X_train[TO_SCALE] = scaler.transform(X_train[TO_SCALE])
X_test[TO_SCALE] = scaler.transform(X_test[TO_SCALE])

In [121]:
# Reuse the encoders fitted on the training split for the held-out rows:
# features first, then labels (the two steps are independent)
X_test[TO_LABEL_ENCODE] = oe.transform(X_test[TO_LABEL_ENCODE]).astype(np.uint8)
y_test = le.transform(y_test)

In [28]:
# Attach the encoded labels under TARGET and write each split to disk
# (training split first, then test split), omitting the row index
X_train = X_train.assign(**{TARGET: y_train})
X_train.to_csv('../input/train_no_ohe.csv', index=False)

X_test = X_test.assign(**{TARGET: y_test})
X_test.to_csv('../input/test_no_ohe.csv', index=False)

## Experiment with feature engineering

In [122]:
def positive_rim_sign(left, right):
    """Return 1 if either encoded calcification type equals 5.0, else 0.

    Args:
        left: encoded calcification type of the left side.
        right: encoded calcification type of the right side.

    Returns:
        int: 1 when ``left`` or ``right`` compares equal to 5.0, otherwise 0.

    NOTE(review): 5.0 is presumably the ordinal code the encoder assigned to
    the "positive rim sign" category — confirm against ``oe.categories_``.
    """
    if left == 5.0:
        return 1
    if right == 5.0:
        return 1
    return 0
    
# Add the 'prs' flag to both splits by applying positive_rim_sign to each
# (left, right) pair of encoded calcification types. Walking the two columns
# in lockstep avoids materialising a Series for every row, as
# DataFrame.apply(axis=1) does, and simply yields an empty column when a
# frame has no rows.
X_train['prs'] = [
    positive_rim_sign(left, right)
    for left, right in zip(X_train['calcification_type_left'],
                           X_train['calcification_type_right'])
]
X_test['prs'] = [
    positive_rim_sign(left, right)
    for left, right in zip(X_test['calcification_type_left'],
                           X_test['calcification_type_right'])
]

In [124]:
def calc(left, right):
    """Return 0 only when both encoded calcification types equal 0.0, else 1.

    Args:
        left: encoded calcification type of the left side.
        right: encoded calcification type of the right side.

    Returns:
        int: 0 when ``left`` and ``right`` both compare equal to 0.0,
        otherwise 1.

    NOTE(review): code 0.0 presumably stands for "no calcification" in the
    ordinal encoding — confirm against ``oe.categories_``.
    """
    both_zero = left == 0.0 and right == 0.0
    return int(not both_zero)
    
# Add the 'calcification' flag to both splits by applying calc to each
# (left, right) pair of encoded calcification types. Walking the two columns
# in lockstep avoids materialising a Series for every row, as
# DataFrame.apply(axis=1) does, and simply yields an empty column when a
# frame has no rows.
X_train['calcification'] = [
    calc(left, right)
    for left, right in zip(X_train['calcification_type_left'],
                           X_train['calcification_type_right'])
]
X_test['calcification'] = [
    calc(left, right)
    for left, right in zip(X_test['calcification_type_left'],
                           X_test['calcification_type_right'])
]

In [127]:
# Re-attach the target so that it is always the final column. If an earlier
# cell has already stored TARGET on these frames, a plain assignment would
# leave it in its old position (before the engineered features), making the
# column layout of the written files depend on which cells were executed.
X_train = X_train.drop(columns=TARGET, errors="ignore")
X_train[TARGET] = y_train
X_test = X_test.drop(columns=TARGET, errors="ignore")
X_test[TARGET] = y_test

# Persist both splits (features, engineered features, target) without the index.
X_train.to_csv('../input/train_no_ohe_new_feat.csv', index=False)
X_test.to_csv('../input/test_no_ohe_new_feat.csv', index=False)