## MMTHE01 - Masters Thesis

### B. Thesis - Feature Engineering - Imputing missing fields
* Data Encoding
* Feature Scaling

#### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os

from sklearn.preprocessing import LabelEncoder

In [2]:
# Directory the kernel was started in
cwd = os.getcwd()

# Analysis folder, expressed relative to the starting directory
relative_path = r"6. Analysis"  # adjust this relative to cwd

# Absolute location of the analysis folder
full_path = os.path.join(cwd, relative_path)

# Only switch directories when the target is actually present;
# otherwise report it and stay where we are
if not os.path.exists(full_path):
    print("Folder does not exist:", full_path)
else:
    os.chdir(full_path)
    print("Changed directory to:", full_path)

Changed directory to: C:\Users\eaber\Documents\11. Masters Thesis Final\6. Analysis


#### Importing the dataset

In [3]:
### Mark categorical fields so they are read with the 'category' dtype.
### The names must match the CSV header exactly: a name with stray whitespace
### (previously 'card1 ') matches no column, so that column was left numeric.
categorical_columns = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3',
                      'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

In [4]:
# Identity-table columns read as categoricals: id_01 ... id_38 plus the two device descriptors
identity_cat_columns = [f"id_{number:02d}" for number in range(1, 39)]
identity_cat_columns += ['DeviceType', 'DeviceInfo']

In [5]:
# Read both raw tables, forcing the flagged columns to the 'category' dtype
transaction_dtypes = dict.fromkeys(categorical_columns, 'category')
identity_dtypes = dict.fromkeys(identity_cat_columns, 'category')

transactions_dataset = pd.read_csv('train_transaction.csv', dtype=transaction_dtypes)
identity_dataset = pd.read_csv('train_identity.csv', dtype=identity_dtypes)

# Left join keeps every transaction, whether or not it has identity information
dataset = pd.merge(transactions_dataset, identity_dataset, how='left', on=['TransactionID'])

In [6]:
### Import the features selected in step one
# NOTE(review): this is an absolute, machine-specific location; unlike the CSV reads
# above it does not depend on the working directory - confirm before running elsewhere.
intermediate_output_dir = r'S:\Semester 4\Masters Thesis Report\6. Analysis\intermediate_output'
intermediate_output_path = os.path.join(intermediate_output_dir, 'final_features_to_stay.csv')
features_to_stay = pd.read_csv(intermediate_output_path)

In [7]:
# Plain Python list of the column names to keep
features_to_stay_list = features_to_stay.loc[:, "features_to_stay"].to_list()

### Dataset after Step 1

In [8]:
# Independent copy restricted to the retained features
dataset_final = dataset.loc[:, features_to_stay_list].copy()

In [9]:
# Preview the first rows of the retained-feature dataset
dataset_final.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_02,id_05,id_06,id_11,id_13,id_17,id_19,id_20,DeviceInfo,TransactionID
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,2987000
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,2987001
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,2987002
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,2987003
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,70787.0,,,100.0,,166.0,542.0,144.0,SAMSUNG SM-G892A Build/NRD90M,2987004


### Step 2: Impute missing feature values using a SimpleImputer (ahead of encoding in Step 3)

#### 2.1 Impute missing values for categorical variables

##### 2.1.1 Get the cat features

In [10]:
# Flag every column that has at least one missing value, then keep the flagged names
has_missing = dataset_final.isna().any()
columns_with_missing = has_missing[has_missing].index.to_list()
len(columns_with_missing)

178

In [11]:
# Column name -> dtype for every column that has missing values
dict_ = {column: dataset_final.dtypes[column] for column in columns_with_missing}

In [12]:
#### Categorical feature columns that contain missing data
# Note: this rebinds `categorical_columns`, which earlier held the read_csv dtype list.
categorical_columns = [
    name
    for name, column_dtype in dataset_final.dtypes[columns_with_missing].items()
    if column_dtype == 'category'
]
len(categorical_columns)

23

In [13]:
# Independent copy holding only the categorical columns that have gaps
categorical_columns_dataset = dataset_final.loc[:, categorical_columns].copy()
categorical_columns_dataset.head()

Unnamed: 0,card2,card3,card4,card5,card6,addr1,addr2,P_emaildomain,M3,M4,...,id_01,id_02,id_05,id_06,id_11,id_13,id_17,id_19,id_20,DeviceInfo
0,,150.0,discover,142.0,credit,315.0,87.0,,T,M2,...,,,,,,,,,,
1,404.0,150.0,mastercard,102.0,credit,325.0,87.0,gmail.com,,M0,...,,,,,,,,,,
2,490.0,150.0,visa,166.0,debit,330.0,87.0,outlook.com,T,M0,...,,,,,,,,,,
3,567.0,150.0,mastercard,117.0,debit,476.0,87.0,yahoo.com,,M0,...,,,,,,,,,,
4,514.0,150.0,mastercard,102.0,credit,420.0,87.0,gmail.com,,,...,0.0,70787.0,,,100.0,,166.0,542.0,144.0,SAMSUNG SM-G892A Build/NRD90M


In [14]:
# Before imputation: count the categorical columns that contain missing values
cat_has_missing = categorical_columns_dataset.isna().any()
cat_missing_check = cat_has_missing[cat_has_missing].index.to_list()
len(cat_missing_check)

23

##### 2.1.2 Impute missing categorical values

In [15]:
from sklearn.impute import SimpleImputer

In [16]:
# Categorical gaps are filled with each column's most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')

In [17]:
# Learn the per-column fill values and apply them in a single pass.
# NOTE(review): the imputer is fitted on every row; if a hold-out split is made
# later, fitting on the training part only would avoid leakage - confirm.
imputed_cat_data = cat_imputer.fit_transform(categorical_columns_dataset)

In [18]:
# Wrap the imputer output in a DataFrame that carries the original column names
cat_column_labels = categorical_columns_dataset.columns
imputed_cat_df = pd.DataFrame(data=imputed_cat_data, columns=cat_column_labels)

In [19]:
# Preview the categorical columns after imputation
imputed_cat_df.head()

Unnamed: 0,card2,card3,card4,card5,card6,addr1,addr2,P_emaildomain,M3,M4,...,id_01,id_02,id_05,id_06,id_11,id_13,id_17,id_19,id_20,DeviceInfo
0,321.0,150.0,discover,142.0,credit,315.0,87.0,gmail.com,T,M2,...,-5.0,1102.0,0.0,0.0,100.0,52.0,166.0,266.0,507.0,Windows
1,404.0,150.0,mastercard,102.0,credit,325.0,87.0,gmail.com,T,M0,...,-5.0,1102.0,0.0,0.0,100.0,52.0,166.0,266.0,507.0,Windows
2,490.0,150.0,visa,166.0,debit,330.0,87.0,outlook.com,T,M0,...,-5.0,1102.0,0.0,0.0,100.0,52.0,166.0,266.0,507.0,Windows
3,567.0,150.0,mastercard,117.0,debit,476.0,87.0,yahoo.com,T,M0,...,-5.0,1102.0,0.0,0.0,100.0,52.0,166.0,266.0,507.0,Windows
4,514.0,150.0,mastercard,102.0,credit,420.0,87.0,gmail.com,T,M0,...,0.0,70787.0,0.0,0.0,100.0,52.0,166.0,542.0,144.0,SAMSUNG SM-G892A Build/NRD90M


In [20]:
# After imputation: list any categorical column that still contains missing values
cat_still_missing = imputed_cat_df.isna().any()
cat_missing_check = cat_still_missing[cat_still_missing].index.to_list()
cat_missing_check

[]

#### 2.2 Impute missing values for numeric variables

##### 2.2.1 Get the numeric features

In [21]:
# Recompute the columns with missing values (dataset_final itself was not modified by the imputation above)
has_missing = dataset_final.isna().any()
columns_with_missing = has_missing[has_missing].index.to_list()
len(columns_with_missing)

178

In [22]:
# Column name -> dtype for every column that has missing values
dict_ = {column: dataset_final.dtypes[column] for column in columns_with_missing}

In [23]:
#### Numeric (float64) feature columns that contain missing data
numeric_columns = [
    name
    for name, column_dtype in dataset_final.dtypes[columns_with_missing].items()
    if column_dtype == 'float64'
]
len(numeric_columns)

155

In [24]:
# Independent copy holding only the numeric columns that have gaps
numeric_columns_dataset = dataset_final.loc[:, numeric_columns].copy()
numeric_columns_dataset.head()

Unnamed: 0,dist1,D1,D3,D4,D5,D10,D11,D15,V1,V2,...,V304,V305,V309,V310,V311,V312,V314,V315,V318,V321
0,19.0,14.0,13.0,,,13.0,13.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,0.0,,0.0,,0.0,,0.0,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,287.0,0.0,,0.0,,0.0,315.0,315.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,112.0,0.0,94.0,0.0,84.0,,111.0,,,...,0.0,1.0,0.0,354.0,0.0,135.0,0.0,0.0,790.0,0.0
4,,0.0,,,,,,,,,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Before imputation: count the numeric columns that contain missing values
num_has_missing = numeric_columns_dataset.isna().any()
num_missing_check = num_has_missing[num_has_missing].index.to_list()
len(num_missing_check)

155

##### 2.2.2 Impute missing numeric values

In [26]:
# Numeric gaps are filled with each column's mean
num_imputer = SimpleImputer(strategy='mean')

In [27]:
# Learn the per-column means and apply them in a single pass.
# NOTE(review): the imputer is fitted on every row; if a hold-out split is made
# later, fitting on the training part only would avoid leakage - confirm.
imputed_num_data = num_imputer.fit_transform(numeric_columns_dataset)

In [28]:
# Wrap the imputer output in a DataFrame that carries the original column names
num_column_labels = numeric_columns_dataset.columns
imputed_num_df = pd.DataFrame(data=imputed_num_data, columns=num_column_labels)

In [29]:
# Preview the numeric columns after imputation
imputed_num_df.head()

Unnamed: 0,dist1,D1,D3,D4,D5,D10,D11,D15,V1,V2,...,V304,V305,V309,V310,V311,V312,V314,V315,V318,V321
0,19.0,14.0,13.0,140.002441,42.335965,13.0,13.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,118.50218,0.0,28.343348,0.0,42.335965,0.0,146.621465,0.0,0.999945,1.045204,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,287.0,0.0,28.343348,0.0,42.335965,0.0,315.0,315.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,118.50218,112.0,0.0,94.0,0.0,84.0,146.621465,111.0,0.999945,1.045204,...,0.0,1.0,0.0,354.0,0.0,135.0,0.0,0.0,790.0,0.0
4,118.50218,0.0,28.343348,140.002441,42.335965,123.982137,146.621465,163.744579,0.999945,1.045204,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# After imputation: list any numeric column that still contains missing values
num_still_missing = imputed_num_df.isna().any()
num_missing_check = num_still_missing[num_still_missing].index.to_list()
num_missing_check

[]

### Step 3: Combining the data and encoding categorical data using the label encoder

#### 3.1 Combine the data

In [31]:
# Columns in which every row has a value (nothing to impute)
is_complete = dataset_final.notna().all()
full_columns = is_complete[is_complete].index.to_list()

In [32]:
# Subset of the dataset made of the fully populated columns
dataset_full_features = dataset_final.loc[:, full_columns]

In [33]:
# Preview the fully populated columns
dataset_full_features.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,C3,C9,C12,C13,C14,TransactionID
0,0,86400,68.5,W,13926,0.0,1.0,0.0,1.0,1.0,2987000
1,0,86401,29.0,W,2755,0.0,0.0,0.0,1.0,1.0,2987001
2,0,86469,59.0,W,4663,0.0,1.0,0.0,1.0,1.0,2987002
3,0,86499,50.0,W,18132,0.0,1.0,0.0,25.0,1.0,2987003
4,0,86506,50.0,H,4497,0.0,0.0,0.0,1.0,1.0,2987004


In [34]:
# Put the complete columns and the imputed categorical columns side by side.
# Both frames hold the rows of dataset_final in the same order, so they are paired
# by position; this replaces a merge on a synthetic key followed by dropping 'key_0'.
assert len(dataset_full_features) == len(imputed_cat_df), "row counts differ; cannot combine by position"
train_dataset_j1 = pd.concat(
    [dataset_full_features.reset_index(drop=True), imputed_cat_df.reset_index(drop=True)],
    axis=1,
)

In [35]:
# Preview the complete columns combined with the imputed categorical columns
train_dataset_j1.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,C3,C9,C12,C13,C14,...,id_01,id_02,id_05,id_06,id_11,id_13,id_17,id_19,id_20,DeviceInfo
0,0,86400,68.5,W,13926,0.0,1.0,0.0,1.0,1.0,...,-5.0,1102.0,0.0,0.0,100.0,52.0,166.0,266.0,507.0,Windows
1,0,86401,29.0,W,2755,0.0,0.0,0.0,1.0,1.0,...,-5.0,1102.0,0.0,0.0,100.0,52.0,166.0,266.0,507.0,Windows
2,0,86469,59.0,W,4663,0.0,1.0,0.0,1.0,1.0,...,-5.0,1102.0,0.0,0.0,100.0,52.0,166.0,266.0,507.0,Windows
3,0,86499,50.0,W,18132,0.0,1.0,0.0,25.0,1.0,...,-5.0,1102.0,0.0,0.0,100.0,52.0,166.0,266.0,507.0,Windows
4,0,86506,50.0,H,4497,0.0,0.0,0.0,1.0,1.0,...,0.0,70787.0,0.0,0.0,100.0,52.0,166.0,542.0,144.0,SAMSUNG SM-G892A Build/NRD90M


In [36]:
# Append the imputed numeric columns to the frame built in the previous step.
# Rows are paired by position (both frames keep the row order of dataset_final);
# this replaces a merge on a synthetic key followed by dropping 'key_0'.
assert len(train_dataset_j1) == len(imputed_num_df), "row counts differ; cannot combine by position"
train_dataset = pd.concat(
    [train_dataset_j1.reset_index(drop=True), imputed_num_df.reset_index(drop=True)],
    axis=1,
)

In [37]:
# Preview the fully combined, imputed dataset
train_dataset.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,C3,C9,C12,C13,C14,...,V304,V305,V309,V310,V311,V312,V314,V315,V318,V321
0,0,86400,68.5,W,13926,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,86401,29.0,W,2755,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,86469,59.0,W,4663,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,86499,50.0,W,18132,0.0,1.0,0.0,25.0,1.0,...,0.0,1.0,0.0,354.0,0.0,135.0,0.0,0.0,790.0,0.0
4,0,86506,50.0,H,4497,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 3.4 Export unencoded data

In [38]:
### Export the unencoded data
# Written relative to the current working directory; the row index is not written.
train_dataset.to_csv('train_dataset_final_unencoded.csv',index=False)

In [39]:
# (rows, columns) of the imputed dataset before encoding
train_dataset.shape

(590540, 189)

#### 3.2 Encode categorical data

##### 3.2.1 Understanding the categorical variables which need encoding

In [40]:
# Categorical features that still need to be encoded
categorical_cols = [
    'ProductCD',
    'card4',
    'card6',
    'P_emaildomain',
    'M3',
    'M4',
    'M5',
    'M6',
    'M9',
    'DeviceInfo',
]

In [41]:
# Number of distinct values per categorical column
to_encode = train_dataset.loc[:, categorical_cols]
category_counts = to_encode.nunique()

print(category_counts)

ProductCD           5
card4               4
card6               4
P_emaildomain      59
M3                  2
M4                  3
M5                  2
M6                  2
M9                  2
DeviceInfo       1786
dtype: int64


In [42]:
### Encoding methods to use:
#1. LE - P_emaildomain and DeviceInfo (because of high cardinality)
#2. Map T/F to 1/0 - M3, M5, M6 and M9 (because the values are only T and F)
#3. OHE - ProductCD, card4, card6, M4 (low cardinality so it is efficient to use OHE)

##### 3.2.2 Label Encoding

In [43]:
def label_encode_keep_nan(series):
    """
    Label-encode the non-missing entries of a pandas Series.

    Missing entries stay NaN in the result; every other entry is replaced by
    the integer code a freshly fitted LabelEncoder assigns to it.

    Parameters
    ----------
    series : pandas.Series
        Values to encode.

    Returns
    -------
    tuple
        (encoded, encoder): `encoded` is a Series on the same index as `series`,
        initialised with NaN and filled with the codes where a value was present;
        `encoder` is the LabelEncoder fitted on the non-missing values.
    """
    # Positions that actually hold a value
    present = series.notna()

    # Fit on, and encode, the present values only
    le = LabelEncoder()
    codes = le.fit_transform(series[present])

    # Start from an all-NaN series, then drop the codes into the present positions
    full_encoded = pd.Series(np.nan, index=series.index)
    full_encoded[present] = codes

    return full_encoded, le

In [44]:
# High-cardinality text columns -> label codes; the fitted encoders are kept alongside
train_dataset['P_emaildomain'], email_encoder = label_encode_keep_nan(train_dataset['P_emaildomain'])
train_dataset['DeviceInfo'], device_infoencoder = label_encode_keep_nan(train_dataset['DeviceInfo'])

# T/F flag columns -> 1/0.
# Note: this overwrites the columns in train_dataset; re-running the cell maps the
# already-converted 1/0 values (absent from the dict) to NaN.
true_false_codes = {"T": 1, "F": 0}
for flag_column in ("M3", "M5", "M6", "M9"):
    train_dataset[flag_column] = train_dataset[flag_column].map(true_false_codes)

##### 3.2.3 One Hot Encoding

In [45]:
from sklearn.preprocessing import OneHotEncoder

In [46]:
# Low-cardinality columns that get one-hot encoded
cols_for_ohe = [
    'ProductCD',
    'card4',
    'card6',
    'M4',
]

In [47]:
# Initialize OHE encoder
# sparse_output=False -> dense array output; handle_unknown='ignore' -> categories
# not seen during fit do not raise at transform time
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [48]:
# Fit the encoder on the selected columns and transform them in one call
encoded_array = encoder.fit_transform(train_dataset[cols_for_ohe])

In [49]:
# Label the one-hot array with the encoder's generated feature names
ohe_feature_names = encoder.get_feature_names_out(cols_for_ohe)
encoded_df = pd.DataFrame(data=encoded_array, columns=ohe_feature_names)

In [50]:
# Swap the original categorical columns for their one-hot counterparts
without_ohe_sources = train_dataset.drop(columns=cols_for_ohe)
train_dataset_final = pd.concat([without_ohe_sources, encoded_df], axis=1)

In [51]:
# Preview the fully encoded dataset
train_dataset_final.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,card1,C3,C9,C12,C13,C14,TransactionID,...,card4_discover,card4_mastercard,card4_visa,card6_charge card,card6_credit,card6_debit,card6_debit or credit,M4_M0,M4_M1,M4_M2
0,0,86400,68.5,13926,0.0,1.0,0.0,1.0,1.0,2987000,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0,86401,29.0,2755,0.0,0.0,0.0,1.0,1.0,2987001,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0,86469,59.0,4663,0.0,1.0,0.0,1.0,1.0,2987002,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0,86499,50.0,18132,0.0,1.0,0.0,25.0,1.0,2987003,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0,86506,50.0,4497,0.0,0.0,0.0,1.0,1.0,2987004,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [52]:
# Non-null count of every column per isFraud class
train_dataset_final.groupby('isFraud').count()

Unnamed: 0_level_0,TransactionDT,TransactionAmt,card1,C3,C9,C12,C13,C14,TransactionID,card2,...,card4_discover,card4_mastercard,card4_visa,card6_charge card,card6_credit,card6_debit,card6_debit or credit,M4_M0,M4_M1,M4_M2
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,569877,569877,569877,569877,569877,569877,569877,569877,569877,569877,...,569877,569877,569877,569877,569877,569877,569877,569877,569877,569877
1,20663,20663,20663,20663,20663,20663,20663,20663,20663,20663,...,20663,20663,20663,20663,20663,20663,20663,20663,20663,20663


#### 3.3 Export encoded data

In [53]:
### Export encoded data for Non SMOTE
# Written relative to the current working directory; the row index is not written.
train_dataset_final.to_csv('train_dataset_final_encoded.csv',index=False)

In [54]:
# (rows, columns) of the encoded dataset
train_dataset_final.shape

(590540, 201)