## MMTHE01 - Masters Thesis

### 1. Thesis - Create a working model
* Feature engineering and feature selection procedures will be done
* A base model will be created
* Several models will be tried and a good model selected
* All assumptions made in this step will be documented

#### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os

from sklearn.preprocessing import LabelEncoder

In [2]:
os.chdir(r'S:\Semester 4\Masters Thesis Report\6. Analysis')

#### Importing the dataset

In [3]:
### Mark categorical fields to ensure they are appropriately delt with
categorical_columns = ['card1 ', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 
                       'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

In [4]:
identity_cat_columns = ['id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 
                        'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28',
                        'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']

In [5]:
transactions_dataset = pd.read_csv('train_transaction.csv', dtype={col: 'category' for col in categorical_columns})
identity_dataset = pd.read_csv('train_identity.csv',  dtype={col: 'category' for col in identity_cat_columns})
dataset = transactions_dataset.merge(identity_dataset,on=['TransactionID'], how='left')

In [6]:
### Import features from step on
intermediate_output_path = os.path.join(r'S:\Semester 4\Masters Thesis Report\6. Analysis\intermediate_output', 'final_features_to_stay.csv')
features_to_stay =  pd.read_csv(intermediate_output_path)

In [7]:
features_to_stay_list = features_to_stay["features_to_stay"].tolist()

### Dataset after Step 1

In [8]:
dataset_final = dataset[features_to_stay_list].copy()

In [9]:
dataset_final.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_02,id_05,id_06,id_11,id_13,id_17,id_19,id_20,DeviceInfo,TransactionID
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,2987000
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,2987001
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,2987002
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,2987003
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,70787.0,,,100.0,,166.0,542.0,144.0,SAMSUNG SM-G892A Build/NRD90M,2987004


### Step 2: Impute missing features using MICE

In [10]:
### Import necessary libraries
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import HistGradientBoostingRegressor ### For numeric variables
from sklearn.impute import SimpleImputer
from fancyimpute import SoftImpute
import time

#### 2.1 Split numeric and categorical variables

In [11]:
def label_encode_keep_nan(series):
    """
    Label encodes a pandas Series while leaving NaNs untouched.
    Returns the encoded series and the fitted encoder.
    """
    # Keep mask of missing values
    mask = series.isna()

    # Apply LabelEncoder to non-NaN values only
    le = LabelEncoder()
    encoded = le.fit_transform(series[~mask])

    # Create a full-length array with NaNs
    full_encoded = pd.Series(np.nan, index=series.index)
    full_encoded[~mask] = encoded

    return full_encoded, le

In [12]:
m_columns = dataset_final.columns[dataset_final.columns.str.startswith("M")]
m_columns

Index(['M3', 'M4', 'M5', 'M6', 'M9'], dtype='object')

In [13]:
dataset_final['card4'], card4_encoder = label_encode_keep_nan(dataset_final['card4'])
dataset_final['card6'], card6_encoder = label_encode_keep_nan(dataset_final['card6'])
dataset_final['P_emaildomain'], email_encoder = label_encode_keep_nan(dataset_final['P_emaildomain'])
dataset_final['ProductCD'], product_encoder = label_encode_keep_nan(dataset_final['ProductCD'])
dataset_final['M4'], M4_encoder = label_encode_keep_nan(dataset_final['M4'])
dataset_final['DeviceInfo'], device_infoencoder = label_encode_keep_nan(dataset_final['DeviceInfo'])
dataset_final["M3"] = dataset_final["M3"].map({"T": 1, "F": 0})
dataset_final["M5"] = dataset_final["M5"].map({"T": 1, "F": 0})
dataset_final["M6"] = dataset_final["M6"].map({"T": 1, "F": 0})
dataset_final["M9"] = dataset_final["M9"].map({"T": 1, "F": 0})

In [14]:
dataset_final

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_02,id_05,id_06,id_11,id_13,id_17,id_19,id_20,DeviceInfo,TransactionID
0,0,86400,68.50,4.0,13926,,150.0,1.0,142.0,1.0,...,,,,,,,,,,2987000
1,0,86401,29.00,4.0,2755,404.0,150.0,2.0,102.0,1.0,...,,,,,,,,,,2987001
2,0,86469,59.00,4.0,4663,490.0,150.0,3.0,166.0,2.0,...,,,,,,,,,,2987002
3,0,86499,50.00,4.0,18132,567.0,150.0,2.0,117.0,2.0,...,,,,,,,,,,2987003
4,0,86506,50.00,1.0,4497,514.0,150.0,2.0,102.0,1.0,...,70787.0,,,100.0,,166.0,542.0,144.0,954.0,2987004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,0,15811047,49.00,4.0,6550,,150.0,3.0,226.0,2.0,...,,,,,,,,,,3577535
590536,0,15811049,39.50,4.0,10444,225.0,150.0,2.0,224.0,2.0,...,,,,,,,,,,3577536
590537,0,15811079,30.95,4.0,12037,595.0,150.0,2.0,224.0,2.0,...,,,,,,,,,,3577537
590538,0,15811088,117.00,4.0,7826,481.0,150.0,2.0,224.0,2.0,...,,,,,,,,,,3577538


In [15]:
#dataset_final[m_columns]

In [16]:
#dataset_final[dataset_final['id_02'].notna()]

In [17]:
columns_with_missing = dataset_final.columns[dataset_final.isnull().any()].to_list()
len(columns_with_missing)

178

In [18]:
dict_ = {}
for column in columns_with_missing:
    dict_[column] = dataset_final.dtypes[column]

In [19]:
#dict_

In [20]:
#### Get numeric feature columns with missing data plus the columns without missing data
full_columns = dataset_final.columns[dataset_final.notna().all()].to_list()
numeric_columns = full_columns + [column for column in columns_with_missing if dataset_final[column].dtype == 'float64']

In [21]:
#### Get caregorical feature columns with missing data plus the columns without missing data
full_columns = dataset_final.columns[dataset_final.notna().all()].to_list()
categorical_columns = full_columns + [column for column in columns_with_missing if dataset_final[column].dtype == 'category']

In [22]:
numeric_columns_dataset = dataset_final[numeric_columns].copy()
numeric_columns_dataset.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,C3,C9,C12,C13,C14,...,V305,V309,V310,V311,V312,V314,V315,V318,V321,DeviceInfo
0,0,86400,68.5,4.0,13926,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,0,86401,29.0,4.0,2755,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,0,86469,59.0,4.0,4663,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,0,86499,50.0,4.0,18132,0.0,1.0,0.0,25.0,1.0,...,1.0,0.0,354.0,0.0,135.0,0.0,0.0,790.0,0.0,
4,0,86506,50.0,1.0,4497,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,954.0


In [23]:
categorical_columns_dataset = dataset_final[categorical_columns].copy()
categorical_columns_dataset.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,C3,C9,C12,C13,C14,...,M9,id_01,id_02,id_05,id_06,id_11,id_13,id_17,id_19,id_20
0,0,86400,68.5,4.0,13926,0.0,1.0,0.0,1.0,1.0,...,,,,,,,,,,
1,0,86401,29.0,4.0,2755,0.0,0.0,0.0,1.0,1.0,...,,,,,,,,,,
2,0,86469,59.0,4.0,4663,0.0,1.0,0.0,1.0,1.0,...,0.0,,,,,,,,,
3,0,86499,50.0,4.0,18132,0.0,1.0,0.0,25.0,1.0,...,,,,,,,,,,
4,0,86506,50.0,1.0,4497,0.0,0.0,0.0,1.0,1.0,...,,0.0,70787.0,,,100.0,,166.0,542.0,144.0


In [24]:
#id_columns = categorical_columns_dataset.columns[categorical_columns_dataset.columns.str.startswith("id")]
#id_columns

In [25]:
#categorical_columns_dataset[categorical_columns_dataset['id_01'].notna()]

#### 2.3 Impute missing values for categorical variables using matrix imputation

In [26]:
#### Check if categorical columns have classes with only one instance
for col in categorical_columns_dataset.columns:
    print(col)
    print(categorical_columns_dataset[col].value_counts(dropna=False))

isFraud
isFraud
0    569877
1     20663
Name: count, dtype: int64
TransactionDT
TransactionDT
9474817     8
11576951    5
4397066     5
7236588     5
1544629     4
           ..
4629634     1
4629631     1
4629557     1
4629466     1
15811131    1
Name: count, Length: 573349, dtype: int64
TransactionAmt
TransactionAmt
59.000     30582
117.000    28933
107.950    23954
57.950     23600
100.000    20362
           ...  
215.550        1
298.600        1
203.270        1
105.614        1
400.780        1
Name: count, Length: 20902, dtype: int64
ProductCD
ProductCD
4.0    439670
0.0     68519
2.0     37699
1.0     33024
3.0     11628
Name: count, dtype: int64
card1
card1
7919     14932
9500     14162
15885    10361
17188    10344
15066     7945
         ...  
17084        1
14620        1
11440        1
13231        1
18038        1
Name: count, Length: 13553, dtype: int64
C3
C3
0.0     588111
1.0       2137
2.0        180
3.0         58
4.0          7
16.0         6
9.0          5
11.0   

In [27]:
start_time = time.time()

In [28]:
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputed = pd.DataFrame(cat_imputer.fit_transform(categorical_columns_dataset), columns=categorical_columns_dataset.columns)

In [29]:
end_time = time.time()
print(f"Execution time Cat Imputation: {end_time - start_time:.4f} seconds")

Execution time Cat Imputation: 8.6246 seconds


In [30]:
'''
# Mark missing values - categorical
for col in categorical_columns_dataset.columns:
    if categorical_columns_dataset[col].isnull().any():
        categorical_columns_dataset[f'{col}_missing'] = categorical_columns_dataset[col].isnull().astype(np.int8)
'''

"\n# Mark missing values - categorical\nfor col in categorical_columns_dataset.columns:\n    if categorical_columns_dataset[col].isnull().any():\n        categorical_columns_dataset[f'{col}_missing'] = categorical_columns_dataset[col].isnull().astype(np.int8)\n"

In [31]:
'''
# Impute categorical data with a RandomForestClassifier (for categorical variables)
imputer = IterativeImputer(
    estimator=HistGradientBoostingClassifier(random_state=0),
    max_iter=1,
    random_state=0
)
'''

'\n# Impute categorical data with a RandomForestClassifier (for categorical variables)\nimputer = IterativeImputer(\n    estimator=HistGradientBoostingClassifier(random_state=0),\n    max_iter=1,\n    random_state=0\n)\n'

In [32]:
'''
# Apply the imputation process
cat_data_imputed = pd.DataFrame(imputer.fit_transform(categorical_columns_dataset), columns=categorical_columns_dataset.columns)
'''

'\n# Apply the imputation process\ncat_data_imputed = pd.DataFrame(imputer.fit_transform(categorical_columns_dataset), columns=categorical_columns_dataset.columns)\n'

#### 2.4 Impute missing values for numeric variable

In [33]:
# Mark missing values - numeric

# Dictionary to collect all new columns
missing_indicators = {}


# Loop through and generate missing indicators
for col in numeric_columns_dataset.columns:
    if numeric_columns_dataset[col].isnull().any():
        missing_indicators[f'{col}_missing'] = numeric_columns_dataset[col].isnull().astype(np.int8)
        
# Convert to DataFrame and concatenate all at once
missing_df = pd.DataFrame(missing_indicators)

# Join to the original DataFrame
numeric_columns_dataset_new = pd.concat([numeric_columns_dataset, missing_df], axis=1)

In [34]:
# Impute categorical data with a RandomForestClassifier (for categorical variables)
imputer = IterativeImputer(
    estimator=HistGradientBoostingRegressor(random_state=0),
    max_iter=10,
    random_state=0
)

In [35]:
start_time = time.time()

In [36]:
# Apply the imputation process
num_data_imputed = pd.DataFrame(imputer.fit_transform(numeric_columns_dataset_new), columns=numeric_columns_dataset_new.columns)



In [37]:
end_time = time.time()
print(f"Execution time Numeric Imputation: {end_time - start_time:.4f} seconds")

Execution time Numeric Imputation: 7984.0059 seconds


In [38]:
### Remove the missing_indicator columns
missing_indicator_cols = [col for col in numeric_columns_dataset_new.columns if col.endswith('missing')]

In [39]:
numeric_columns_dataset_final = numeric_columns_dataset_new.drop(columns=missing_indicator_cols)

In [40]:
numeric_columns_dataset_final

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,C3,C9,C12,C13,C14,...,V305,V309,V310,V311,V312,V314,V315,V318,V321,DeviceInfo
0,0,86400,68.50,4.0,13926,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,
1,0,86401,29.00,4.0,2755,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,
2,0,86469,59.00,4.0,4663,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,
3,0,86499,50.00,4.0,18132,0.0,1.0,0.0,25.0,1.0,...,1.0,0.0,354.000000,0.0,135.0,0.000000,0.000000,790.0,0.000000,
4,0,86506,50.00,1.0,4497,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,954.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,0,15811047,49.00,4.0,6550,0.0,2.0,0.0,3.0,2.0,...,1.0,0.0,47.950001,0.0,0.0,47.950001,47.950001,0.0,0.000000,
590536,0,15811049,39.50,4.0,10444,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,
590537,0,15811079,30.95,4.0,12037,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,
590538,0,15811088,117.00,4.0,7826,0.0,2.0,1.0,5.0,1.0,...,1.0,117.0,669.500000,0.0,117.0,669.500000,317.500000,0.0,0.000000,


In [41]:
columns_with_missing = numeric_columns_dataset_final.columns[numeric_columns_dataset_final.isnull().any()].to_list()
len(columns_with_missing)

160

In [42]:
numeric_columns_dataset_final[columns_with_missing]

Unnamed: 0,card4,card6,dist1,P_emaildomain,D1,D3,D4,D5,D10,D11,...,V305,V309,V310,V311,V312,V314,V315,V318,V321,DeviceInfo
0,1.0,1.0,19.0,,14.0,13.0,,,13.0,13.0,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,
1,2.0,1.0,,16.0,0.0,,0.0,,0.0,,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,
2,3.0,2.0,287.0,35.0,0.0,,0.0,,0.0,315.0,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,
3,2.0,2.0,,53.0,112.0,0.0,94.0,0.0,84.0,,...,1.0,0.0,354.000000,0.0,135.0,0.000000,0.000000,790.0,0.000000,
4,2.0,1.0,,16.0,0.0,,,,,,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,954.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3.0,2.0,48.0,,29.0,30.0,,,56.0,56.0,...,1.0,0.0,47.950001,0.0,0.0,47.950001,47.950001,0.0,0.000000,
590536,2.0,2.0,,16.0,0.0,,0.0,,0.0,0.0,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,
590537,2.0,2.0,,16.0,0.0,,0.0,,0.0,0.0,...,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,
590538,2.0,2.0,3.0,2.0,22.0,0.0,22.0,0.0,22.0,22.0,...,1.0,117.0,669.500000,0.0,117.0,669.500000,317.500000,0.0,0.000000,


In [44]:
numeric_columns_dataset_final[columns_with_missing].isnull().sum()

card4              1577
card6              1571
dist1            352271
P_emaildomain     94456
D1                 1269
                  ...  
V314               1269
V315               1269
V318                 12
V321                 12
DeviceInfo       471874
Length: 160, dtype: int64